diff --git a/2-Performance_Counters/Handson/.master/Hands-On-Performance-Counters.ipynb b/2-Performance_Counters/Handson/.master/Hands-On-Performance-Counters.ipynb
index 5c1e55d50c03cbbaa46744b94cecc912b0db53f7..4bbe06ac9a84212432b42fec70930134d8ae5461 100644
--- a/2-Performance_Counters/Handson/.master/Hands-On-Performance-Counters.ipynb
+++ b/2-Performance_Counters/Handson/.master/Hands-On-Performance-Counters.ipynb
@@ -48,7 +48,7 @@
     "\n",
     "For the first task, we will measure quantities often used to characterize an application: cycles and instructions.\n",
     "\n",
-    "**TASK**: Please measure counters for completed instructions and run cycles. See the TODOs in [`poisson2d.ins_cyc.c`](/edit/Tasks/poisson2d.ins_cyc.c). You can either edit the files with Jupyter capabilities by clicking on the link of the file or use a dedicated editor (`vim` is available). The names of the counters to be implemented are `PM_INST_CMPL` and `PM_RUN_CYC`.\n",
+    "**TASK**: Please measure counters for completed instructions and run cycles. See the TODOs in the file [`poisson2d.ins_cyc.c`](poisson2d.ins_cyc.c). You can either edit the file with Jupyter capabilities (by clicking on the link to the file or selecting it in the file drawer on the left) or use a dedicated editor on the system (`vim` is available). The names of the counters to be implemented are `PM_INST_CMPL` and `PM_RUN_CYC`.\n",
     "\n",
     "After changing the source code, compile it with `make task1` or by executing the following cell (we need to change directories first, though).  \n",
     "*(Using the `Makefile` we have hidden quite a few intricacies from you in order to focus on the relevant content at hand. Don't worry too much about it right now – we'll un-hide it gradually during the course of the tutorial.)*\n",
@@ -65,7 +65,24 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/autofs/nccsopen-svm1_home/aherten/OpenPOWER-SC19/2-PAPI/Compiling/Solutions\n"
+      "/autofs/nccsopen-svm1_home/aherten/OpenPOWER-SC19/Prototyping/2-Performance_Counters/Handson/Solutions\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pwd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/autofs/nccsopen-svm1_home/aherten/OpenPOWER-SC18/2-PAPI/Compiling/Solutions\n"
      ]
     }
    ],
@@ -76,14 +93,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "gcc -DUSE_DOUBLE -Ofast -std=c99 -lm -lpapi  poisson2d.ins_cyc.c -o poisson2d.ins_cyc.bin\r\n"
+      "gcc -DUSE_DOUBLE -Ofast -std=c99 -lm -lpapi  poisson2d.ins_cyc.c -o poisson2d.ins_cyc.bin\n"
      ]
     }
    ],
@@ -100,17 +117,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
-     "evalue": "Error: Failed to connect to Jupyter notebook. \r\nhttp://localhost:8888/\r\nError: Invalid response: 500 Internal Server Error",
-     "output_type": "error"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
+      "100,64,32,0.0011,3324225,33235,33960,1859440,18357,25033\n"
+     ]
     }
    ],
    "source": [
     "!./poisson2d.ins_cyc.bin 100 64 32\n",
-    "# alternatively call !make run_task1, one of our shortcutts"
+    "# alternatively call !make run_task1"
    ]
   },
   {
@@ -126,7 +147,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 80,
+   "execution_count": 2,
    "metadata": {
     "scrolled": true
    },
@@ -135,554 +156,523 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "gcc -DUSE_DOUBLE -Ofast -std=c99 -lm -lpapi  poisson2d.ins_cyc.c -o poisson2d.ins_cyc.bin\n",
-      "bsub -W 60 -nnodes 1 -Is -P GEN110 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.ins_cyc.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv\n",
-      "Job <4318> is submitted to default queue <batch>.\n",
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.ins_cyc.bin /gpfs/wolf/trn003/scratch/aherten//poisson2d.ins_cyc.bin.csv\n",
+      "Job <24059> is submitted to default queue <batch>.\n",
       "<<Waiting for dispatch ...>>\n",
       "<<Starting on login1>>\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,4,0.0012,548153,2735,3888,266504,1243,4753\n",
+      "200,32,4,0.0012,572978,2861,3639,261330,1235,4684\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,8,0.0014,1082153,5405,6558,668070,3227,6573\n",
+      "200,32,8,0.0014,1082978,5411,6189,601962,2914,5099\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,12,0.0014,1442153,7205,8358,872094,4181,12974\n",
+      "200,32,12,0.0014,1442978,7211,7989,811603,3992,5761\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,16,0.0015,1802153,9005,10158,1074585,5230,7975\n",
+      "200,32,16,0.0014,1802978,9011,9789,1017305,4988,7017\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,20,0.0015,2162153,10805,11958,1281118,6236,14107\n",
+      "200,32,20,0.0015,2162978,10811,11589,1221559,6002,7999\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,24,0.0016,2522153,12605,13758,1479347,7222,10037\n",
+      "200,32,24,0.0016,2522978,12611,13389,1435167,7037,9259\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,28,0.0019,2882153,14405,15558,1682827,8251,11219\n",
+      "200,32,28,0.0016,2882978,14411,15189,1633061,8054,9789\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,32,0.0017,3242153,16205,17358,1871170,9210,12109\n",
+      "200,32,32,0.0017,3242978,16211,16989,1842895,9092,10889\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,36,0.0018,3602153,18005,19158,2075730,10193,13063\n",
+      "200,32,36,0.0018,3602978,18011,18789,2042894,10108,12457\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,40,0.0019,3962153,19805,20958,2272736,11258,14491\n",
+      "200,32,40,0.0019,3962978,19811,20589,2261332,11191,14233\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,44,0.0019,4322153,21605,22758,2491982,12249,17554\n",
+      "200,32,44,0.0020,4322978,21611,22389,2458267,12112,14375\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,48,0.0020,4682153,23405,24558,2692600,13292,16003\n",
+      "200,32,48,0.0020,4682978,23411,24189,2658621,13164,15613\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,52,0.0020,5042153,25205,26358,2878730,14277,17055\n",
+      "200,32,52,0.0020,5042978,25211,25989,2866175,14190,16864\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,56,0.0021,5402153,27005,28158,3084915,15295,18583\n",
+      "200,32,56,0.0021,5402978,27011,27789,3080357,15237,21565\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,60,0.0022,5762153,28805,29958,3291836,16330,19233\n",
+      "200,32,60,0.0022,5762978,28811,29589,3283103,16278,18799\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,64,0.0023,6122153,30605,31758,3622134,17946,20887\n",
+      "200,32,64,0.0022,6122978,30611,31389,3587582,17820,19681\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,68,0.0024,6482153,32405,33558,3930512,19200,22297\n",
+      "200,32,68,0.0025,6482978,32411,33189,3893368,19284,20847\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,72,0.0027,6842153,34205,35358,4270649,20402,22797\n",
+      "200,32,72,0.0025,6842978,34211,34989,4289441,21278,22715\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,76,0.0025,7202153,36005,37158,4209408,20894,24035\n",
+      "200,32,76,0.0024,7202978,36011,36789,4208700,20936,22677\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,80,0.0025,7562153,37805,38958,4410712,21911,24986\n",
+      "200,32,80,0.0025,7562978,37811,38589,4409613,21897,23855\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,84,0.0026,7922153,39605,40758,4631259,23020,25649\n",
+      "200,32,84,0.0026,7922978,39611,40389,4611755,22921,24910\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,88,0.0027,8282153,41405,42558,4814218,23914,26743\n",
+      "200,32,88,0.0026,8282978,41411,42189,4821904,23974,26087\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,92,0.0027,8642153,43205,44358,5039020,24944,37612\n",
+      "200,32,92,0.0028,8642978,43211,43989,5104722,25036,38488\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,96,0.0030,9002153,45005,46158,5247046,26072,29012\n",
+      "200,32,96,0.0028,9002978,45011,45789,5238952,26060,27927\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,100,0.0029,9362153,46805,47958,5426721,26963,29831\n",
+      "200,32,100,0.0028,9362978,46811,47589,5441545,27049,29275\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,104,0.0029,9722153,48605,49758,5619647,27963,31679\n",
+      "200,32,104,0.0030,9722978,48611,49389,5920763,28136,72679\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,108,0.0030,10082153,50405,51558,5828776,28956,31626\n",
+      "200,32,108,0.0030,10082978,50411,51189,5853554,29106,31403\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,112,0.0031,10442153,52205,53358,6033005,30029,32674\n",
+      "200,32,112,0.0030,10442978,52211,52989,6053498,30123,32279\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,116,0.0031,10802153,54005,55158,6244763,30994,35257\n",
+      "200,32,116,0.0031,10802978,54011,54789,6296056,31338,33377\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,120,0.0032,11162153,55805,56958,6425499,31972,34572\n",
+      "200,32,120,0.0033,11162978,55811,56589,6468115,32146,33869\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,124,0.0033,11522153,57605,58758,6654149,33094,35931\n",
+      "200,32,124,0.0032,11522978,57611,58389,6675248,33233,35075\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,128,0.0033,11882153,59405,60558,6851733,34090,36755\n",
+      "200,32,128,0.0033,11882978,59411,60189,6894325,34338,36207\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,132,0.0034,12242153,61205,62358,7052529,35058,39834\n",
+      "200,32,132,0.0034,12242978,61211,61989,7093543,35299,37463\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,136,0.0035,12602153,63005,64158,7241645,36039,38957\n",
+      "200,32,136,0.0034,12602978,63011,63789,7312105,36353,48105\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,140,0.0035,12962153,64805,65958,7438548,37024,39702\n",
+      "200,32,140,0.0035,12962978,64811,65589,7503757,37375,39247\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,144,0.0036,13322153,66605,67758,7649807,38039,46041\n",
+      "200,32,144,0.0036,13322978,66611,67389,7692611,38277,40419\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,148,0.0037,13682153,68405,69558,7837686,39006,41671\n",
+      "200,32,148,0.0037,13682978,68411,69189,7968094,39656,42113\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,152,0.0037,14042153,70205,71358,8039582,40031,42707\n",
+      "200,32,152,0.0037,14042978,70211,70989,8122466,40468,42706\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,156,0.0038,14402153,72005,73158,8272212,41195,43645\n",
+      "200,32,156,0.0038,14402978,72011,72789,8328043,41484,45104\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,160,0.0040,14762153,73805,74958,8471858,42200,44594\n",
+      "200,32,160,0.0040,14762978,73811,74589,8547674,42493,54216\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,164,0.0039,15122153,75605,76758,8657085,43103,45699\n",
+      "200,32,164,0.0039,15122978,75611,76389,8738805,43542,45427\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,168,0.0039,15482153,77405,78558,8856462,44110,46863\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,168,0.0040,15482978,77411,78189,8948025,44560,46819\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,172,0.0040,15842153,79205,80358,9050337,45084,47600\n",
+      "200,32,172,0.0040,15842978,79211,79989,9186567,45735,47659\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,176,0.0041,16202153,81005,82158,9267755,46142,55546\n",
+      "200,32,176,0.0041,16202978,81011,81789,9391949,46573,70131\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,180,0.0042,16562153,82805,83958,9452041,47058,49763\n",
+      "200,32,180,0.0042,16562978,82811,83589,9549568,47559,54271\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,184,0.0042,16922153,84605,85758,9655929,48043,50875\n",
+      "200,32,184,0.0042,16922978,84611,85389,9766306,48609,58645\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,188,0.0043,17282153,86405,87558,9906002,49331,52491\n",
+      "200,32,188,0.0043,17282978,86411,87189,9974165,49613,56721\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,192,0.0043,17642153,88205,89358,10089481,50268,52937\n",
+      "200,32,192,0.0044,17642978,88211,88989,10187263,50734,52953\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,196,0.0044,18002153,90005,91158,10292606,51256,54507\n",
+      "200,32,196,0.0044,18002978,90011,90789,10386920,51763,53773\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,200,0.0045,18362153,91805,92958,10466174,52144,54851\n",
+      "200,32,200,0.0045,18362978,91811,92589,10593326,52744,54962\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,204,0.0045,18722153,93605,94758,10710242,53145,77999\n",
+      "200,32,204,0.0045,18722978,93611,94389,10791966,53796,55775\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,208,0.0046,19082153,95405,96558,10872705,54177,57081\n",
+      "200,32,208,0.0046,19082978,95411,96189,10993938,54691,56692\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,212,0.0047,19442153,97205,98358,11284063,56244,58937\n",
+      "200,32,212,0.0047,19442978,97211,97989,11183564,55716,57663\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,216,0.0047,19802153,99005,100158,11267668,56162,58869\n",
+      "200,32,216,0.0047,19802978,99011,99789,11413409,56842,65317\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,220,0.0048,20162153,100805,101958,11510801,57350,60362\n",
+      "200,32,220,0.0049,20162978,100811,101589,11747337,57952,85917\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,224,0.0051,20522153,102605,103758,11730908,58406,61013\n",
+      "200,32,224,0.0049,20522978,102611,103389,11967444,58993,147575\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,228,0.0050,20882153,104405,105558,11891323,59260,62051\n",
+      "200,32,228,0.0050,20882978,104411,105189,12176974,59986,107137\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,232,0.0050,21242153,106205,107358,12083458,60220,63113\n",
+      "200,32,232,0.0051,21242978,106211,106989,12243039,61011,62843\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,236,0.0050,21602153,108005,109158,12290078,61234,68599\n",
+      "200,32,236,0.0051,21602978,108011,108789,12454738,61985,74677\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,240,0.0051,21962153,109805,110958,12547828,62267,88616\n",
+      "200,32,240,0.0051,21962978,109811,110589,12632612,62912,64911\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,244,0.0052,22322153,111605,112758,12674066,63146,66333\n",
+      "200,32,244,0.0052,22322978,111611,112389,12844679,63954,74316\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,248,0.0052,22682153,113405,114558,12882346,64155,67081\n",
+      "200,32,248,0.0053,22682978,113411,114189,13049050,65048,67067\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,252,0.0053,23042153,115205,116358,13140221,65490,68231\n",
+      "200,32,252,0.0054,23042978,115211,115989,13274577,66113,68093\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,256,0.0054,23402153,117005,118158,13331460,66431,69187\n",
+      "200,32,256,0.0054,23402978,117011,117789,13479975,67191,69232\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,260,0.0054,23762153,118805,119958,13531478,67456,70141\n",
+      "200,32,260,0.0055,23762978,118811,119589,13702476,68321,70257\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,264,0.0055,24122153,120605,121758,13710546,68246,81094\n",
+      "200,32,264,0.0055,24122978,120611,121389,13885554,69178,71473\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,268,0.0055,24482153,122405,123558,13890638,69208,72412\n",
+      "200,32,268,0.0056,24482978,122411,123189,14091173,70236,72538\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,272,0.0056,24842153,124205,125358,14130816,70366,88752\n",
+      "200,32,272,0.0057,24842978,124211,124989,14277355,71142,73153\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,276,0.0057,25202153,126005,127158,14355067,71208,93990\n",
+      "200,32,276,0.0057,25202978,126011,126789,14477479,72149,74585\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,280,0.0057,25562153,127805,128958,14513593,72251,85857\n",
+      "200,32,280,0.0058,25562978,127811,128589,14807542,73365,106386\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,284,0.0059,25922153,129605,130758,14800806,73802,76775\n",
+      "200,32,284,0.0059,25922978,129611,130389,14919273,74349,83988\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,288,0.0059,26282153,131405,132558,14959572,74579,77267\n",
+      "200,32,288,0.0060,26282978,131411,132189,15262342,75369,108903\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,292,0.0059,26642153,133205,134358,15130033,75389,78361\n",
+      "200,32,292,0.0061,26642978,133211,133989,15457489,76550,112579\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,296,0.0060,27002153,135005,136158,15314583,76370,79151\n",
+      "200,32,296,0.0061,27002978,135011,135789,15587890,77470,113796\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,300,0.0061,27362153,136805,137958,15515700,77373,80055\n",
+      "200,32,300,0.0063,27362978,136811,137589,15736737,78474,80976\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,304,0.0061,27722153,138605,139758,15739536,78395,81351\n",
+      "200,32,304,0.0062,27722978,138611,139389,15931699,79424,85309\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,308,0.0062,28082153,140405,141558,15910915,79341,82085\n",
+      "200,32,308,0.0064,28082978,140411,141189,16127895,80426,82181\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,312,0.0063,28442153,142205,143358,16119259,80297,83271\n",
+      "200,32,312,0.0063,28442978,142211,142989,16353667,81487,91316\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,316,0.0063,28802153,144005,145158,16376727,81668,84481\n",
+      "200,32,316,0.0064,28802978,144011,144789,16544730,82526,84583\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,320,0.0064,29162153,145805,146958,16575917,82685,85800\n",
+      "200,32,320,0.0064,29162978,145811,146589,16778054,83692,85621\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,324,0.0065,29522153,147605,148758,16752101,83529,86861\n",
+      "200,32,324,0.0065,29522978,147611,148389,16975790,84670,86933\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,328,0.0065,29882153,149405,150558,16931954,84456,87199\n",
+      "200,32,328,0.0066,29882978,149411,150189,17193806,85651,95908\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,332,0.0066,30242153,151205,152358,17129562,85462,88022\n",
+      "200,32,332,0.0067,30242978,151211,151989,17391042,86658,92746\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,336,0.0067,30602153,153005,154158,17522378,87337,90235\n",
+      "200,32,336,0.0067,30602978,153011,153789,17579650,87566,101073\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,340,0.0067,30962153,154805,155958,17525540,87379,89947\n",
+      "200,32,340,0.0068,30962978,154811,155589,17823659,88601,131503\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,344,0.0068,31322153,156605,157758,17811817,88413,169057\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,344,0.0069,31322978,156611,157389,18045749,89720,131352\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,348,0.0069,31682153,158405,159558,17999372,89772,92601\n",
+      "200,32,348,0.0069,31682978,158411,159189,18233228,90790,129666\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,352,0.0069,32042153,160205,161358,18204371,90776,101494\n",
+      "200,32,352,0.0070,32042978,160211,160989,18429938,91908,93827\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,356,0.0070,32402153,162005,163158,18393456,91621,107055\n",
+      "200,32,356,0.0071,32402978,162011,162789,18723870,92891,169000\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,360,0.0070,32762153,163805,164958,18567077,92476,114024\n",
+      "200,32,360,0.0071,32762978,163811,164589,18839189,93872,104313\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,364,0.0072,33122153,165605,166758,18749614,93562,96291\n",
+      "200,32,364,0.0072,33122978,165611,166389,19052230,94828,108456\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,368,0.0073,33482153,167405,168558,18957503,94465,97467\n",
+      "200,32,368,0.0072,33482978,167411,168189,19224348,95828,106832\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,372,0.0072,33842153,169205,170358,19137907,95471,98421\n",
+      "200,32,372,0.0073,33842978,169211,169989,19409746,96825,98825\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,376,0.0073,34202153,171005,172158,19350029,96457,99505\n",
+      "200,32,376,0.0074,34202978,171011,171789,19635914,97934,100015\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,380,0.0075,34562153,172805,173958,19657158,97897,122483\n",
+      "200,32,380,0.0075,34562978,172811,173589,19901265,99194,108856\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,384,0.0075,34922153,174605,175758,20019224,98872,199167\n",
+      "200,32,384,0.0075,34922978,174611,175389,20087150,100132,113306\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,388,0.0075,35282153,176405,177558,19999785,99747,102911\n",
+      "200,32,388,0.0076,35282978,176411,177189,20289560,101187,111225\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,392,0.0077,35642153,178205,179358,20188679,100586,121054\n",
+      "200,32,392,0.0076,35642978,178211,178989,20478069,102158,104431\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,396,0.0076,36002153,180005,181158,20368637,101583,105060\n",
+      "200,32,396,0.0077,36002978,180011,180789,20703541,103136,118462\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,400,0.0077,36362153,181805,182958,20628698,102607,152896\n",
+      "200,32,400,0.0078,36362978,181811,182589,20889687,104097,116051\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,404,0.0078,36722153,183605,184758,20759711,103503,111551\n",
+      "200,32,404,0.0078,36722978,183611,184389,21103371,105019,150497\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,408,0.0078,37082153,185405,186558,21008339,104552,136230\n",
+      "200,32,408,0.0079,37082978,185411,186189,21343392,106235,146574\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,412,0.0080,37442153,187205,188358,21248565,105961,109252\n",
+      "200,32,412,0.0080,37442978,187211,187989,21499750,107213,116228\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,416,0.0080,37802153,189005,190158,21446394,106998,110446\n",
+      "200,32,416,0.0081,37802978,189011,189789,21769516,108354,153304\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,420,0.0081,38162153,190805,191958,21618503,107795,119989\n",
+      "200,32,420,0.0082,38162978,190811,191589,22016040,109333,166344\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,424,0.0081,38522153,192605,193758,21778142,108604,112064\n",
+      "200,32,424,0.0082,38522978,192611,193389,22124948,110298,112586\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,428,0.0081,38882153,194405,195558,21989784,109653,120306\n",
+      "200,32,428,0.0083,38882978,194411,195189,22375892,111391,164691\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,432,0.0082,39242153,196205,197358,22191881,110730,113916\n",
+      "200,32,432,0.0083,39242978,196211,196989,22605417,112244,161120\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,436,0.0083,39602153,198005,199158,22373426,111587,115657\n",
+      "200,32,436,0.0084,39602978,198011,198789,22698406,113231,115888\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,440,0.0084,39962153,199805,200958,22596402,112638,130342\n",
+      "200,32,440,0.0084,39962978,199811,200589,22946025,114347,124840\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,444,0.0084,40322153,201605,202758,22868323,114041,124888\n",
+      "200,32,444,0.0085,40322978,201611,202389,23138571,115404,122324\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,448,0.0085,40682153,203405,204558,23084361,115132,128588\n",
+      "200,32,448,0.0086,40682978,203411,204189,23382319,116666,118990\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,452,0.0086,41042153,205205,206358,23255449,115787,156348\n",
+      "200,32,452,0.0086,41042978,205211,205989,23582320,117634,123005\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,456,0.0088,41402153,207005,208158,23400730,116742,119985\n",
+      "200,32,456,0.0087,41402978,207011,207789,23777586,118606,121054\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,460,0.0087,41762153,208805,209958,23616057,117782,125672\n",
+      "200,32,460,0.0088,41762978,208811,209589,24021078,119638,157473\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,464,0.0088,42122153,210605,211758,23845815,118769,150383\n",
+      "200,32,464,0.0089,42122978,210611,211389,24177273,120536,137152\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,468,0.0089,42482153,212405,213558,23982677,119580,123029\n",
+      "200,32,468,0.0089,42482978,212411,213189,24354431,121510,124378\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,472,0.0090,42842153,214205,215358,24183894,120688,124270\n",
+      "200,32,472,0.0090,42842978,214211,214989,24680874,122798,163001\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,476,0.0090,43202153,216005,217158,24479273,122149,125974\n",
+      "200,32,476,0.0092,43202978,216011,216789,24806941,123695,126112\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,480,0.0091,43562153,217805,218958,24768939,123125,164217\n",
+      "200,32,480,0.0091,43562978,217811,218589,25036974,124855,131240\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,484,0.0092,43922153,219605,220758,24828983,123895,127390\n",
+      "200,32,484,0.0092,43922978,219611,220389,25277560,125834,159926\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,488,0.0091,44282153,221405,222558,25011559,124768,128788\n",
+      "200,32,488,0.0093,44282978,221411,222189,25492002,126931,169890\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,492,0.0092,44642153,223205,224358,25219550,125760,132732\n",
+      "200,32,492,0.0094,44642978,223211,223989,25799993,127811,292316\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,496,0.0093,45002153,225005,226158,25447017,126853,140428\n",
+      "200,32,496,0.0094,45002978,225011,225789,25879076,128748,186367\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,500,0.0093,45362153,226805,227958,25586059,127650,131094\n",
+      "200,32,500,0.0094,45362978,226811,227589,26021482,129705,143377\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,504,0.0094,45722153,228605,229758,25796559,128739,131932\n",
+      "200,32,504,0.0095,45722978,228611,229389,26309697,130875,185497\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,508,0.0095,46082153,230405,231558,26122261,130275,141242\n",
+      "200,32,508,0.0096,46082978,230411,231189,26445482,131853,134810\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,512,0.0095,46442153,232205,233358,26303806,130890,135216\n",
+      "200,32,512,0.0097,46442978,232211,232989,26722882,133313,135480\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,516,0.0096,46802153,234005,235158,26441241,131860,137807\n",
+      "200,32,516,0.0097,46802978,234011,234789,26902984,134116,143429\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,520,0.0097,47162153,235805,236958,26620814,132726,144193\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,520,0.0098,47162978,235811,236589,27143327,135173,182663\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,524,0.0097,47522153,237605,238758,26895547,133979,180810\n",
+      "200,32,524,0.0101,47522978,237611,238389,27899728,139067,143412\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,528,0.0098,47882153,239405,240558,27103175,134594,195038\n",
+      "200,32,528,0.0099,47882978,239411,240189,27539695,137281,153792\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,532,0.0099,48242153,241205,242358,27216804,135653,148537\n",
+      "200,32,532,0.0100,48242978,241211,241989,27665652,137957,156345\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,536,0.0100,48602153,243005,244158,27609711,137157,225927\n",
+      "200,32,536,0.0102,48602978,243011,243789,27888664,139123,142069\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,540,0.0101,48962153,244805,245958,27856165,138525,222412\n",
+      "200,32,540,0.0102,48962978,244811,245589,28116288,140162,167093\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,544,0.0101,49322153,246605,247758,27949313,139206,146089\n",
+      "200,32,544,0.0102,49322978,246611,247389,28395864,141365,191687\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,548,0.0102,49682153,248405,249558,28071639,140106,144061\n",
+      "200,32,548,0.0105,49682978,248411,249189,28539300,142352,144923\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,552,0.0102,50042153,250205,251358,28221254,140771,147826\n",
+      "200,32,552,0.0104,50042978,250211,250989,28772000,143499,153080\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,556,0.0103,50402153,252005,253158,28466442,141994,145849\n",
+      "200,32,556,0.0104,50402978,252011,252789,28943938,144344,160802\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,560,0.0105,50762153,253805,254958,28785863,142904,194917\n",
+      "200,32,560,0.0105,50762978,253811,254589,29192011,145318,205574\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,564,0.0105,51122153,255605,256758,28851831,143902,156411\n",
+      "200,32,564,0.0106,51122978,255611,256389,29371768,146296,173660\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,568,0.0106,51482153,257405,258558,29223120,145608,162476\n",
+      "200,32,568,0.0107,51482978,257411,258189,29607085,147402,185216\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,572,0.0108,51842153,259205,260358,29438332,146788,151895\n",
+      "200,32,572,0.0109,51842978,259211,259989,29760468,148529,150992\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,576,0.0108,52202153,261005,262158,29557331,147210,151262\n",
+      "200,32,576,0.0108,52202978,261011,261789,30001693,149671,152448\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,580,0.0108,52562153,262805,263958,29704990,148198,158557\n",
+      "200,32,580,0.0109,52562978,262811,263589,30194219,150474,161954\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,584,0.0108,52922153,264605,265758,29996452,149016,250006\n",
+      "200,32,584,0.0110,52922978,264611,265389,30465237,151575,196784\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,588,0.0109,53282153,266405,267558,30123135,150270,154069\n",
+      "200,32,588,0.0112,53282978,266411,267189,30866027,152658,345805\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,592,0.0110,53642153,268205,269358,30283611,150978,165439\n",
+      "200,32,592,0.0112,53642978,268211,268989,30806266,153631,162459\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,596,0.0110,54002153,270005,271158,30512807,152128,156216\n",
+      "200,32,596,0.0112,54002978,270011,270789,31013348,154624,161083\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,600,0.0111,54362153,271805,272958,30713954,153227,157015\n",
+      "200,32,600,0.0113,54362978,271811,272589,31227644,155782,158034\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,604,0.0113,54722153,273605,274758,31116246,155098,162946\n",
+      "200,32,604,0.0115,54722978,273611,274389,31534633,156837,219588\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,608,0.0113,55082153,275405,276558,31292429,155792,166047\n",
+      "200,32,608,0.0114,55082978,275411,276189,31675474,157869,168332\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,612,0.0113,55442153,277205,278358,31367681,156312,187819\n",
+      "200,32,612,0.0115,55442978,277211,277989,31953436,158989,218652\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,616,0.0114,55802153,279005,280158,31509163,156923,173955\n",
+      "200,32,616,0.0116,55802978,279011,279789,32108644,160138,180416\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,620,0.0115,56162153,280805,281958,31751550,158349,162413\n",
+      "200,32,620,0.0116,56162978,280811,281589,32277424,160849,182393\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,624,0.0116,56522153,282605,283758,32010052,159426,164990\n",
+      "200,32,624,0.0118,56522978,282611,283389,32423394,161797,164245\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,628,0.0116,56882153,284405,285558,32270071,160471,206182\n",
+      "200,32,628,0.0117,56882978,284411,285189,32609412,162678,167394\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,632,0.0118,57242153,286205,287358,32379821,161317,166154\n",
+      "200,32,632,0.0118,57242978,286211,286989,32869379,163975,168634\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,636,0.0118,57602153,288005,289158,32621237,162719,174455\n",
+      "200,32,636,0.0119,57602978,288011,288789,33151217,165037,223167\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,640,0.0118,57962153,289805,290958,32760054,163283,174727\n",
+      "200,32,640,0.0119,57962978,289811,290589,33341299,166215,181218\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,644,0.0119,58322153,291605,292758,32895462,163973,168568\n",
+      "200,32,644,0.0121,58322978,291611,292389,33649260,167751,199967\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,648,0.0119,58682153,293405,294558,33046462,164805,176098\n",
+      "200,32,648,0.0121,58682978,293411,294189,33719599,168221,178799\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,652,0.0120,59042153,295205,296358,33305627,166069,179927\n",
+      "200,32,652,0.0122,59042978,295211,295989,34067206,169536,235514\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,656,0.0121,59402153,297005,298158,33611780,166989,248127\n",
+      "200,32,656,0.0122,59402978,297011,297789,34164102,170144,235618\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,660,0.0121,59762153,298805,299958,33791922,168433,184984\n",
+      "200,32,660,0.0123,59762978,298811,299589,34456636,171594,235316\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,664,0.0121,60122153,300605,301758,33927065,169140,182483\n",
+      "200,32,664,0.0124,60122978,300611,301389,34541178,172177,211827\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,668,0.0124,60482153,302405,303558,34476798,171567,188679\n",
+      "200,32,668,0.0124,60482978,302411,303189,34905159,173832,222673\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,672,0.0123,60842153,304205,305358,34350802,171240,175365\n",
+      "200,32,672,0.0126,60842978,304211,304989,34988298,174422,188003\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,676,0.0123,61202153,306005,307158,34529315,172118,202239\n",
+      "200,32,676,0.0126,61202978,306011,306789,35263092,175911,185984\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,680,0.0124,61562153,307805,308958,34716545,172878,244909\n",
+      "200,32,680,0.0127,61562978,307811,308589,35503073,176323,305860\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,684,0.0126,61922153,309605,310758,35111667,174820,186347\n",
+      "200,32,684,0.0128,61922978,309611,310389,35672483,178036,180851\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,688,0.0126,62282153,311405,312558,35200811,175517,179013\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,688,0.0128,62282978,311411,312189,35790039,178289,217803\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,692,0.0126,62642153,313205,314358,35391859,176015,252609\n",
+      "200,32,692,0.0128,62642978,313211,313989,36045752,179866,188983\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,696,0.0127,63002153,315005,316158,35696188,177815,200506\n",
+      "200,32,696,0.0130,63002978,315011,315789,36175144,180438,195986\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,700,0.0128,63362153,316805,317958,35825556,178736,191521\n",
+      "200,32,700,0.0131,63362978,316811,317589,36529049,182248,184897\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,704,0.0129,63722153,318605,319758,36008866,179237,218743\n",
+      "200,32,704,0.0130,63722978,318611,319389,36611747,182765,185703\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,708,0.0129,64082153,320405,321558,36282257,180511,214158\n",
+      "200,32,708,0.0130,64082978,320411,321189,36811496,183626,191140\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,712,0.0129,64442153,322205,323358,36251857,180793,191833\n",
+      "200,32,712,0.0131,64442978,322211,322989,37060383,184588,255521\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,716,0.0131,64802153,324005,325158,36828270,182903,229477\n",
+      "200,32,716,0.0132,64802978,324011,324789,37267356,185684,240236\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,720,0.0130,65162153,325805,326958,36775140,183107,213910\n",
+      "200,32,720,0.0132,65162978,325811,326589,37393434,186562,204926\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,724,0.0131,65522153,327605,328758,36946255,184028,240244\n",
+      "200,32,724,0.0133,65522978,327611,328389,37611724,187635,203956\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,728,0.0132,65882153,329405,330558,37189420,185485,206103\n",
+      "200,32,728,0.0135,65882978,329411,330189,37844476,188685,217329\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,732,0.0133,66242153,331205,332358,37526856,187108,192940\n",
+      "200,32,732,0.0136,66242978,331211,331989,38097715,189879,238003\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,736,0.0134,66602153,333005,334158,37747623,188004,201070\n",
+      "200,32,736,0.0136,66602978,333011,333789,38249665,190960,193797\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,740,0.0134,66962153,334805,335958,37844347,188709,198675\n",
+      "200,32,740,0.0137,66962978,334811,335589,38496135,191882,202980\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,744,0.0134,67322153,336605,337758,37874634,189009,203611\n",
+      "200,32,744,0.0136,67322978,336611,337389,38643004,192776,211409\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,748,0.0136,67682153,338405,339558,38360815,190893,193995\n",
+      "200,32,748,0.0138,67682978,338411,339189,38834497,193752,204307\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,752,0.0137,68042153,340205,341358,38702052,192377,222451\n",
+      "200,32,752,0.0139,68042978,340211,340989,39026422,194674,207102\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,756,0.0136,68402153,342005,343158,38548177,192033,249435\n",
+      "200,32,756,0.0139,68402978,342011,342789,39292510,195755,242534\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,760,0.0138,68762153,343805,344958,39152996,194437,272148\n",
+      "200,32,760,0.0140,68762978,343811,344589,39445808,196904,199749\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,764,0.0138,69122153,345605,346758,39070056,194876,204988\n",
+      "200,32,764,0.0140,69122978,345611,346389,39707448,198140,208159\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,768,0.0138,69482153,347405,348558,39192485,195337,208507\n",
+      "200,32,768,0.0141,69482978,347411,348189,39961335,199314,213386\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,772,0.0139,69842153,349205,350358,39509976,197063,216644\n",
+      "200,32,772,0.0142,69842978,349211,349989,40195551,200268,262442\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,776,0.0140,70202153,351005,352158,39643299,197720,238164\n",
+      "200,32,776,0.0143,70202978,351011,351789,40369481,201262,243178\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,780,0.0141,70562153,352805,353958,40047395,199611,212284\n",
+      "200,32,780,0.0143,70562978,352811,353589,40454251,201889,204769\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,784,0.0142,70922153,354605,355758,40474213,201350,218018\n",
+      "200,32,784,0.0143,70922978,354611,355389,40804167,203132,292206\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,788,0.0143,71282153,356405,357558,40369690,200941,270257\n",
+      "200,32,788,0.0144,71282978,356411,357189,40880258,203888,220805\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,792,0.0143,71642153,358205,359358,40667289,202430,244792\n",
+      "200,32,792,0.0145,71642978,358211,358989,41141375,205195,222680\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,796,0.0145,72002153,360005,361158,41245212,205315,244622\n",
+      "200,32,796,0.0145,72002978,360011,360789,41346667,205890,276619\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,800,0.0144,72362153,361805,362958,41042713,204407,249254\n",
+      "200,32,800,0.0146,72362978,361811,362589,41586665,207290,248916\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,804,0.0145,72722153,363605,364758,41137099,205254,211445\n",
+      "200,32,804,0.0147,72722978,363611,364389,41696398,208106,211465\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,808,0.0145,73082153,365405,366558,41267168,205869,210553\n",
+      "200,32,808,0.0148,73082978,365411,366189,41978951,209272,255137\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,812,0.0146,73442153,367205,368358,41538016,207083,242270\n",
+      "200,32,812,0.0148,73442978,367211,367989,42187366,209918,283393\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,816,0.0147,73802153,369005,370158,41856937,208198,257079\n",
+      "200,32,816,0.0149,73802978,369011,369789,42482639,211214,322437\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,820,0.0149,74162153,370805,371958,42581251,211598,220361\n",
+      "200,32,820,0.0149,74162978,370811,371589,42512865,212010,227823\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,824,0.0148,74522153,372605,373758,42106929,210144,214780\n",
+      "200,32,824,0.0151,74522978,372611,373389,42861251,213412,278868\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,828,0.0151,74882153,374405,375558,42954101,213100,216189\n",
+      "200,32,828,0.0151,74882978,374411,375189,42979335,214191,262439\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,832,0.0150,75242153,376205,377358,42591682,212393,217281\n",
+      "200,32,832,0.0152,75242978,376211,376989,43402619,215543,296991\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,836,0.0150,75602153,378005,379158,42833889,213607,225147\n",
+      "200,32,836,0.0152,75602978,378011,378789,43382253,216450,232179\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,840,0.0151,75962153,379805,380958,42888365,213833,258282\n",
+      "200,32,840,0.0154,75962978,379811,380589,43665001,217538,261020\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,844,0.0151,76322153,381605,382758,43234463,215605,228741\n",
+      "200,32,844,0.0154,76322978,381611,382389,43762162,218196,232967\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,848,0.0152,76682153,383405,384558,43340508,216058,240778\n",
+      "200,32,848,0.0156,76682978,383411,384189,44077885,219619,233562\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,852,0.0154,77042153,385205,386358,43964132,218702,263707\n",
+      "200,32,852,0.0155,77042978,385211,385989,44269902,220266,357562\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,856,0.0155,77402153,387005,388158,43738562,218168,230126\n",
+      "200,32,856,0.0156,77402978,387011,387789,44458368,221658,275183\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,860,0.0154,77762153,388805,389958,44071523,219837,238185\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,860,0.0156,77762978,388811,389589,44599845,222530,244104\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,864,0.0155,78122153,390605,391758,44411093,221177,232408\n",
+      "200,32,864,0.0158,78122978,390611,391389,44856987,223898,229495\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,868,0.0157,78482153,392405,393558,44526424,222013,237960\n",
+      "200,32,868,0.0157,78482978,392411,393189,45070339,224667,268426\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,872,0.0158,78842153,394205,395358,45188815,224084,346189\n",
+      "200,32,872,0.0158,78842978,394211,394989,45243346,225686,238504\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,876,0.0156,79202153,396005,397158,44700630,222996,237268\n",
+      "200,32,876,0.0160,79202978,396011,396789,45425044,226467,285843\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,880,0.0158,79562153,397805,398958,45208957,224813,328325\n",
+      "200,32,880,0.0160,79562978,397811,398589,45637897,227585,255503\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,884,0.0159,79922153,399605,400758,45474656,226439,239215\n",
+      "200,32,884,0.0163,79922978,399611,400389,45922301,228540,294854\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,888,0.0160,80282153,401405,402558,45766475,227867,240911\n",
+      "200,32,888,0.0161,80282978,401411,402189,46210377,229936,317062\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,892,0.0160,80642153,403205,404358,45940503,228819,243891\n",
+      "200,32,892,0.0161,80642978,403211,403989,46224897,230736,244030\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,896,0.0161,81002153,405005,406158,45973712,229111,241548\n",
+      "200,32,896,0.0163,81002978,405011,405789,46706945,232252,393574\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,900,0.0162,81362153,406805,407958,46447521,230613,346027\n",
+      "200,32,900,0.0163,81362978,406811,407589,46846573,233803,243774\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,904,0.0163,81722153,408605,409758,46859527,233117,305572\n",
+      "200,32,904,0.0165,81722978,408611,409389,47211102,235424,247115\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,908,0.0164,82082153,410405,411558,47123610,234871,284329\n",
+      "200,32,908,0.0165,82082978,410411,411189,47420647,236067,308146\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,912,0.0166,82442153,412205,413358,47816182,237201,366650\n",
+      "200,32,912,0.0167,82442978,412211,412989,47664515,237299,252663\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,916,0.0166,82802153,414005,415158,47456504,236767,248921\n",
+      "200,32,916,0.0166,82802978,414011,414789,47825500,238210,307878\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,920,0.0165,83162153,415805,416958,47592162,237459,265738\n",
+      "200,32,920,0.0168,83162978,415811,416589,48024315,239591,249230\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,924,0.0167,83522153,417605,418758,48057683,239541,276783\n",
+      "200,32,924,0.0168,83522978,417611,418389,48204506,240348,286103\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,928,0.0167,83882153,419405,420558,48171706,239841,277682\n",
+      "200,32,928,0.0168,83882978,419411,420189,48474452,241766,272232\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,932,0.0170,84242153,421205,422358,48721591,242883,245719\n",
+      "200,32,932,0.0169,84242978,421211,421989,48643328,242408,310910\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,936,0.0169,84602153,423005,424158,48377712,241387,254877\n",
+      "200,32,936,0.0170,84602978,423011,423789,49041567,243670,350571\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,940,0.0169,84962153,424805,425958,48721762,242855,255300\n",
+      "200,32,940,0.0171,84962978,424811,425589,49009612,244295,313509\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,944,0.0170,85322153,426605,427758,49035991,243372,370914\n",
+      "200,32,944,0.0171,85322978,426611,427389,49257311,245620,259650\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,948,0.0171,85682153,428405,429558,49070436,244800,262067\n",
+      "200,32,948,0.0172,85682978,428411,429189,49415667,246533,254714\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,952,0.0171,86042153,430205,431358,49234273,245636,258683\n",
+      "200,32,952,0.0172,86042978,430211,430989,49711139,247671,319628\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,956,0.0172,86402153,432005,433158,49586922,247001,316148\n",
+      "200,32,956,0.0174,86402978,432011,432789,49856592,248552,271876\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,960,0.0172,86762153,433805,434958,49640943,247637,284307\n",
+      "200,32,960,0.0174,86762978,433811,434589,50136102,249978,265617\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,964,0.0177,87122153,435605,436758,51436885,256453,266477\n",
+      "200,32,964,0.0176,87122978,435611,436389,50925446,253713,295499\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,968,0.0178,87482153,437405,438558,51146832,254991,267861\n",
+      "200,32,968,0.0178,87482978,437411,438189,51035835,253858,318894\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,972,0.0177,87842153,439205,440358,51377929,256333,274159\n",
+      "200,32,972,0.0177,87842978,439211,439989,51188317,255334,306288\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,976,0.0179,88202153,441005,442158,51360933,256336,265049\n",
+      "200,32,976,0.0178,88202978,441011,441789,51436023,256205,289239\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,980,0.0179,88562153,442805,443958,51845435,258521,293602\n",
+      "200,32,980,0.0179,88562978,442811,443589,51703656,257814,300077\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,984,0.0180,88922153,444605,445758,52129373,259818,262711\n",
+      "200,32,984,0.0179,88922978,444611,445389,51801305,257947,349721\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,988,0.0181,89282153,446405,447558,52262963,260903,278224\n",
+      "200,32,988,0.0181,89282978,446411,447189,52056854,259676,262216\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,992,0.0182,89642153,448205,449358,52407317,261432,272849\n",
+      "200,32,992,0.0182,89642978,448211,448989,52237864,260535,269494\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,996,0.0184,90002153,450005,451158,53286503,265403,275404\n",
+      "200,32,996,0.0183,90002978,450011,450789,52526126,262024,274178\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,1000,0.0182,90362153,451805,452958,53051777,264487,273734\n",
+      "200,32,1000,0.0182,90362978,451811,452589,52578843,262284,265526\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,1004,0.0183,90722153,453605,454758,53153647,264834,340140\n",
+      "200,32,1004,0.0183,90722978,453611,454389,52896370,263840,273834\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,1008,0.0183,91082153,455405,456558,53025643,264711,274578\n",
+      "200,32,1008,0.0183,91082978,455411,456189,53074476,264385,308471\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,1012,0.0185,91442153,457205,458358,53709439,267192,353247\n",
+      "200,32,1012,0.0184,91442978,457211,457989,53382079,266422,284446\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,1016,0.0186,91802153,459005,460158,54036527,268786,339099\n",
+      "200,32,1016,0.0186,91802978,459011,459789,53434221,266486,275700\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,1020,0.0186,92162153,460805,461958,54154888,269844,327020\n",
+      "200,32,1020,0.0186,92162978,460811,461589,53712164,268036,277528\n",
       "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,1024,0.0183,92522153,462605,463758,52875104,262839,332332\n",
-      "mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv .\n"
+      "200,32,1024,0.0187,92522978,462611,463389,53754294,268076,276795\n",
+      "mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.ins_cyc.bin.csv .\n"
      ]
     }
    ],
@@ -694,17 +684,18 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Once the run is completed, let's have a look at the data!\n",
+    "Once the run is completed, let's study the data!\n",
     "\n",
     "This can be done best in the interactive version of the Jupyter Notebook. In case this version of the description is unavailable to you, call the Makefile target `make graph_task1` (either with X forwarding, or download the resulting PDF)."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
+    "import numpy as np\n",
     "import seaborn as sns\n",
     "import pandas as pd\n",
     "import matplotlib.pyplot as plt\n",
@@ -714,9 +705,25 @@
     "plt.rcParams['figure.figsize'] = [14, 6]"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Execute the following cell if you want to switch to a more color-blind-friendly color palette."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sns.set_palette(\"colorblind\")"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 77,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -750,8 +757,7 @@
        "      <th>PM_RUN_CYC (total)</th>\n",
        "      <th>PM_RUN_CYC (min)</th>\n",
        "      <th>PM_RUN_CYC (max)</th>\n",
-       "      <th>Instructions / Loop Iteration</th>\n",
-       "      <th>Cycles / Loop Iteration</th>\n",
+       "      <th>Grid Points</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
@@ -761,14 +767,13 @@
        "      <td>32</td>\n",
        "      <td>4</td>\n",
        "      <td>0.0012</td>\n",
-       "      <td>548153</td>\n",
-       "      <td>2735</td>\n",
-       "      <td>3888</td>\n",
-       "      <td>266883</td>\n",
-       "      <td>1237</td>\n",
-       "      <td>4793</td>\n",
-       "      <td>21.367188</td>\n",
-       "      <td>9.664062</td>\n",
+       "      <td>572978</td>\n",
+       "      <td>2861</td>\n",
+       "      <td>3639</td>\n",
+       "      <td>261330</td>\n",
+       "      <td>1235</td>\n",
+       "      <td>4684</td>\n",
+       "      <td>128</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
@@ -776,14 +781,13 @@
        "      <td>32</td>\n",
        "      <td>8</td>\n",
        "      <td>0.0014</td>\n",
-       "      <td>1082153</td>\n",
-       "      <td>5405</td>\n",
-       "      <td>6558</td>\n",
-       "      <td>668819</td>\n",
-       "      <td>3214</td>\n",
-       "      <td>6623</td>\n",
-       "      <td>21.113281</td>\n",
-       "      <td>12.554688</td>\n",
+       "      <td>1082978</td>\n",
+       "      <td>5411</td>\n",
+       "      <td>6189</td>\n",
+       "      <td>601962</td>\n",
+       "      <td>2914</td>\n",
+       "      <td>5099</td>\n",
+       "      <td>256</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
@@ -791,44 +795,41 @@
        "      <td>32</td>\n",
        "      <td>12</td>\n",
        "      <td>0.0014</td>\n",
-       "      <td>1442153</td>\n",
-       "      <td>7205</td>\n",
-       "      <td>8358</td>\n",
-       "      <td>872913</td>\n",
-       "      <td>4187</td>\n",
-       "      <td>11640</td>\n",
-       "      <td>18.763021</td>\n",
-       "      <td>10.903646</td>\n",
+       "      <td>1442978</td>\n",
+       "      <td>7211</td>\n",
+       "      <td>7989</td>\n",
+       "      <td>811603</td>\n",
+       "      <td>3992</td>\n",
+       "      <td>5761</td>\n",
+       "      <td>384</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>200</td>\n",
        "      <td>32</td>\n",
        "      <td>16</td>\n",
-       "      <td>0.0015</td>\n",
-       "      <td>1802153</td>\n",
-       "      <td>9005</td>\n",
-       "      <td>10158</td>\n",
-       "      <td>1077532</td>\n",
-       "      <td>5254</td>\n",
-       "      <td>8147</td>\n",
-       "      <td>17.587891</td>\n",
-       "      <td>10.261719</td>\n",
+       "      <td>0.0014</td>\n",
+       "      <td>1802978</td>\n",
+       "      <td>9011</td>\n",
+       "      <td>9789</td>\n",
+       "      <td>1017305</td>\n",
+       "      <td>4988</td>\n",
+       "      <td>7017</td>\n",
+       "      <td>512</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>200</td>\n",
        "      <td>32</td>\n",
        "      <td>20</td>\n",
-       "      <td>0.0016</td>\n",
-       "      <td>2162153</td>\n",
-       "      <td>10805</td>\n",
-       "      <td>11958</td>\n",
-       "      <td>1277957</td>\n",
-       "      <td>6209</td>\n",
-       "      <td>9015</td>\n",
-       "      <td>16.882812</td>\n",
-       "      <td>9.701562</td>\n",
+       "      <td>0.0015</td>\n",
+       "      <td>2162978</td>\n",
+       "      <td>10811</td>\n",
+       "      <td>11589</td>\n",
+       "      <td>1221559</td>\n",
+       "      <td>6002</td>\n",
+       "      <td>7999</td>\n",
+       "      <td>640</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -836,28 +837,28 @@
       ],
       "text/plain": [
        "   iter  ny  nx  Runtime  PM_INST_CMPL (total)  PM_INST_CMPL (min)  \\\n",
-       "0   200  32   4   0.0012                548153                2735   \n",
-       "1   200  32   8   0.0014               1082153                5405   \n",
-       "2   200  32  12   0.0014               1442153                7205   \n",
-       "3   200  32  16   0.0015               1802153                9005   \n",
-       "4   200  32  20   0.0016               2162153               10805   \n",
+       "0   200  32   4   0.0012                572978                2861   \n",
+       "1   200  32   8   0.0014               1082978                5411   \n",
+       "2   200  32  12   0.0014               1442978                7211   \n",
+       "3   200  32  16   0.0014               1802978                9011   \n",
+       "4   200  32  20   0.0015               2162978               10811   \n",
        "\n",
        "    PM_INST_CMPL (max)  PM_RUN_CYC (total)  PM_RUN_CYC (min)  \\\n",
-       "0                 3888              266883              1237   \n",
-       "1                 6558              668819              3214   \n",
-       "2                 8358              872913              4187   \n",
-       "3                10158             1077532              5254   \n",
-       "4                11958             1277957              6209   \n",
+       "0                 3639              261330              1235   \n",
+       "1                 6189              601962              2914   \n",
+       "2                 7989              811603              3992   \n",
+       "3                 9789             1017305              4988   \n",
+       "4                11589             1221559              6002   \n",
        "\n",
-       "    PM_RUN_CYC (max)  Instructions / Loop Iteration  Cycles / Loop Iteration  \n",
-       "0               4793                      21.367188                 9.664062  \n",
-       "1               6623                      21.113281                12.554688  \n",
-       "2              11640                      18.763021                10.903646  \n",
-       "3               8147                      17.587891                10.261719  \n",
-       "4               9015                      16.882812                 9.701562  "
+       "    PM_RUN_CYC (max)  Grid Points  \n",
+       "0               4684          128  \n",
+       "1               5099          256  \n",
+       "2               5761          384  \n",
+       "3               7017          512  \n",
+       "4               7999          640  "
       ]
      },
-     "execution_count": 77,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -865,40 +866,171 @@
    "source": [
     "plt.rcParams['figure.figsize'] = [14, 6]\n",
     "df = pd.read_csv(\"poisson2d.ins_cyc.bin.csv\", skiprows=range(2, 50000, 2))  # Read in the CSV file from the bench run; parse with Pandas\n",
-    "common.normalize(df, \"PM_INST_CMPL (min)\", \"Instructions / Loop Iteration\")  # Normalize to each grid cell\n",
-    "common.normalize(df, \"PM_RUN_CYC (min)\", \"Cycles / Loop Iteration\")\n",
+    "df[\"Grid Points\"] = df[\"nx\"] * df[\"ny\"]  # Add a new column of the number of grid points (the product of nx and ny)\n",
     "df.head()  # Display the head of the Pandas dataframe"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's have a look at the counters we've just measured and see how they scale with an increasing number of grid points.\n",
+    "\n",
+    "*In the following, we are always using the minimal value of the counter (indicated by »(min)«) as this should give us an estimate of the best achievable result of the architecture.*"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 78,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "image/png": "\n",
+      "image/png": "\n",
       "text/plain": [
        "<Figure size 1008x432 with 2 Axes>"
       ]
      },
-     "metadata": {},
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n",
+    "df.set_index(\"Grid Points\")[\"PM_RUN_CYC (min)\"].plot(ax=ax1, legend=True);\n",
+    "df.set_index(\"Grid Points\")[\"PM_INST_CMPL (min)\"].plot(ax=ax2, legend=True);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Although some slight variations can be seen for run cycles for many grid points, the correlation looks quite linear (as one would naively expect). Let's test that by fitting a linear function!\n",
+    "\n",
+    "*The details of the fitting have been extracted into a dedicated function, `print_and_return_fit()`, of the `common.py` helper file. If you're interested, [go have a look at it](common.py).* "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def linear_function(x, a, b):\n",
+    "    return a*x+b"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Counter   PM_RUN_CYC (min) is proportional to the grid points (nx*ny) by a factor of  8.1021 (± 0.0057)\n",
+      "Counter PM_INST_CMPL (min) is proportional to the grid points (nx*ny) by a factor of 14.0630 (± 0.0003)\n"
+     ]
+    }
+   ],
+   "source": [
+    "fit_parameters, fit_covariance = common.print_and_return_fit(\n",
+    "    [\"PM_RUN_CYC (min)\", \"PM_INST_CMPL (min)\"], \n",
+    "    df.set_index(\"Grid Points\"), \n",
+    "    linear_function,\n",
+    "    format_uncertainty=\".4f\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's overlay our fits to the graphs from before."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 1008x432 with 2 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
      "output_type": "display_data"
     }
    ],
    "source": [
-    "# Plot Cycles and Instructions - both per grid cell\n",
     "fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n",
-    "df.set_index(\"nx\")[\"Cycles / Loop Iteration\"].plot(ax=ax1, legend=True);\n",
-    "df.set_index(\"nx\")[\"Instructions / Loop Iteration\"].plot(ax=ax2, legend=True);"
+    "for ax, pmu_counter in zip([ax1, ax2], [\"PM_RUN_CYC (min)\", \"PM_INST_CMPL (min)\"]):\n",
+    "    df.set_index(\"Grid Points\")[pmu_counter].plot(ax=ax, legend=True);\n",
+    "    ax.plot(\n",
+    "        df[\"Grid Points\"], \n",
+    "        linear_function(df[\"Grid Points\"], *fit_parameters[pmu_counter]), \n",
+    "        linestyle=\"--\", \n",
+    "        label=\"Fit: {:.2f} * x + {:.2f}\".format(*fit_parameters[pmu_counter])\n",
+    "    )\n",
+    "    ax.legend();"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "What is your result? What value do the graphs come asymptotically close too?\n",
+    "Please execute the next cell to summarize the first task."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The algorithm under investigation runs about 8 cycles and executes about 14 instructions per grid point\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"The algorithm under investigation runs about {:.0f} cycles and executes about {:.0f} instructions per grid point\".format(\n",
+    "    *[fit_parameters[pmu_counter][0] for pmu_counter in [\"PM_RUN_CYC (min)\", \"PM_INST_CMPL (min)\"]]\n",
+    "))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Bonus:**\n",
     "\n",
+    "The linear fits also calculate a y-axis intercept (»`b`«). How do you interpret this value?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "exercise": "solution"
+   },
+   "source": [
+    "The y-axis intercept (that is, `b` of the linear fit) is the inherent overhead of the program execution. Even if our program did not compute any stencil operation at all for any grid point, it would still complete this many (~1800) instructions and run this many (~680) cycles. Interestingly, it is also the unparallelizable overhead of this (toy) example."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "We are revisiting the graph in a little while.\n",
     "\n",
     "[Back to top](#toc)"
@@ -915,7 +1047,10 @@
     "\n",
     "Let's compare your estimate to what the system actually does!\n",
     "\n",
-    "<a name=\"task2-a\"></a>**TASK A**: Please measure counters for loads and stores. See the TODOs in [`poisson2d.ld_st.c`](/edit/Tasks/poisson2d.ld_st.c). This time, implement `PM_LD_CMPL` and `PM_ST_CMPL`.\n",
+    "### Task A\n",
+    "<a name=\"task2-a\"></a>\n",
+    "\n",
+    "Please measure counters for loads and stores. See the TODOs in [`poisson2d.ld_st.c`](/edit/Tasks/poisson2d.ld_st.c). This time, implement `PM_LD_CMPL` and `PM_ST_CMPL`.\n",
     "\n",
     "Compile with `make task2`, test your program with a single run with `make run_task2`, and then finally submit a benchmarking run to the batch system with `make bench_task2`. The following cell will take care of all this.\n",
     "\n",
@@ -924,561 +1059,530 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "gcc -DUSE_DOUBLE -Ofast -std=c99 -lm -lpapi  poisson2d.ld_st.c -o poisson2d.ld_st.bin\n",
-      "bsub -W 60 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.ld_st.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv\n",
-      "Job <4032> is submitted to default queue <batch>.\n",
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.ld_st.bin /gpfs/wolf/trn003/scratch/aherten//poisson2d.ld_st.bin.csv\n",
+      "Job <24416> is submitted to default queue <batch>.\n",
       "<<Waiting for dispatch ...>>\n",
       "<<Starting on login1>>\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,4,0.0012,95115,474,789,21343,106,249\n",
+      "200,32,4,0.0012,119819,598,817,32902,164,266\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,8,0.0014,137115,684,999,33343,166,309\n",
+      "200,32,8,0.0013,161819,808,1027,56902,284,386\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,12,0.0014,197115,984,1299,45343,226,369\n",
+      "200,32,12,0.0014,221819,1108,1327,71902,359,461\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,16,0.0015,257115,1284,1599,63343,316,459\n",
+      "200,32,16,0.0015,281819,1408,1627,86902,434,536\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,20,0.0016,317115,1584,1899,75343,376,519\n",
+      "200,32,20,0.0015,341819,1708,1927,101902,509,611\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,24,0.0016,377115,1884,2199,93343,466,609\n",
+      "200,32,24,0.0016,401819,2008,2227,116902,584,686\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,28,0.0017,437115,2184,2499,105343,526,669\n",
+      "200,32,28,0.0016,461819,2308,2527,131902,659,761\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,32,0.0017,497115,2484,2799,123343,616,759\n",
+      "200,32,32,0.0018,521819,2608,2827,146902,734,836\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,36,0.0018,557115,2784,3099,135343,676,819\n",
+      "200,32,36,0.0018,581819,2908,3127,161902,809,911\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,40,0.0020,617115,3084,3399,153343,766,909\n",
+      "200,32,40,0.0018,641819,3208,3427,176902,884,986\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,44,0.0019,677115,3384,3699,165343,826,969\n",
+      "200,32,44,0.0019,701819,3508,3727,191902,959,1061\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,48,0.0020,737115,3684,3999,183343,916,1059\n",
+      "200,32,48,0.0020,761819,3808,4027,206902,1034,1136\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,52,0.0021,797115,3984,4299,195343,976,1119\n",
+      "200,32,52,0.0020,821819,4108,4327,221902,1109,1211\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,56,0.0021,857115,4284,4599,213343,1066,1209\n",
+      "200,32,56,0.0021,881819,4408,4627,236902,1184,1286\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,60,0.0023,917115,4584,4899,225343,1126,1269\n",
+      "200,32,60,0.0022,941819,4708,4927,251902,1259,1361\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,64,0.0023,977115,4884,5199,243343,1216,1359\n",
+      "200,32,64,0.0023,1001819,5008,5227,266902,1334,1436\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,68,0.0024,1037115,5184,5499,255343,1276,1419\n",
+      "200,32,68,0.0023,1061819,5308,5527,281902,1409,1511\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,72,0.0025,1097115,5484,5799,273343,1366,1509\n",
+      "200,32,72,0.0025,1121819,5608,5827,296902,1484,1586\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,76,0.0025,1157115,5784,6099,285343,1426,1569\n",
+      "200,32,76,0.0028,1181819,5908,6127,311902,1559,1661\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,80,0.0025,1217115,6084,6399,303343,1516,1659\n",
+      "200,32,80,0.0025,1241819,6208,6427,326902,1634,1736\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,84,0.0026,1277115,6384,6699,315343,1576,1719\n",
+      "200,32,84,0.0026,1301819,6508,6727,341902,1709,1811\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,88,0.0027,1337115,6684,6999,333343,1666,1809\n",
+      "200,32,88,0.0026,1361819,6808,7027,356902,1784,1886\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,92,0.0027,1397115,6984,7299,345343,1726,1869\n",
+      "200,32,92,0.0027,1421819,7108,7327,371902,1859,1961\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,96,0.0028,1457115,7284,7599,363343,1816,1959\n",
+      "200,32,96,0.0028,1481819,7408,7627,386902,1934,2036\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,100,0.0029,1517115,7584,7899,375343,1876,2019\n",
+      "200,32,100,0.0029,1541819,7708,7927,401902,2009,2111\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,104,0.0029,1577115,7884,8199,393343,1966,2109\n",
+      "200,32,104,0.0029,1601819,8008,8227,416902,2084,2186\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,108,0.0030,1637115,8184,8499,405343,2026,2169\n",
+      "200,32,108,0.0031,1661819,8308,8527,431902,2159,2261\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,112,0.0030,1697115,8484,8799,423343,2116,2259\n",
+      "200,32,112,0.0030,1721819,8608,8827,446902,2234,2336\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,116,0.0031,1757115,8784,9099,435343,2176,2319\n",
+      "200,32,116,0.0031,1781819,8908,9127,461902,2309,2411\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,120,0.0033,1817115,9084,9399,453343,2266,2409\n",
+      "200,32,120,0.0032,1841819,9208,9427,476902,2384,2486\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,124,0.0032,1877115,9384,9699,465343,2326,2469\n",
+      "200,32,124,0.0033,1901819,9508,9727,491902,2459,2561\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,128,0.0033,1937115,9684,9999,483343,2416,2559\n",
+      "200,32,128,0.0033,1961819,9808,10027,506902,2534,2636\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,132,0.0034,1997115,9984,10299,495343,2476,2619\n",
+      "200,32,132,0.0034,2021819,10108,10327,521902,2609,2711\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,136,0.0035,2057115,10284,10599,513343,2566,2709\n",
+      "200,32,136,0.0035,2081819,10408,10627,536902,2684,2786\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,140,0.0035,2117115,10584,10899,525343,2626,2769\n",
+      "200,32,140,0.0036,2141819,10708,10927,551902,2759,2861\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,144,0.0036,2177115,10884,11199,543343,2716,2859\n",
+      "200,32,144,0.0036,2201819,11008,11227,566902,2834,2936\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,148,0.0036,2237115,11184,11499,555343,2776,2919\n",
+      "200,32,148,0.0036,2261819,11308,11527,581902,2909,3011\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,152,0.0037,2297115,11484,11799,573343,2866,3009\n",
+      "200,32,152,0.0037,2321819,11608,11827,596902,2984,3086\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,156,0.0038,2357115,11784,12099,585343,2926,3069\n",
+      "200,32,156,0.0038,2381819,11908,12127,611902,3059,3161\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,160,0.0038,2417115,12084,12399,603343,3016,3159\n",
+      "200,32,160,0.0040,2441819,12208,12427,626902,3134,3236\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,164,0.0039,2477115,12384,12699,615343,3076,3219\n",
+      "200,32,164,0.0039,2501819,12508,12727,641902,3209,3311\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,168,0.0039,2537115,12684,12999,633343,3166,3309\n",
+      "200,32,168,0.0040,2561819,12808,13027,656902,3284,3386\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,172,0.0040,2597115,12984,13299,645343,3226,3369\n",
+      "200,32,172,0.0040,2621819,13108,13327,671902,3359,3461\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,176,0.0041,2657115,13284,13599,663343,3316,3459\n",
+      "200,32,176,0.0041,2681819,13408,13627,686902,3434,3536\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,180,0.0041,2717115,13584,13899,675343,3376,3519\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,180,0.0041,2741819,13708,13927,701902,3509,3611\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,184,0.0042,2777115,13884,14199,693343,3466,3609\n",
+      "200,32,184,0.0042,2801819,14008,14227,716902,3584,3686\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,188,0.0043,2837115,14184,14499,705343,3526,3669\n",
+      "200,32,188,0.0044,2861819,14308,14527,731902,3659,3761\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,192,0.0043,2897115,14484,14799,723343,3616,3759\n",
+      "200,32,192,0.0044,2921819,14608,14827,746902,3734,3836\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,196,0.0044,2957115,14784,15099,735343,3676,3819\n",
+      "200,32,196,0.0045,2981819,14908,15127,761902,3809,3911\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,200,0.0045,3017115,15084,15399,753343,3766,3909\n",
+      "200,32,200,0.0045,3041819,15208,15427,776902,3884,3986\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,204,0.0045,3077115,15384,15699,765343,3826,3969\n",
+      "200,32,204,0.0045,3101819,15508,15727,791902,3959,4061\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,208,0.0046,3137115,15684,15999,783343,3916,4059\n",
+      "200,32,208,0.0046,3161819,15808,16027,806902,4034,4136\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,212,0.0047,3197115,15984,16299,795343,3976,4119\n",
+      "200,32,212,0.0047,3221819,16108,16327,821902,4109,4211\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,216,0.0047,3257115,16284,16599,813343,4066,4209\n",
+      "200,32,216,0.0047,3281819,16408,16627,836902,4184,4286\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,220,0.0048,3317115,16584,16899,825343,4126,4269\n",
+      "200,32,220,0.0048,3341819,16708,16927,851902,4259,4361\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,224,0.0049,3377115,16884,17199,843343,4216,4359\n",
+      "200,32,224,0.0049,3401819,17008,17227,866902,4334,4436\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,228,0.0049,3437115,17184,17499,855343,4276,4419\n",
+      "200,32,228,0.0050,3461819,17308,17527,881902,4409,4511\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,232,0.0050,3497115,17484,17799,873343,4366,4509\n",
+      "200,32,232,0.0050,3521819,17608,17827,896902,4484,4586\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,236,0.0051,3557115,17784,18099,885343,4426,4569\n",
+      "200,32,236,0.0051,3581819,17908,18127,911902,4559,4661\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,240,0.0052,3617115,18084,18399,903343,4516,4659\n",
+      "200,32,240,0.0051,3641819,18208,18427,926902,4634,4736\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,244,0.0052,3677115,18384,18699,915343,4576,4719\n",
+      "200,32,244,0.0052,3701819,18508,18727,941902,4709,4811\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,248,0.0052,3737115,18684,18999,933343,4666,4809\n",
+      "200,32,248,0.0053,3761819,18808,19027,956902,4784,4886\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,252,0.0054,3797115,18984,19299,945343,4726,4869\n",
+      "200,32,252,0.0053,3821819,19108,19327,971902,4859,4961\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,256,0.0054,3857115,19284,19599,963343,4816,4959\n",
+      "200,32,256,0.0054,3881819,19408,19627,986902,4934,5036\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,260,0.0054,3917115,19584,19899,975343,4876,5019\n",
+      "200,32,260,0.0055,3941819,19708,19927,1001902,5009,5111\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,264,0.0055,3977115,19884,20199,993343,4966,5109\n",
+      "200,32,264,0.0055,4001819,20008,20227,1016902,5084,5186\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,268,0.0056,4037115,20184,20499,1005343,5026,5169\n",
+      "200,32,268,0.0056,4061819,20308,20527,1031902,5159,5261\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,272,0.0056,4097115,20484,20799,1023343,5116,5259\n",
+      "200,32,272,0.0057,4121819,20608,20827,1046902,5234,5336\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,276,0.0057,4157115,20784,21099,1035343,5176,5319\n",
+      "200,32,276,0.0057,4181819,20908,21127,1061902,5309,5411\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,280,0.0057,4217115,21084,21399,1053343,5266,5409\n",
+      "200,32,280,0.0058,4241819,21208,21427,1076902,5384,5486\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,284,0.0058,4277115,21384,21699,1065343,5326,5469\n",
+      "200,32,284,0.0059,4301819,21508,21727,1091902,5459,5561\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,288,0.0059,4337115,21684,21999,1083343,5416,5559\n",
+      "200,32,288,0.0059,4361819,21808,22027,1106902,5534,5636\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,292,0.0059,4397115,21984,22299,1095343,5476,5619\n",
+      "200,32,292,0.0060,4421819,22108,22327,1121902,5609,5711\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,296,0.0061,4457115,22284,22599,1113343,5566,5709\n",
+      "200,32,296,0.0061,4481819,22408,22627,1136902,5684,5786\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,300,0.0061,4517115,22584,22899,1125343,5626,5769\n",
+      "200,32,300,0.0061,4541819,22708,22927,1151902,5759,5861\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,304,0.0061,4577115,22884,23199,1143343,5716,5859\n",
+      "200,32,304,0.0062,4601819,23008,23227,1166902,5834,5936\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,308,0.0062,4637115,23184,23499,1155343,5776,5919\n",
+      "200,32,308,0.0063,4661819,23308,23527,1181902,5909,6011\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,312,0.0063,4697115,23484,23799,1173343,5866,6009\n",
+      "200,32,312,0.0064,4721819,23608,23827,1196902,5984,6086\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,316,0.0064,4757115,23784,24099,1185343,5926,6069\n",
+      "200,32,316,0.0066,4781819,23908,24127,1211902,6059,6161\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,320,0.0064,4817115,24084,24399,1203343,6016,6159\n",
+      "200,32,320,0.0065,4841819,24208,24427,1226902,6134,6236\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,324,0.0065,4877115,24384,24699,1215343,6076,6219\n",
+      "200,32,324,0.0065,4901819,24508,24727,1241902,6209,6311\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,328,0.0065,4937115,24684,24999,1233343,6166,6309\n",
+      "200,32,328,0.0069,4961819,24808,25027,1256902,6284,6386\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,332,0.0066,4997115,24984,25299,1245343,6226,6369\n",
+      "200,32,332,0.0066,5021819,25108,25327,1271902,6359,6461\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,336,0.0066,5057115,25284,25599,1263343,6316,6459\n",
+      "200,32,336,0.0067,5081819,25408,25627,1286902,6434,6536\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,340,0.0068,5117115,25584,25899,1275343,6376,6519\n",
+      "200,32,340,0.0068,5141819,25708,25927,1301902,6509,6611\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,344,0.0068,5177115,25884,26199,1293343,6466,6609\n",
+      "200,32,344,0.0069,5201819,26008,26227,1316902,6584,6686\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,348,0.0069,5237115,26184,26499,1305343,6526,6669\n",
+      "200,32,348,0.0069,5261819,26308,26527,1331902,6659,6761\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,352,0.0071,5297115,26484,26799,1323343,6616,6759\n",
+      "200,32,352,0.0070,5321819,26608,26827,1346902,6734,6836\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,356,0.0070,5357115,26784,27099,1335343,6676,6819\n",
+      "200,32,356,0.0070,5381819,26908,27127,1361902,6809,6911\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,360,0.0070,5417115,27084,27399,1353343,6766,6909\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,360,0.0071,5441819,27208,27427,1376902,6884,6986\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,364,0.0071,5477115,27384,27699,1365343,6826,6969\n",
+      "200,32,364,0.0072,5501819,27508,27727,1391902,6959,7061\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,368,0.0072,5537115,27684,27999,1383343,6916,7059\n",
+      "200,32,368,0.0072,5561819,27808,28027,1406902,7034,7136\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,372,0.0073,5597115,27984,28299,1395343,6976,7119\n",
+      "200,32,372,0.0073,5621819,28108,28327,1421902,7109,7211\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,376,0.0073,5657115,28284,28599,1413343,7066,7209\n",
+      "200,32,376,0.0074,5681819,28408,28627,1436902,7184,7286\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,380,0.0074,5717115,28584,28899,1425343,7126,7269\n",
+      "200,32,380,0.0074,5741819,28708,28927,1451902,7259,7361\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,384,0.0074,5777115,28884,29199,1443343,7216,7359\n",
+      "200,32,384,0.0075,5801819,29008,29227,1466902,7334,7436\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,388,0.0075,5837115,29184,29499,1455343,7276,7419\n",
+      "200,32,388,0.0076,5861819,29308,29527,1481902,7409,7511\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,392,0.0076,5897115,29484,29799,1473343,7366,7509\n",
+      "200,32,392,0.0076,5921819,29608,29827,1496902,7484,7586\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,396,0.0076,5957115,29784,30099,1485343,7426,7569\n",
+      "200,32,396,0.0077,5981819,29908,30127,1511902,7559,7661\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,400,0.0078,6017115,30084,30399,1503343,7516,7659\n",
+      "200,32,400,0.0078,6041819,30208,30427,1526902,7634,7736\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,404,0.0078,6077115,30384,30699,1515343,7576,7719\n",
+      "200,32,404,0.0079,6101819,30508,30727,1541902,7709,7811\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,408,0.0078,6137115,30684,30999,1533343,7666,7809\n",
+      "200,32,408,0.0079,6161819,30808,31027,1556902,7784,7886\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,412,0.0079,6197115,30984,31299,1545343,7726,7869\n",
+      "200,32,412,0.0080,6221819,31108,31327,1571902,7859,7961\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,416,0.0080,6257115,31284,31599,1563343,7816,7959\n",
+      "200,32,416,0.0081,6281819,31408,31627,1586902,7934,8036\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,420,0.0080,6317115,31584,31899,1575343,7876,8019\n",
+      "200,32,420,0.0081,6341819,31708,31927,1601902,8009,8111\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,424,0.0081,6377115,31884,32199,1593343,7966,8109\n",
+      "200,32,424,0.0082,6401819,32008,32227,1616902,8084,8186\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,428,0.0081,6437115,32184,32499,1605343,8026,8169\n",
+      "200,32,428,0.0082,6461819,32308,32527,1631902,8159,8261\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,432,0.0082,6497115,32484,32799,1623343,8116,8259\n",
+      "200,32,432,0.0085,6521819,32608,32827,1646902,8234,8336\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,436,0.0083,6557115,32784,33099,1635343,8176,8319\n",
+      "200,32,436,0.0084,6581819,32908,33127,1661902,8309,8411\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,440,0.0083,6617115,33084,33399,1653343,8266,8409\n",
+      "200,32,440,0.0084,6641819,33208,33427,1676902,8384,8486\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,444,0.0084,6677115,33384,33699,1665343,8326,8469\n",
+      "200,32,444,0.0085,6701819,33508,33727,1691902,8459,8561\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,448,0.0085,6737115,33684,33999,1683343,8416,8559\n",
+      "200,32,448,0.0087,6761819,33808,34027,1706902,8534,8636\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,452,0.0085,6797115,33984,34299,1695343,8476,8619\n",
+      "200,32,452,0.0087,6821819,34108,34327,1721902,8609,8711\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,456,0.0086,6857115,34284,34599,1713343,8566,8709\n",
+      "200,32,456,0.0087,6881819,34408,34627,1736902,8684,8786\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,460,0.0087,6917115,34584,34899,1725343,8626,8769\n",
+      "200,32,460,0.0088,6941819,34708,34927,1751902,8759,8861\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,464,0.0088,6977115,34884,35199,1743343,8716,8859\n",
+      "200,32,464,0.0088,7001819,35008,35227,1766902,8834,8936\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,468,0.0088,7037115,35184,35499,1755343,8776,8919\n",
+      "200,32,468,0.0089,7061819,35308,35527,1781902,8909,9011\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,472,0.0089,7097115,35484,35799,1773343,8866,9009\n",
+      "200,32,472,0.0090,7121819,35608,35827,1796902,8984,9086\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,476,0.0090,7157115,35784,36099,1785343,8926,9069\n",
+      "200,32,476,0.0091,7181819,35908,36127,1811902,9059,9161\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,480,0.0090,7217115,36084,36399,1803343,9016,9159\n",
+      "200,32,480,0.0091,7241819,36208,36427,1826902,9134,9236\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,484,0.0091,7277115,36384,36699,1815343,9076,9219\n",
+      "200,32,484,0.0092,7301819,36508,36727,1841902,9209,9311\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,488,0.0091,7337115,36684,36999,1833343,9166,9309\n",
+      "200,32,488,0.0093,7361819,36808,37027,1856902,9284,9386\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,492,0.0092,7397115,36984,37299,1845343,9226,9369\n",
+      "200,32,492,0.0094,7421819,37108,37327,1871902,9359,9461\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,496,0.0093,7457115,37284,37599,1863343,9316,9459\n",
+      "200,32,496,0.0095,7481819,37408,37627,1886902,9434,9536\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,500,0.0093,7517115,37584,37899,1875343,9376,9519\n",
+      "200,32,500,0.0094,7541819,37708,37927,1901902,9509,9611\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,504,0.0094,7577115,37884,38199,1893343,9466,9609\n",
+      "200,32,504,0.0095,7601819,38008,38227,1916902,9584,9686\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,508,0.0095,7637115,38184,38499,1905343,9526,9669\n",
+      "200,32,508,0.0096,7661819,38308,38527,1931902,9659,9761\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,512,0.0095,7697115,38484,38799,1923343,9616,9759\n",
+      "200,32,512,0.0097,7721819,38608,38827,1946902,9734,9836\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,516,0.0096,7757115,38784,39099,1938343,9691,9834\n",
+      "200,32,516,0.0098,7781819,38908,39127,1961902,9809,9911\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,520,0.0097,7817115,39084,39399,1953343,9766,9909\n",
+      "200,32,520,0.0098,7841819,39208,39427,1976902,9884,9986\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,524,0.0097,7877115,39384,39699,1968343,9841,9984\n",
+      "200,32,524,0.0099,7901819,39508,39727,1991902,9959,10061\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,528,0.0098,7937115,39684,39999,1983343,9916,10059\n",
+      "200,32,528,0.0099,7961819,39808,40027,2006902,10034,10136\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,532,0.0099,7997115,39984,40299,1998343,9991,10134\n",
+      "200,32,532,0.0100,8021819,40108,40327,2021902,10109,10211\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,536,0.0100,8057115,40284,40599,2013343,10066,10209\n",
+      "200,32,536,0.0101,8081819,40408,40627,2036902,10184,10286\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,540,0.0101,8117115,40584,40899,2028343,10141,10284\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,540,0.0101,8141819,40708,40927,2051902,10259,10361\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,544,0.0101,8177115,40884,41199,2043343,10216,10359\n",
+      "200,32,544,0.0103,8201819,41008,41227,2066902,10334,10436\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,548,0.0102,8237115,41184,41499,2058343,10291,10434\n",
+      "200,32,548,0.0103,8261819,41308,41527,2081902,10409,10511\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,552,0.0103,8297115,41484,41799,2073343,10366,10509\n",
+      "200,32,552,0.0104,8321819,41608,41827,2096902,10484,10586\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,556,0.0104,8357115,41784,42099,2088343,10441,10584\n",
+      "200,32,556,0.0106,8381819,41908,42127,2111902,10559,10661\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,560,0.0104,8417115,42084,42399,2103343,10516,10659\n",
+      "200,32,560,0.0106,8441819,42208,42427,2126902,10634,10736\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,564,0.0105,8477115,42384,42699,2118343,10591,10734\n",
+      "200,32,564,0.0106,8501819,42508,42727,2141902,10709,10811\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,568,0.0106,8537115,42684,42999,2133343,10666,10809\n",
+      "200,32,568,0.0107,8561819,42808,43027,2156902,10784,10886\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,572,0.0106,8597115,42984,43299,2148343,10741,10884\n",
+      "200,32,572,0.0108,8621819,43108,43327,2171902,10859,10961\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,576,0.0107,8657115,43284,43599,2163343,10816,10959\n",
+      "200,32,576,0.0109,8681819,43408,43627,2186902,10934,11036\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,580,0.0109,8717115,43584,43899,2178343,10891,11034\n",
+      "200,32,580,0.0110,8741819,43708,43927,2201902,11009,11111\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,584,0.0108,8777115,43884,44199,2193343,10966,11109\n",
+      "200,32,584,0.0110,8801819,44008,44227,2216902,11084,11186\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,588,0.0110,8837115,44184,44499,2208343,11041,11184\n",
+      "200,32,588,0.0110,8861819,44308,44527,2231902,11159,11261\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,592,0.0110,8897115,44484,44799,2223343,11116,11259\n",
+      "200,32,592,0.0111,8921819,44608,44827,2246902,11234,11336\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,596,0.0111,8957115,44784,45099,2238343,11191,11334\n",
+      "200,32,596,0.0113,8981819,44908,45127,2261902,11309,11411\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,600,0.0111,9017115,45084,45399,2253343,11266,11409\n",
+      "200,32,600,0.0113,9041819,45208,45427,2276902,11384,11486\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,604,0.0112,9077115,45384,45699,2268343,11341,11484\n",
+      "200,32,604,0.0114,9101819,45508,45727,2291902,11459,11561\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,608,0.0113,9137115,45684,45999,2283343,11416,11559\n",
+      "200,32,608,0.0115,9161819,45808,46027,2306902,11534,11636\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,612,0.0113,9197115,45984,46299,2298343,11491,11634\n",
+      "200,32,612,0.0115,9221819,46108,46327,2321902,11609,11711\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,616,0.0114,9257115,46284,46599,2313343,11566,11709\n",
+      "200,32,616,0.0115,9281819,46408,46627,2336902,11684,11786\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,620,0.0115,9317115,46584,46899,2328343,11641,11784\n",
+      "200,32,620,0.0116,9341819,46708,46927,2351902,11759,11861\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,624,0.0115,9377115,46884,47199,2343343,11716,11859\n",
+      "200,32,624,0.0117,9401819,47008,47227,2366902,11834,11936\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,628,0.0115,9437115,47184,47499,2358343,11791,11934\n",
+      "200,32,628,0.0117,9461819,47308,47527,2381902,11909,12011\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,632,0.0117,9497115,47484,47799,2373343,11866,12009\n",
+      "200,32,632,0.0118,9521819,47608,47827,2396902,11984,12086\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,636,0.0118,9557115,47784,48099,2388343,11941,12084\n",
+      "200,32,636,0.0119,9581819,47908,48127,2411902,12059,12161\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,640,0.0119,9617115,48084,48399,2403343,12016,12159\n",
+      "200,32,640,0.0119,9641819,48208,48427,2426902,12134,12236\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,644,0.0118,9677115,48384,48699,2418343,12091,12234\n",
+      "200,32,644,0.0121,9701819,48508,48727,2441902,12209,12311\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,648,0.0119,9737115,48684,48999,2433343,12166,12309\n",
+      "200,32,648,0.0121,9761819,48808,49027,2456902,12284,12386\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,652,0.0121,9797115,48984,49299,2448343,12241,12384\n",
+      "200,32,652,0.0121,9821819,49108,49327,2471902,12359,12461\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,656,0.0121,9857115,49284,49599,2463343,12316,12459\n",
+      "200,32,656,0.0122,9881819,49408,49627,2486902,12434,12536\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,660,0.0122,9917115,49584,49899,2478343,12391,12534\n",
+      "200,32,660,0.0123,9941819,49708,49927,2501902,12509,12611\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,664,0.0122,9977115,49884,50199,2493343,12466,12609\n",
+      "200,32,664,0.0123,10001819,50008,50227,2516902,12584,12686\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,668,0.0123,10037115,50184,50499,2508343,12541,12684\n",
+      "200,32,668,0.0124,10061819,50308,50527,2531902,12659,12761\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,672,0.0123,10097115,50484,50799,2523343,12616,12759\n",
+      "200,32,672,0.0124,10121819,50608,50827,2546902,12734,12836\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,676,0.0125,10157115,50784,51099,2538343,12691,12834\n",
+      "200,32,676,0.0126,10181819,50908,51127,2561902,12809,12911\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,680,0.0124,10217115,51084,51399,2553343,12766,12909\n",
+      "200,32,680,0.0126,10241819,51208,51427,2576902,12884,12986\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,684,0.0125,10277115,51384,51699,2568343,12841,12984\n",
+      "200,32,684,0.0127,10301819,51508,51727,2591902,12959,13061\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,688,0.0126,10337115,51684,51999,2583343,12916,13059\n",
+      "200,32,688,0.0128,10361819,51808,52027,2606902,13034,13136\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,692,0.0126,10397115,51984,52299,2598343,12991,13134\n",
+      "200,32,692,0.0128,10421819,52108,52327,2621902,13109,13211\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,696,0.0127,10457115,52284,52599,2613343,13066,13209\n",
+      "200,32,696,0.0129,10481819,52408,52627,2636902,13184,13286\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,700,0.0128,10517115,52584,52899,2628343,13141,13284\n",
+      "200,32,700,0.0131,10541819,52708,52927,2651902,13259,13361\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,704,0.0129,10577115,52884,53199,2643343,13216,13359\n",
+      "200,32,704,0.0131,10601819,53008,53227,2666902,13334,13436\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,708,0.0129,10637115,53184,53499,2658343,13291,13434\n",
+      "200,32,708,0.0130,10661819,53308,53527,2681902,13409,13511\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,712,0.0129,10697115,53484,53799,2673343,13366,13509\n",
+      "200,32,712,0.0131,10721819,53608,53827,2696902,13484,13586\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,716,0.0130,10757115,53784,54099,2688343,13441,13584\n",
+      "200,32,716,0.0132,10781819,53908,54127,2711902,13559,13661\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,720,0.0130,10817115,54084,54399,2703343,13516,13659\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,720,0.0132,10841819,54208,54427,2726902,13634,13736\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,724,0.0132,10877115,54384,54699,2718343,13591,13734\n",
+      "200,32,724,0.0134,10901819,54508,54727,2741902,13709,13811\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,728,0.0131,10937115,54684,54999,2733343,13666,13809\n",
+      "200,32,728,0.0134,10961819,54808,55027,2756902,13784,13886\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,732,0.0133,10997115,54984,55299,2748343,13741,13884\n",
+      "200,32,732,0.0134,11021819,55108,55327,2771902,13859,13961\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,736,0.0135,11057115,55284,55599,2763343,13816,13959\n",
+      "200,32,736,0.0135,11081819,55408,55627,2786902,13934,14036\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,740,0.0134,11117115,55584,55899,2778343,13891,14034\n",
+      "200,32,740,0.0137,11141819,55708,55927,2801902,14009,14111\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,744,0.0134,11177115,55884,56199,2793343,13966,14109\n",
+      "200,32,744,0.0138,11201819,56008,56227,2816902,14084,14186\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,748,0.0135,11237115,56184,56499,2808343,14041,14184\n",
+      "200,32,748,0.0137,11261819,56308,56527,2831902,14159,14261\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,752,0.0136,11297115,56484,56799,2823343,14116,14259\n",
+      "200,32,752,0.0138,11321819,56608,56827,2846902,14234,14336\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,756,0.0136,11357115,56784,57099,2838343,14191,14334\n",
+      "200,32,756,0.0139,11381819,56908,57127,2861902,14309,14411\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,760,0.0138,11417115,57084,57399,2853343,14266,14409\n",
+      "200,32,760,0.0140,11441819,57208,57427,2876902,14384,14486\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,764,0.0139,11477115,57384,57699,2868343,14341,14484\n",
+      "200,32,764,0.0140,11501819,57508,57727,2891902,14459,14561\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,768,0.0138,11537115,57684,57999,2883343,14416,14559\n",
+      "200,32,768,0.0141,11561819,57808,58027,2906902,14534,14636\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,772,0.0140,11597115,57984,58299,2898343,14491,14634\n",
+      "200,32,772,0.0141,11621819,58108,58327,2921902,14609,14711\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,776,0.0140,11657115,58284,58599,2913343,14566,14709\n",
+      "200,32,776,0.0142,11681819,58408,58627,2936902,14684,14786\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,780,0.0142,11717115,58584,58899,2928343,14641,14784\n",
+      "200,32,780,0.0143,11741819,58708,58927,2951902,14759,14861\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,784,0.0141,11777115,58884,59199,2943343,14716,14859\n",
+      "200,32,784,0.0144,11801819,59008,59227,2966902,14834,14936\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,788,0.0143,11837115,59184,59499,2958343,14791,14934\n",
+      "200,32,788,0.0144,11861819,59308,59527,2981902,14909,15011\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,792,0.0143,11897115,59484,59799,2973343,14866,15009\n",
+      "200,32,792,0.0145,11921819,59608,59827,2996902,14984,15086\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,796,0.0146,11957115,59784,60099,2988343,14941,15084\n",
+      "200,32,796,0.0145,11981819,59908,60127,3011902,15059,15161\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,800,0.0144,12017115,60084,60399,3003343,15016,15159\n",
+      "200,32,800,0.0147,12041819,60208,60427,3026902,15134,15236\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,804,0.0145,12077115,60384,60699,3018343,15091,15234\n",
+      "200,32,804,0.0147,12101819,60508,60727,3041902,15209,15311\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,808,0.0146,12137115,60684,60999,3033343,15166,15309\n",
+      "200,32,808,0.0148,12161819,60808,61027,3056902,15284,15386\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,812,0.0146,12197115,60984,61299,3048343,15241,15384\n",
+      "200,32,812,0.0148,12221819,61108,61327,3071902,15359,15461\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,816,0.0146,12257115,61284,61599,3063343,15316,15459\n",
+      "200,32,816,0.0150,12281819,61408,61627,3086902,15434,15536\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,820,0.0148,12317115,61584,61899,3078343,15391,15534\n",
+      "200,32,820,0.0149,12341819,61708,61927,3101902,15509,15611\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,824,0.0149,12377115,61884,62199,3093343,15466,15609\n",
+      "200,32,824,0.0150,12401819,62008,62227,3116902,15584,15686\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,828,0.0149,12437115,62184,62499,3108343,15541,15684\n",
+      "200,32,828,0.0151,12461819,62308,62527,3131902,15659,15761\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,832,0.0149,12497115,62484,62799,3123343,15616,15759\n",
+      "200,32,832,0.0152,12521819,62608,62827,3146902,15734,15836\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,836,0.0151,12557115,62784,63099,3138343,15691,15834\n",
+      "200,32,836,0.0152,12581819,62908,63127,3161902,15809,15911\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,840,0.0150,12617115,63084,63399,3153343,15766,15909\n",
+      "200,32,840,0.0153,12641819,63208,63427,3176902,15884,15986\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,844,0.0152,12677115,63384,63699,3168343,15841,15984\n",
+      "200,32,844,0.0153,12701819,63508,63727,3191902,15959,16061\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,848,0.0152,12737115,63684,63999,3183343,15916,16059\n",
+      "200,32,848,0.0154,12761819,63808,64027,3206902,16034,16136\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,852,0.0153,12797115,63984,64299,3198343,15991,16134\n",
+      "200,32,852,0.0155,12821819,64108,64327,3221902,16109,16211\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,856,0.0153,12857115,64284,64599,3213343,16066,16209\n",
+      "200,32,856,0.0156,12881819,64408,64627,3236902,16184,16286\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,860,0.0155,12917115,64584,64899,3228343,16141,16284\n",
+      "200,32,860,0.0156,12941819,64708,64927,3251902,16259,16361\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,864,0.0156,12977115,64884,65199,3243343,16216,16359\n",
+      "200,32,864,0.0157,13001819,65008,65227,3266902,16334,16436\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,868,0.0157,13037115,65184,65499,3258343,16291,16434\n",
+      "200,32,868,0.0158,13061819,65308,65527,3281902,16409,16511\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,872,0.0156,13097115,65484,65799,3273343,16366,16509\n",
+      "200,32,872,0.0159,13121819,65608,65827,3296902,16484,16586\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,876,0.0157,13157115,65784,66099,3288343,16441,16584\n",
+      "200,32,876,0.0159,13181819,65908,66127,3311902,16559,16661\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,880,0.0158,13217115,66084,66399,3303343,16516,16659\n",
+      "200,32,880,0.0160,13241819,66208,66427,3326902,16634,16736\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,884,0.0158,13277115,66384,66699,3318343,16591,16734\n",
+      "200,32,884,0.0160,13301819,66508,66727,3341902,16709,16811\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,888,0.0159,13337115,66684,66999,3333343,16666,16809\n",
+      "200,32,888,0.0161,13361819,66808,67027,3356902,16784,16886\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,892,0.0160,13397115,66984,67299,3348343,16741,16884\n",
+      "200,32,892,0.0162,13421819,67108,67327,3371902,16859,16961\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,896,0.0161,13457115,67284,67599,3363343,16816,16959\n",
+      "200,32,896,0.0163,13481819,67408,67627,3386902,16934,17036\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,900,0.0162,13517115,67584,67899,3378343,16891,17034\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,900,0.0164,13541819,67708,67927,3401902,17009,17111\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,904,0.0163,13577115,67884,68199,3393343,16966,17109\n",
+      "200,32,904,0.0165,13601819,68008,68227,3416902,17084,17186\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,908,0.0164,13637115,68184,68499,3408343,17041,17184\n",
+      "200,32,908,0.0165,13661819,68308,68527,3431902,17159,17261\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,912,0.0165,13697115,68484,68799,3423343,17116,17259\n",
+      "200,32,912,0.0166,13721819,68608,68827,3446902,17234,17336\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,916,0.0165,13757115,68784,69099,3438343,17191,17334\n",
+      "200,32,916,0.0166,13781819,68908,69127,3461902,17309,17411\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,920,0.0165,13817115,69084,69399,3453343,17266,17409\n",
+      "200,32,920,0.0167,13841819,69208,69427,3476902,17384,17486\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,924,0.0168,13877115,69384,69699,3468343,17341,17484\n",
+      "200,32,924,0.0168,13901819,69508,69727,3491902,17459,17561\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,928,0.0167,13937115,69684,69999,3483343,17416,17559\n",
+      "200,32,928,0.0169,13961819,69808,70027,3506902,17534,17636\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,932,0.0169,13997115,69984,70299,3498343,17491,17634\n",
+      "200,32,932,0.0175,14021819,70108,70327,3521902,17609,17711\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,936,0.0168,14057115,70284,70599,3513343,17566,17709\n",
+      "200,32,936,0.0170,14081819,70408,70627,3536902,17684,17786\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,940,0.0169,14117115,70584,70899,3528343,17641,17784\n",
+      "200,32,940,0.0171,14141819,70708,70927,3551902,17759,17861\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,944,0.0169,14177115,70884,71199,3543343,17716,17859\n",
+      "200,32,944,0.0171,14201819,71008,71227,3566902,17834,17936\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,948,0.0170,14237115,71184,71499,3558343,17791,17934\n",
+      "200,32,948,0.0172,14261819,71308,71527,3581902,17909,18011\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,952,0.0171,14297115,71484,71799,3573343,17866,18009\n",
+      "200,32,952,0.0172,14321819,71608,71827,3596902,17984,18086\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,956,0.0173,14357115,71784,72099,3588343,17941,18084\n",
+      "200,32,956,0.0173,14381819,71908,72127,3611902,18059,18161\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,960,0.0172,14417115,72084,72399,3603343,18016,18159\n",
+      "200,32,960,0.0174,14441819,72208,72427,3626902,18134,18236\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,964,0.0177,14477115,72384,72699,3618343,18091,18234\n",
+      "200,32,964,0.0176,14501819,72508,72727,3641902,18209,18311\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,968,0.0177,14537115,72684,72999,3633343,18166,18309\n",
+      "200,32,968,0.0178,14561819,72808,73027,3656902,18284,18386\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,972,0.0177,14597115,72984,73299,3648343,18241,18384\n",
+      "200,32,972,0.0177,14621819,73108,73327,3671902,18359,18461\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,976,0.0179,14657115,73284,73599,3663343,18316,18459\n",
+      "200,32,976,0.0178,14681819,73408,73627,3686902,18434,18536\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,980,0.0180,14717115,73584,73899,3678343,18391,18534\n",
+      "200,32,980,0.0179,14741819,73708,73927,3701902,18509,18611\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,984,0.0180,14777115,73884,74199,3693343,18466,18609\n",
+      "200,32,984,0.0179,14801819,74008,74227,3716902,18584,18686\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,988,0.0180,14837115,74184,74499,3708343,18541,18684\n",
+      "200,32,988,0.0180,14861819,74308,74527,3731902,18659,18761\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,992,0.0181,14897115,74484,74799,3723343,18616,18759\n",
+      "200,32,992,0.0181,14921819,74608,74827,3746902,18734,18836\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,996,0.0184,14957115,74784,75099,3738343,18691,18834\n",
+      "200,32,996,0.0182,14981819,74908,75127,3761902,18809,18911\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,1000,0.0182,15017115,75084,75399,3753343,18766,18909\n",
+      "200,32,1000,0.0182,15041819,75208,75427,3776902,18884,18986\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,1004,0.0183,15077115,75384,75699,3768343,18841,18984\n",
+      "200,32,1004,0.0183,15101819,75508,75727,3791902,18959,19061\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,1008,0.0184,15137115,75684,75999,3783343,18916,19059\n",
+      "200,32,1008,0.0183,15161819,75808,76027,3806902,19034,19136\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,1012,0.0185,15197115,75984,76299,3798343,18991,19134\n",
+      "200,32,1012,0.0184,15221819,76108,76327,3821902,19109,19211\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,1016,0.0185,15257115,76284,76599,3813343,19066,19209\n",
+      "200,32,1016,0.0185,15281819,76408,76627,3836902,19184,19286\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,1020,0.0186,15317115,76584,76899,3828343,19141,19284\n",
+      "200,32,1020,0.0185,15341819,76708,76927,3851902,19259,19361\n",
       "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,1024,0.0183,15377115,76884,77199,3843343,19216,19359\n",
-      "mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv .\n"
+      "200,32,1024,0.0186,15401819,77008,77227,3866902,19334,19436\n",
+      "mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.ld_st.bin.csv .\n"
      ]
     }
    ],
@@ -1490,12 +1594,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Once the run finished, let's plot it again with the following cell (non-interactive: `make graph_task2a`)."
+    "Once the run has finished, let's plot it again over the course of the following cells (non-interactive: `make graph_task2a`)."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -1529,8 +1633,7 @@
        "      <th>PM_ST_CMPL (total)</th>\n",
        "      <th>PM_ST_CMPL (min)</th>\n",
        "      <th>PM_ST_CMPL (max)</th>\n",
-       "      <th>Loads / Loop Iteration</th>\n",
-       "      <th>Stores / Loop Iteration</th>\n",
+       "      <th>Grid Points</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
@@ -1540,29 +1643,27 @@
        "      <td>32</td>\n",
        "      <td>4</td>\n",
        "      <td>0.0012</td>\n",
-       "      <td>95115</td>\n",
-       "      <td>474</td>\n",
-       "      <td>789</td>\n",
-       "      <td>21343</td>\n",
-       "      <td>106</td>\n",
-       "      <td>249</td>\n",
-       "      <td>3.703125</td>\n",
-       "      <td>0.828125</td>\n",
+       "      <td>119819</td>\n",
+       "      <td>598</td>\n",
+       "      <td>817</td>\n",
+       "      <td>32902</td>\n",
+       "      <td>164</td>\n",
+       "      <td>266</td>\n",
+       "      <td>128</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>200</td>\n",
        "      <td>32</td>\n",
        "      <td>8</td>\n",
-       "      <td>0.0014</td>\n",
-       "      <td>137115</td>\n",
-       "      <td>684</td>\n",
-       "      <td>999</td>\n",
-       "      <td>33343</td>\n",
-       "      <td>166</td>\n",
-       "      <td>309</td>\n",
-       "      <td>2.671875</td>\n",
-       "      <td>0.648438</td>\n",
+       "      <td>0.0013</td>\n",
+       "      <td>161819</td>\n",
+       "      <td>808</td>\n",
+       "      <td>1027</td>\n",
+       "      <td>56902</td>\n",
+       "      <td>284</td>\n",
+       "      <td>386</td>\n",
+       "      <td>256</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
@@ -1570,14 +1671,13 @@
        "      <td>32</td>\n",
        "      <td>12</td>\n",
        "      <td>0.0014</td>\n",
-       "      <td>197115</td>\n",
-       "      <td>984</td>\n",
-       "      <td>1299</td>\n",
-       "      <td>45343</td>\n",
-       "      <td>226</td>\n",
-       "      <td>369</td>\n",
-       "      <td>2.562500</td>\n",
-       "      <td>0.588542</td>\n",
+       "      <td>221819</td>\n",
+       "      <td>1108</td>\n",
+       "      <td>1327</td>\n",
+       "      <td>71902</td>\n",
+       "      <td>359</td>\n",
+       "      <td>461</td>\n",
+       "      <td>384</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
@@ -1585,29 +1685,27 @@
        "      <td>32</td>\n",
        "      <td>16</td>\n",
        "      <td>0.0015</td>\n",
-       "      <td>257115</td>\n",
-       "      <td>1284</td>\n",
-       "      <td>1599</td>\n",
-       "      <td>63343</td>\n",
-       "      <td>316</td>\n",
-       "      <td>459</td>\n",
-       "      <td>2.507812</td>\n",
-       "      <td>0.617188</td>\n",
+       "      <td>281819</td>\n",
+       "      <td>1408</td>\n",
+       "      <td>1627</td>\n",
+       "      <td>86902</td>\n",
+       "      <td>434</td>\n",
+       "      <td>536</td>\n",
+       "      <td>512</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>200</td>\n",
        "      <td>32</td>\n",
        "      <td>20</td>\n",
-       "      <td>0.0016</td>\n",
-       "      <td>317115</td>\n",
-       "      <td>1584</td>\n",
-       "      <td>1899</td>\n",
-       "      <td>75343</td>\n",
-       "      <td>376</td>\n",
-       "      <td>519</td>\n",
-       "      <td>2.475000</td>\n",
-       "      <td>0.587500</td>\n",
+       "      <td>0.0015</td>\n",
+       "      <td>341819</td>\n",
+       "      <td>1708</td>\n",
+       "      <td>1927</td>\n",
+       "      <td>101902</td>\n",
+       "      <td>509</td>\n",
+       "      <td>611</td>\n",
+       "      <td>640</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -1615,59 +1713,130 @@
       ],
       "text/plain": [
        "   iter  ny  nx  Runtime  PM_LD_CMPL (total)  PM_LD_CMPL (min)  \\\n",
-       "0   200  32   4   0.0012               95115               474   \n",
-       "1   200  32   8   0.0014              137115               684   \n",
-       "2   200  32  12   0.0014              197115               984   \n",
-       "3   200  32  16   0.0015              257115              1284   \n",
-       "4   200  32  20   0.0016              317115              1584   \n",
+       "0   200  32   4   0.0012              119819               598   \n",
+       "1   200  32   8   0.0013              161819               808   \n",
+       "2   200  32  12   0.0014              221819              1108   \n",
+       "3   200  32  16   0.0015              281819              1408   \n",
+       "4   200  32  20   0.0015              341819              1708   \n",
        "\n",
        "    PM_LD_CMPL (max)  PM_ST_CMPL (total)  PM_ST_CMPL (min)   PM_ST_CMPL (max)  \\\n",
-       "0                789               21343               106                249   \n",
-       "1                999               33343               166                309   \n",
-       "2               1299               45343               226                369   \n",
-       "3               1599               63343               316                459   \n",
-       "4               1899               75343               376                519   \n",
+       "0                817               32902               164                266   \n",
+       "1               1027               56902               284                386   \n",
+       "2               1327               71902               359                461   \n",
+       "3               1627               86902               434                536   \n",
+       "4               1927              101902               509                611   \n",
        "\n",
-       "   Loads / Loop Iteration  Stores / Loop Iteration  \n",
-       "0                3.703125                 0.828125  \n",
-       "1                2.671875                 0.648438  \n",
-       "2                2.562500                 0.588542  \n",
-       "3                2.507812                 0.617188  \n",
-       "4                2.475000                 0.587500  "
+       "   Grid Points  \n",
+       "0          128  \n",
+       "1          256  \n",
+       "2          384  \n",
+       "3          512  \n",
+       "4          640  "
       ]
      },
-     "execution_count": 6,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "df_ldst = pd.read_csv(\"poisson2d.ld_st.bin.csv\", skiprows=range(2, 50000, 2))\n",
-    "common.normalize(df_ldst, \"PM_LD_CMPL (min)\", \"Loads / Loop Iteration\")\n",
-    "common.normalize(df_ldst, \"PM_ST_CMPL (min)\", \"Stores / Loop Iteration\")\n",
+    "df_ldst[\"Grid Points\"] = df_ldst[\"nx\"] * df_ldst[\"ny\"] \n",
     "df_ldst.head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 79,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "image/png": "\n",
+      "image/png": "\n",
       "text/plain": [
        "<Figure size 1008x432 with 2 Axes>"
       ]
      },
-     "metadata": {},
+     "metadata": {
+      "needs_background": "light"
+     },
      "output_type": "display_data"
     }
    ],
    "source": [
     "fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n",
-    "df_ldst.set_index(\"nx\")[\"Loads / Loop Iteration\"].plot(ax=ax1, legend=True);\n",
-    "df_ldst.set_index(\"nx\")[\"Stores / Loop Iteration\"].plot(ax=ax2, legend=True);"
+    "df_ldst.set_index(\"Grid Points\")[\"PM_LD_CMPL (min)\"].plot(ax=ax1, legend=True);\n",
+    "df_ldst.set_index(\"Grid Points\")[\"PM_ST_CMPL (min)\"].plot(ax=ax2, legend=True);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This behaviour also looks – at first glance – linear. We can again fit a first-order polynomial (and re-use our previously defined function `curve_fit`)!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Counter PM_LD_CMPL (min) is proportional to the grid points (nx*ny) by a factor of 2.3437 (± 0.000037)\n",
+      "Counter PM_ST_CMPL (min) is proportional to the grid points (nx*ny) by a factor of 0.5860 (± 0.000019)\n"
+     ]
+    }
+   ],
+   "source": [
+    "_fit, _cov = common.print_and_return_fit(\n",
+    "    [\"PM_LD_CMPL (min)\", \"PM_ST_CMPL (min)\"], \n",
+    "    df_ldst.set_index(\"Grid Points\"), \n",
+    "    linear_function,\n",
+    "    format_value=\".4f\"\n",
+    ")\n",
+    "fit_parameters = {**fit_parameters, **_fit}\n",
+    "fit_covariance = {**fit_covariance, **_cov}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's overlay this in one common plot:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 1008x432 with 2 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n",
+    "for ax, pmu_counter in zip([ax1, ax2], [\"PM_LD_CMPL (min)\", \"PM_ST_CMPL (min)\"]):  # one subplot per counter\n",
+    "    df_ldst.set_index(\"Grid Points\")[pmu_counter].plot(ax=ax, legend=True);\n",
+    "    ax.plot(\n",
+    "        df_ldst[\"Grid Points\"], \n",
+    "        linear_function(df_ldst[\"Grid Points\"], *fit_parameters[pmu_counter]),  # evaluate the fit on the same frame whose x values are plotted\n",
+    "        linestyle=\"--\", \n",
+    "        label=\"Fit: {:.2f} * x + {:.2f}\".format(*fit_parameters[pmu_counter])\n",
+    "    )\n",
+    "    ax.legend();"
    ]
   },
   {
@@ -1676,9 +1845,12 @@
    "source": [
     "Did you expect more?\n",
     "\n",
-    "The reason is simple: Among the load and store instructions counted by `PM_LD_CMPL` and `PM_ST_CMPL` are vector instructions which can load and store multiple (two) values at a time. To see how many *bytes* are loaded and stored, we need to measure counters for vectorized loads and stores as well.\n",
+    "The reason is simple: Among the load and store instructions counted by `PM_LD_CMPL` and `PM_ST_CMPL` are vector instructions which can load and store multiple (in this case: two) values at a time. To see how many *bytes* are loaded and stored, we need to measure counters for vectorized loads and stores as well.\n",
+    "\n",
+    "### TASK B\n",
+    "<a name=\"task2-b\"></a>\n",
     "\n",
-    "<a name=\"task2-b\"></a>**TASK B**: Please measure counters for _vectorized_ loads and _vectorized_ stores. See the TODOs in [`poisson2d.vld.c`](/edit/Tasks/poisson2d.vld.c) and [`poisson2d.vst.c`](/edit/Tasks/poisson2d.vst.c) (*Note: These vector counters can not be measured together and need separate files and runs*). Can you find out the name of the counters yourself, using `papi_native_avail | grep VECTOR_`?\n",
+    "Please measure counters for _vectorized_ loads and _vectorized_ stores. See the TODOs in [`poisson2d.vld.c`](poisson2d.vld.c) and [`poisson2d.vst.c`](poisson2d.vst.c) (*Note: These vector counters cannot be measured together and need separate files and runs*). Can you find out the name of the counters yourself, using `papi_native_avail | grep VECTOR_`?\n",
     "\n",
     "Compile, test, and bench-run your program again.\n",
     "\n",
@@ -1687,16 +1859,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "| PM_VECTOR_FLOP_CMPL                                                          |\r\n",
-      "| PM_VECTOR_LD_CMPL                                                            |\r\n",
-      "| PM_VECTOR_ST_CMPL                                                            |\r\n"
+      "| PM_VECTOR_FLOP_CMPL                                                          |\n",
+      "| PM_VECTOR_LD_CMPL                                                            |\n",
+      "| PM_VECTOR_ST_CMPL                                                            |\n"
      ]
     }
    ],
@@ -1713,15 +1885,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "bsub -W 60 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vld.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.vld.bin.csv\n",
-      "Job <4097> is submitted to default queue <batch>.\n",
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vld.bin /gpfs/wolf/trn003/scratch/aherten//poisson2d.vld.bin.csv\n",
+      "Job <24641> is submitted to default queue <batch>.\n",
       "<<Waiting for dispatch ...>>\n",
       "<<Starting on login1>>\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
@@ -1731,9 +1903,9 @@
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,12,0.0012,174000,870,870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,16,0.0013,234000,1170,1170\n",
+      "200,32,16,0.0012,234000,1170,1170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,20,0.0014,294000,1470,1470\n",
+      "200,32,20,0.0013,294000,1470,1470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,24,0.0014,354000,1770,1770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
@@ -1747,11 +1919,11 @@
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,44,0.0017,654000,3270,3270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,48,0.0017,714000,3570,3570\n",
+      "200,32,48,0.0018,714000,3570,3570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,52,0.0018,774000,3870,3870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,56,0.0020,834000,4170,4170\n",
+      "200,32,56,0.0019,834000,4170,4170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,60,0.0020,894000,4470,4470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
@@ -1761,123 +1933,117 @@
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,72,0.0022,1074000,5370,5370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,76,0.0023,1134000,5670,5670\n",
+      "200,32,76,0.0022,1134000,5670,5670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,80,0.0023,1194000,5970,5970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,84,0.0023,1254000,6270,6270\n",
+      "200,32,84,0.0024,1254000,6270,6270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,88,0.0024,1314000,6570,6570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,92,0.0025,1374000,6870,6870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,96,0.0025,1434000,7170,7170\n",
+      "200,32,96,0.0027,1434000,7170,7170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,100,0.0026,1494000,7470,7470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,104,0.0027,1554000,7770,7770\n",
+      "200,32,104,0.0029,1554000,7770,7770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,108,0.0027,1614000,8070,8070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,112,0.0028,1674000,8370,8370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,116,0.0028,1734000,8670,8670\n",
+      "200,32,116,0.0029,1734000,8670,8670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,120,0.0029,1794000,8970,8970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,124,0.0030,1854000,9270,9270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,128,0.0030,1914000,9570,9570\n",
+      "200,32,128,0.0032,1914000,9570,9570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,132,0.0031,1974000,9870,9870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,136,0.0032,2034000,10170,10170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,140,0.0032,2094000,10470,10470\n",
+      "200,32,140,0.0033,2094000,10470,10470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,144,0.0033,2154000,10770,10770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,148,0.0034,2214000,11070,11070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,152,0.0035,2274000,11370,11370\n",
+      "200,32,152,0.0036,2274000,11370,11370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,156,0.0035,2334000,11670,11670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,160,0.0036,2394000,11970,11970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,164,0.0036,2454000,12270,12270\n",
+      "200,32,164,0.0037,2454000,12270,12270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,168,0.0037,2514000,12570,12570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,172,0.0037,2574000,12870,12870\n",
+      "200,32,172,0.0038,2574000,12870,12870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,176,0.0038,2634000,13170,13170\n",
+      "200,32,176,0.0039,2634000,13170,13170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,180,0.0039,2694000,13470,13470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,184,0.0041,2754000,13770,13770\n",
+      "200,32,184,0.0040,2754000,13770,13770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,188,0.0040,2814000,14070,14070\n",
+      "200,32,188,0.0041,2814000,14070,14070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,192,0.0041,2874000,14370,14370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,196,0.0041,2934000,14670,14670\n",
+      "200,32,196,0.0042,2934000,14670,14670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,200,0.0042,2994000,14970,14970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,204,0.0043,3054000,15270,15270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,208,0.0044,3114000,15570,15570\n",
+      "200,32,208,0.0045,3114000,15570,15570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,212,0.0044,3174000,15870,15870\n",
+      "200,32,212,0.0045,3174000,15870,15870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,216,0.0044,3234000,16170,16170\n",
+      "200,32,216,0.0045,3234000,16170,16170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,220,0.0045,3294000,16470,16470\n",
+      "200,32,220,0.0046,3294000,16470,16470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,224,0.0046,3354000,16770,16770\n",
+      "200,32,224,0.0048,3354000,16770,16770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,228,0.0047,3414000,17070,17070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,232,0.0047,3474000,17370,17370\n",
+      "200,32,232,0.0048,3474000,17370,17370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,236,0.0048,3534000,17670,17670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,240,0.0048,3594000,17970,17970\n",
+      "200,32,240,0.0049,3594000,17970,17970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,244,0.0049,3654000,18270,18270\n",
+      "200,32,244,0.0050,3654000,18270,18270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,248,0.0049,3714000,18570,18570\n",
+      "200,32,248,0.0052,3714000,18570,18570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,252,0.0050,3774000,18870,18870\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,252,0.0051,3774000,18870,18870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,256,0.0051,3834000,19170,19170\n",
+      "200,32,256,0.0052,3834000,19170,19170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,260,0.0052,3894000,19470,19470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,264,0.0052,3954000,19770,19770\n",
+      "200,32,264,0.0053,3954000,19770,19770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,268,0.0053,4014000,20070,20070\n",
+      "200,32,268,0.0054,4014000,20070,20070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,272,0.0053,4074000,20370,20370\n",
+      "200,32,272,0.0054,4074000,20370,20370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,276,0.0055,4134000,20670,20670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,280,0.0055,4194000,20970,20970\n",
+      "200,32,280,0.0056,4194000,20970,20970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,284,0.0055,4254000,21270,21270\n",
+      "200,32,284,0.0056,4254000,21270,21270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,288,0.0057,4314000,21570,21570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,292,0.0056,4374000,21870,21870\n",
+      "200,32,292,0.0058,4374000,21870,21870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,296,0.0057,4434000,22170,22170\n",
+      "200,32,296,0.0058,4434000,22170,22170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,300,0.0059,4494000,22470,22470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
@@ -1885,384 +2051,366 @@
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,308,0.0060,4614000,23070,23070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,312,0.0060,4674000,23370,23370\n",
+      "200,32,312,0.0061,4674000,23370,23370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,316,0.0061,4734000,23670,23670\n",
+      "200,32,316,0.0062,4734000,23670,23670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,320,0.0061,4794000,23970,23970\n",
+      "200,32,320,0.0062,4794000,23970,23970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,324,0.0062,4854000,24270,24270\n",
+      "200,32,324,0.0063,4854000,24270,24270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,328,0.0062,4914000,24570,24570\n",
+      "200,32,328,0.0063,4914000,24570,24570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,332,0.0063,4974000,24870,24870\n",
+      "200,32,332,0.0064,4974000,24870,24870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,336,0.0063,5034000,25170,25170\n",
+      "200,32,336,0.0065,5034000,25170,25170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,340,0.0066,5094000,25470,25470\n",
+      "200,32,340,0.0065,5094000,25470,25470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,344,0.0065,5154000,25770,25770\n",
+      "200,32,344,0.0066,5154000,25770,25770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,348,0.0067,5214000,26070,26070\n",
+      "200,32,348,0.0069,5214000,26070,26070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,352,0.0068,5274000,26370,26370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,356,0.0067,5334000,26670,26670\n",
+      "200,32,356,0.0070,5334000,26670,26670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,360,0.0067,5394000,26970,26970\n",
+      "200,32,360,0.0069,5394000,26970,26970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,364,0.0068,5454000,27270,27270\n",
+      "200,32,364,0.0070,5454000,27270,27270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,368,0.0069,5514000,27570,27570\n",
+      "200,32,368,0.0070,5514000,27570,27570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,372,0.0069,5574000,27870,27870\n",
+      "200,32,372,0.0071,5574000,27870,27870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,376,0.0070,5634000,28170,28170\n",
+      "200,32,376,0.0073,5634000,28170,28170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,380,0.0071,5694000,28470,28470\n",
+      "200,32,380,0.0073,5694000,28470,28470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,384,0.0071,5754000,28770,28770\n",
+      "200,32,384,0.0073,5754000,28770,28770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,388,0.0073,5814000,29070,29070\n",
+      "200,32,388,0.0074,5814000,29070,29070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,392,0.0074,5874000,29370,29370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,396,0.0073,5934000,29670,29670\n",
+      "200,32,396,0.0076,5934000,29670,29670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,400,0.0074,5994000,29970,29970\n",
+      "200,32,400,0.0075,5994000,29970,29970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,404,0.0074,6054000,30270,30270\n",
+      "200,32,404,0.0076,6054000,30270,30270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,408,0.0075,6114000,30570,30570\n",
+      "200,32,408,0.0077,6114000,30570,30570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,412,0.0076,6174000,30870,30870\n",
+      "200,32,412,0.0078,6174000,30870,30870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,416,0.0076,6234000,31170,31170\n",
+      "200,32,416,0.0079,6234000,31170,31170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,420,0.0080,6294000,31470,31470\n",
+      "200,32,420,0.0079,6294000,31470,31470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,424,0.0079,6354000,31770,31770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,428,0.0078,6414000,32070,32070\n",
+      "200,32,428,0.0080,6414000,32070,32070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,432,0.0079,6474000,32370,32370\n",
+      "200,32,432,0.0080,6474000,32370,32370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,436,0.0080,6534000,32670,32670\n",
+      "200,32,436,0.0081,6534000,32670,32670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,440,0.0080,6594000,32970,32970\n",
+      "200,32,440,0.0082,6594000,32970,32970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,444,0.0083,6654000,33270,33270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,448,0.0082,6714000,33570,33570\n",
+      "200,32,448,0.0084,6714000,33570,33570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,452,0.0082,6774000,33870,33870\n",
+      "200,32,452,0.0084,6774000,33870,33870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,456,0.0083,6834000,34170,34170\n",
+      "200,32,456,0.0084,6834000,34170,34170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,460,0.0086,6894000,34470,34470\n",
+      "200,32,460,0.0085,6894000,34470,34470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,464,0.0084,6954000,34770,34770\n",
+      "200,32,464,0.0086,6954000,34770,34770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,468,0.0085,7014000,35070,35070\n",
+      "200,32,468,0.0087,7014000,35070,35070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,472,0.0086,7074000,35370,35370\n",
+      "200,32,472,0.0088,7074000,35370,35370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,476,0.0086,7134000,35670,35670\n",
+      "200,32,476,0.0088,7134000,35670,35670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,480,0.0087,7194000,35970,35970\n",
+      "200,32,480,0.0089,7194000,35970,35970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,484,0.0088,7254000,36270,36270\n",
+      "200,32,484,0.0090,7254000,36270,36270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,488,0.0088,7314000,36570,36570\n",
+      "200,32,488,0.0091,7314000,36570,36570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,492,0.0089,7374000,36870,36870\n",
+      "200,32,492,0.0091,7374000,36870,36870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,496,0.0091,7434000,37170,37170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,500,0.0092,7494000,37470,37470\n",
+      "200,32,500,0.0094,7494000,37470,37470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,504,0.0091,7554000,37770,37770\n",
+      "200,32,504,0.0093,7554000,37770,37770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,508,0.0092,7614000,38070,38070\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,508,0.0095,7614000,38070,38070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,512,0.0092,7674000,38370,38370\n",
+      "200,32,512,0.0096,7674000,38370,38370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,516,0.0093,7734000,38670,38670\n",
+      "200,32,516,0.0095,7734000,38670,38670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,520,0.0093,7794000,38970,38970\n",
+      "200,32,520,0.0095,7794000,38970,38970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,524,0.0094,7854000,39270,39270\n",
+      "200,32,524,0.0097,7854000,39270,39270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
       "200,32,528,0.0097,7914000,39570,39570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,532,0.0095,7974000,39870,39870\n",
+      "200,32,532,0.0098,7974000,39870,39870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,536,0.0096,8034000,40170,40170\n",
+      "200,32,536,0.0098,8034000,40170,40170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,540,0.0097,8094000,40470,40470\n",
+      "200,32,540,0.0099,8094000,40470,40470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,544,0.0097,8154000,40770,40770\n",
+      "200,32,544,0.0100,8154000,40770,40770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,548,0.0099,8214000,41070,41070\n",
+      "200,32,548,0.0101,8214000,41070,41070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,552,0.0099,8274000,41370,41370\n",
+      "200,32,552,0.0101,8274000,41370,41370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,556,0.0100,8334000,41670,41670\n",
+      "200,32,556,0.0104,8334000,41670,41670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,560,0.0100,8394000,41970,41970\n",
+      "200,32,560,0.0103,8394000,41970,41970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,564,0.0101,8454000,42270,42270\n",
+      "200,32,564,0.0103,8454000,42270,42270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,568,0.0102,8514000,42570,42570\n",
+      "200,32,568,0.0106,8514000,42570,42570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,572,0.0103,8574000,42870,42870\n",
+      "200,32,572,0.0105,8574000,42870,42870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,576,0.0103,8634000,43170,43170\n",
+      "200,32,576,0.0106,8634000,43170,43170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,580,0.0104,8694000,43470,43470\n",
+      "200,32,580,0.0108,8694000,43470,43470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,584,0.0104,8754000,43770,43770\n",
+      "200,32,584,0.0109,8754000,43770,43770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,588,0.0106,8814000,44070,44070\n",
+      "200,32,588,0.0108,8814000,44070,44070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,592,0.0106,8874000,44370,44370\n",
+      "200,32,592,0.0109,8874000,44370,44370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,596,0.0107,8934000,44670,44670\n",
+      "200,32,596,0.0109,8934000,44670,44670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,600,0.0107,8994000,44970,44970\n",
+      "200,32,600,0.0110,8994000,44970,44970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,604,0.0109,9054000,45270,45270\n",
+      "200,32,604,0.0111,9054000,45270,45270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,608,0.0109,9114000,45570,45570\n",
+      "200,32,608,0.0112,9114000,45570,45570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,612,0.0110,9174000,45870,45870\n",
+      "200,32,612,0.0112,9174000,45870,45870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,616,0.0110,9234000,46170,46170\n",
+      "200,32,616,0.0114,9234000,46170,46170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,620,0.0111,9294000,46470,46470\n",
+      "200,32,620,0.0113,9294000,46470,46470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,624,0.0112,9354000,46770,46770\n",
+      "200,32,624,0.0114,9354000,46770,46770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,628,0.0112,9414000,47070,47070\n",
+      "200,32,628,0.0117,9414000,47070,47070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,632,0.0113,9474000,47370,47370\n",
+      "200,32,632,0.0116,9474000,47370,47370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,636,0.0114,9534000,47670,47670\n",
+      "200,32,636,0.0116,9534000,47670,47670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,640,0.0115,9594000,47970,47970\n",
+      "200,32,640,0.0117,9594000,47970,47970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,644,0.0115,9654000,48270,48270\n",
+      "200,32,644,0.0119,9654000,48270,48270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,648,0.0115,9714000,48570,48570\n",
+      "200,32,648,0.0118,9714000,48570,48570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,652,0.0116,9774000,48870,48870\n",
+      "200,32,652,0.0119,9774000,48870,48870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,656,0.0118,9834000,49170,49170\n",
+      "200,32,656,0.0119,9834000,49170,49170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,660,0.0117,9894000,49470,49470\n",
+      "200,32,660,0.0121,9894000,49470,49470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,664,0.0118,9954000,49770,49770\n",
+      "200,32,664,0.0122,9954000,49770,49770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,668,0.0118,10014000,50070,50070\n",
+      "200,32,668,0.0123,10014000,50070,50070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,672,0.0120,10074000,50370,50370\n",
+      "200,32,672,0.0122,10074000,50370,50370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,676,0.0121,10134000,50670,50670\n",
+      "200,32,676,0.0123,10134000,50670,50670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,680,0.0120,10194000,50970,50970\n",
+      "200,32,680,0.0123,10194000,50970,50970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,684,0.0121,10254000,51270,51270\n",
+      "200,32,684,0.0125,10254000,51270,51270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,688,0.0123,10314000,51570,51570\n",
+      "200,32,688,0.0125,10314000,51570,51570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,692,0.0122,10374000,51870,51870\n",
+      "200,32,692,0.0127,10374000,51870,51870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,696,0.0123,10434000,52170,52170\n",
+      "200,32,696,0.0126,10434000,52170,52170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,700,0.0124,10494000,52470,52470\n",
+      "200,32,700,0.0127,10494000,52470,52470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,704,0.0124,10554000,52770,52770\n",
+      "200,32,704,0.0128,10554000,52770,52770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,708,0.0125,10614000,53070,53070\n",
+      "200,32,708,0.0129,10614000,53070,53070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,712,0.0126,10674000,53370,53370\n",
+      "200,32,712,0.0128,10674000,53370,53370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,716,0.0126,10734000,53670,53670\n",
+      "200,32,716,0.0131,10734000,53670,53670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,720,0.0126,10794000,53970,53970\n",
+      "200,32,720,0.0130,10794000,53970,53970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,724,0.0128,10854000,54270,54270\n",
+      "200,32,724,0.0130,10854000,54270,54270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,728,0.0128,10914000,54570,54570\n",
+      "200,32,728,0.0132,10914000,54570,54570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,732,0.0129,10974000,54870,54870\n",
+      "200,32,732,0.0133,10974000,54870,54870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,736,0.0130,11034000,55170,55170\n",
+      "200,32,736,0.0135,11034000,55170,55170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,740,0.0130,11094000,55470,55470\n",
+      "200,32,740,0.0135,11094000,55470,55470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,744,0.0130,11154000,55770,55770\n",
+      "200,32,744,0.0135,11154000,55770,55770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,748,0.0131,11214000,56070,56070\n",
+      "200,32,748,0.0134,11214000,56070,56070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,752,0.0132,11274000,56370,56370\n",
+      "200,32,752,0.0135,11274000,56370,56370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,756,0.0133,11334000,56670,56670\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,756,0.0136,11334000,56670,56670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,760,0.0134,11394000,56970,56970\n",
+      "200,32,760,0.0137,11394000,56970,56970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,764,0.0134,11454000,57270,57270\n",
+      "200,32,764,0.0137,11454000,57270,57270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,768,0.0135,11514000,57570,57570\n",
+      "200,32,768,0.0138,11514000,57570,57570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,772,0.0135,11574000,57870,57870\n",
+      "200,32,772,0.0139,11574000,57870,57870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,776,0.0136,11634000,58170,58170\n",
+      "200,32,776,0.0141,11634000,58170,58170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,780,0.0138,11694000,58470,58470\n",
+      "200,32,780,0.0140,11694000,58470,58470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,784,0.0138,11754000,58770,58770\n",
+      "200,32,784,0.0142,11754000,58770,58770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,788,0.0139,11814000,59070,59070\n",
+      "200,32,788,0.0141,11814000,59070,59070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,792,0.0139,11874000,59370,59370\n",
+      "200,32,792,0.0142,11874000,59370,59370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,796,0.0141,11934000,59670,59670\n",
+      "200,32,796,0.0143,11934000,59670,59670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,800,0.0140,11994000,59970,59970\n",
+      "200,32,800,0.0143,11994000,59970,59970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,804,0.0141,12054000,60270,60270\n",
+      "200,32,804,0.0145,12054000,60270,60270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,808,0.0142,12114000,60570,60570\n",
+      "200,32,808,0.0145,12114000,60570,60570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,812,0.0143,12174000,60870,60870\n",
+      "200,32,812,0.0145,12174000,60870,60870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,816,0.0143,12234000,61170,61170\n",
+      "200,32,816,0.0148,12234000,61170,61170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,820,0.0143,12294000,61470,61470\n",
+      "200,32,820,0.0148,12294000,61470,61470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,824,0.0144,12354000,61770,61770\n",
+      "200,32,824,0.0148,12354000,61770,61770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,828,0.0145,12414000,62070,62070\n",
+      "200,32,828,0.0148,12414000,62070,62070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,832,0.0145,12474000,62370,62370\n",
+      "200,32,832,0.0149,12474000,62370,62370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,836,0.0146,12534000,62670,62670\n",
+      "200,32,836,0.0150,12534000,62670,62670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,840,0.0146,12594000,62970,62970\n",
+      "200,32,840,0.0150,12594000,62970,62970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,844,0.0147,12654000,63270,63270\n",
+      "200,32,844,0.0151,12654000,63270,63270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,848,0.0148,12714000,63570,63570\n",
+      "200,32,848,0.0153,12714000,63570,63570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,852,0.0149,12774000,63870,63870\n",
+      "200,32,852,0.0153,12774000,63870,63870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,856,0.0150,12834000,64170,64170\n",
+      "200,32,856,0.0153,12834000,64170,64170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,860,0.0150,12894000,64470,64470\n",
+      "200,32,860,0.0154,12894000,64470,64470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,864,0.0151,12954000,64770,64770\n",
+      "200,32,864,0.0154,12954000,64770,64770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,868,0.0152,13014000,65070,65070\n",
+      "200,32,868,0.0155,13014000,65070,65070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,872,0.0151,13074000,65370,65370\n",
+      "200,32,872,0.0157,13074000,65370,65370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,876,0.0152,13134000,65670,65670\n",
+      "200,32,876,0.0156,13134000,65670,65670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,880,0.0154,13194000,65970,65970\n",
+      "200,32,880,0.0157,13194000,65970,65970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,884,0.0154,13254000,66270,66270\n",
+      "200,32,884,0.0157,13254000,66270,66270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,888,0.0154,13314000,66570,66570\n",
+      "200,32,888,0.0158,13314000,66570,66570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,892,0.0155,13374000,66870,66870\n",
+      "200,32,892,0.0159,13374000,66870,66870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,896,0.0156,13434000,67170,67170\n",
+      "200,32,896,0.0160,13434000,67170,67170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,900,0.0158,13494000,67470,67470\n",
+      "200,32,900,0.0160,13494000,67470,67470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,904,0.0158,13554000,67770,67770\n",
+      "200,32,904,0.0162,13554000,67770,67770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,908,0.0159,13614000,68070,68070\n",
+      "200,32,908,0.0162,13614000,68070,68070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,912,0.0161,13674000,68370,68370\n",
+      "200,32,912,0.0163,13674000,68370,68370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,916,0.0162,13734000,68670,68670\n",
+      "200,32,916,0.0163,13734000,68670,68670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,920,0.0162,13794000,68970,68970\n",
+      "200,32,920,0.0164,13794000,68970,68970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,924,0.0163,13854000,69270,69270\n",
+      "200,32,924,0.0165,13854000,69270,69270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,928,0.0162,13914000,69570,69570\n",
+      "200,32,928,0.0166,13914000,69570,69570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,932,0.0164,13974000,69870,69870\n",
+      "200,32,932,0.0166,13974000,69870,69870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,936,0.0163,14034000,70170,70170\n",
+      "200,32,936,0.0167,14034000,70170,70170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,940,0.0164,14094000,70470,70470\n",
+      "200,32,940,0.0167,14094000,70470,70470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,944,0.0165,14154000,70770,70770\n",
+      "200,32,944,0.0168,14154000,70770,70770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,948,0.0166,14214000,71070,71070\n",
+      "200,32,948,0.0170,14214000,71070,71070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,952,0.0166,14274000,71370,71370\n",
+      "200,32,952,0.0171,14274000,71370,71370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,956,0.0170,14334000,71670,71670\n",
+      "200,32,956,0.0171,14334000,71670,71670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,960,0.0168,14394000,71970,71970\n",
+      "200,32,960,0.0171,14394000,71970,71970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,964,0.0174,14454000,72270,72270\n",
+      "200,32,964,0.0175,14454000,72270,72270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,968,0.0172,14514000,72570,72570\n",
+      "200,32,968,0.0176,14514000,72570,72570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,972,0.0173,14574000,72870,72870\n",
+      "200,32,972,0.0176,14574000,72870,72870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,976,0.0173,14634000,73170,73170\n",
+      "200,32,976,0.0175,14634000,73170,73170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,980,0.0175,14694000,73470,73470\n",
+      "200,32,980,0.0178,14694000,73470,73470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,984,0.0175,14754000,73770,73770\n",
+      "200,32,984,0.0180,14754000,73770,73770\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,988,0.0176,14814000,74070,74070\n",
+      "200,32,988,0.0178,14814000,74070,74070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,992,0.0176,14874000,74370,74370\n",
+      "200,32,992,0.0179,14874000,74370,74370\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,996,0.0178,14934000,74670,74670\n",
+      "200,32,996,0.0181,14934000,74670,74670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,1000,0.0179,14994000,74970,74970\n",
+      "200,32,1000,0.0180,14994000,74970,74970\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,1004,0.0178,15054000,75270,75270\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,1004,0.0182,15054000,75270,75270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,1008,0.0179,15114000,75570,75570\n",
+      "200,32,1008,0.0181,15114000,75570,75570\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,1012,0.0179,15174000,75870,75870\n",
+      "200,32,1012,0.0183,15174000,75870,75870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,1016,0.0181,15234000,76170,76170\n",
+      "200,32,1016,0.0183,15234000,76170,76170\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,1020,0.0181,15294000,76470,76470\n",
+      "200,32,1020,0.0186,15294000,76470,76470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,1024,0.0179,15354000,76770,76770\n",
-      "mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vld.bin.csv .\n",
-      "bsub -W 60 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vst.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.vst.bin.csv\n",
-      "Job <4098> is submitted to default queue <batch>.\n",
+      "200,32,1024,0.0182,15354000,76770,76770\n",
+      "mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.vld.bin.csv .\n",
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vst.bin /gpfs/wolf/trn003/scratch/aherten//poisson2d.vst.bin.csv\n",
+      "Job <24642> is submitted to default queue <batch>.\n",
       "<<Waiting for dispatch ...>>\n",
       "<<Starting on login1>>\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
@@ -2276,11 +2424,11 @@
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,20,0.0013,54200,271,271\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,24,0.0014,66200,331,331\n",
+      "200,32,24,0.0013,66200,331,331\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,28,0.0014,78200,391,391\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,32,0.0016,90200,451,451\n",
+      "200,32,32,0.0015,90200,451,451\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,36,0.0015,102200,511,511\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
@@ -2296,115 +2444,109 @@
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,60,0.0020,174200,871,871\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,64,0.0022,186200,931,931\n",
+      "200,32,64,0.0020,186200,931,931\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,68,0.0022,198200,991,991\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,72,0.0021,210200,1051,1051\n",
+      "200,32,72,0.0023,210200,1051,1051\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,76,0.0023,222200,1111,1111\n",
+      "200,32,76,0.0022,222200,1111,1111\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,80,0.0023,234200,1171,1171\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,84,0.0023,246200,1231,1231\n",
+      "200,32,84,0.0024,246200,1231,1231\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,88,0.0024,258200,1291,1291\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,92,0.0025,270200,1351,1351\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,96,0.0027,282200,1411,1411\n",
+      "200,32,96,0.0025,282200,1411,1411\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,100,0.0026,294200,1471,1471\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,104,0.0027,306200,1531,1531\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,108,0.0027,318200,1591,1591\n",
+      "200,32,108,0.0028,318200,1591,1591\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,112,0.0028,330200,1651,1651\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,116,0.0028,342200,1711,1711\n",
+      "200,32,116,0.0029,342200,1711,1711\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,120,0.0030,354200,1771,1771\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,124,0.0030,366200,1831,1831\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,128,0.0030,378200,1891,1891\n",
+      "200,32,128,0.0031,378200,1891,1891\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,132,0.0032,390200,1951,1951\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,136,0.0032,402200,2011,2011\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,140,0.0032,414200,2071,2071\n",
+      "200,32,140,0.0033,414200,2071,2071\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,144,0.0033,426200,2131,2131\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,148,0.0033,438200,2191,2191\n",
+      "200,32,148,0.0035,438200,2191,2191\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,152,0.0034,450200,2251,2251\n",
+      "200,32,152,0.0035,450200,2251,2251\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,156,0.0035,462200,2311,2311\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,160,0.0036,474200,2371,2371\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,164,0.0036,486200,2431,2431\n",
+      "200,32,164,0.0038,486200,2431,2431\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,168,0.0037,498200,2491,2491\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,172,0.0037,510200,2551,2551\n",
+      "200,32,172,0.0038,510200,2551,2551\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,176,0.0039,522200,2611,2611\n",
+      "200,32,176,0.0038,522200,2611,2611\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,180,0.0039,534200,2671,2671\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,184,0.0039,546200,2731,2731\n",
+      "200,32,184,0.0040,546200,2731,2731\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,188,0.0040,558200,2791,2791\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,192,0.0040,570200,2851,2851\n",
+      "200,32,192,0.0041,570200,2851,2851\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,196,0.0041,582200,2911,2911\n",
+      "200,32,196,0.0042,582200,2911,2911\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,200,0.0042,594200,2971,2971\n",
+      "200,32,200,0.0044,594200,2971,2971\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,204,0.0042,606200,3031,3031\n",
+      "200,32,204,0.0043,606200,3031,3031\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,208,0.0043,618200,3091,3091\n",
+      "200,32,208,0.0044,618200,3091,3091\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,212,0.0044,630200,3151,3151\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,216,0.0044,642200,3211,3211\n",
+      "200,32,216,0.0045,642200,3211,3211\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,220,0.0046,654200,3271,3271\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,224,0.0046,666200,3331,3331\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,228,0.0046,678200,3391,3391\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,228,0.0047,678200,3391,3391\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,232,0.0047,690200,3451,3451\n",
+      "200,32,232,0.0048,690200,3451,3451\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,236,0.0047,702200,3511,3511\n",
+      "200,32,236,0.0048,702200,3511,3511\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,240,0.0048,714200,3571,3571\n",
+      "200,32,240,0.0049,714200,3571,3571\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,244,0.0049,726200,3631,3631\n",
+      "200,32,244,0.0050,726200,3631,3631\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,248,0.0049,738200,3691,3691\n",
+      "200,32,248,0.0050,738200,3691,3691\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,252,0.0050,750200,3751,3751\n",
+      "200,32,252,0.0051,750200,3751,3751\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,256,0.0051,762200,3811,3811\n",
+      "200,32,256,0.0052,762200,3811,3811\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,260,0.0051,774200,3871,3871\n",
+      "200,32,260,0.0052,774200,3871,3871\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,264,0.0053,786200,3931,3931\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,268,0.0053,798200,3991,3991\n",
+      "200,32,268,0.0054,798200,3991,3991\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,272,0.0054,810200,4051,4051\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
@@ -2412,396 +2554,378 @@
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,280,0.0055,834200,4171,4171\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,284,0.0055,846200,4231,4231\n",
+      "200,32,284,0.0056,846200,4231,4231\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,288,0.0056,858200,4291,4291\n",
+      "200,32,288,0.0057,858200,4291,4291\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,292,0.0057,870200,4351,4351\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,296,0.0057,882200,4411,4411\n",
+      "200,32,296,0.0058,882200,4411,4411\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,300,0.0058,894200,4471,4471\n",
+      "200,32,300,0.0059,894200,4471,4471\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,304,0.0058,906200,4531,4531\n",
+      "200,32,304,0.0059,906200,4531,4531\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,308,0.0059,918200,4591,4591\n",
+      "200,32,308,0.0060,918200,4591,4591\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,312,0.0060,930200,4651,4651\n",
+      "200,32,312,0.0061,930200,4651,4651\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,316,0.0060,942200,4711,4711\n",
+      "200,32,316,0.0061,942200,4711,4711\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,320,0.0061,954200,4771,4771\n",
+      "200,32,320,0.0062,954200,4771,4771\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,324,0.0061,966200,4831,4831\n",
+      "200,32,324,0.0063,966200,4831,4831\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,328,0.0062,978200,4891,4891\n",
+      "200,32,328,0.0063,978200,4891,4891\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,332,0.0063,990200,4951,4951\n",
+      "200,32,332,0.0064,990200,4951,4951\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,336,0.0063,1002200,5011,5011\n",
+      "200,32,336,0.0065,1002200,5011,5011\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,340,0.0064,1014200,5071,5071\n",
+      "200,32,340,0.0066,1014200,5071,5071\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,344,0.0065,1026200,5131,5131\n",
+      "200,32,344,0.0066,1026200,5131,5131\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,348,0.0066,1038200,5191,5191\n",
+      "200,32,348,0.0067,1038200,5191,5191\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,352,0.0066,1050200,5251,5251\n",
+      "200,32,352,0.0069,1050200,5251,5251\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,356,0.0067,1062200,5311,5311\n",
+      "200,32,356,0.0068,1062200,5311,5311\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,360,0.0067,1074200,5371,5371\n",
+      "200,32,360,0.0068,1074200,5371,5371\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,364,0.0068,1086200,5431,5431\n",
+      "200,32,364,0.0069,1086200,5431,5431\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,368,0.0068,1098200,5491,5491\n",
+      "200,32,368,0.0070,1098200,5491,5491\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,372,0.0069,1110200,5551,5551\n",
+      "200,32,372,0.0071,1110200,5551,5551\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,376,0.0070,1122200,5611,5611\n",
+      "200,32,376,0.0071,1122200,5611,5611\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,380,0.0071,1134200,5671,5671\n",
+      "200,32,380,0.0072,1134200,5671,5671\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,384,0.0072,1146200,5731,5731\n",
+      "200,32,384,0.0073,1146200,5731,5731\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,388,0.0072,1158200,5791,5791\n",
+      "200,32,388,0.0073,1158200,5791,5791\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,392,0.0072,1170200,5851,5851\n",
+      "200,32,392,0.0074,1170200,5851,5851\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,396,0.0073,1182200,5911,5911\n",
+      "200,32,396,0.0075,1182200,5911,5911\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,400,0.0074,1194200,5971,5971\n",
+      "200,32,400,0.0075,1194200,5971,5971\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,404,0.0074,1206200,6031,6031\n",
+      "200,32,404,0.0076,1206200,6031,6031\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,408,0.0076,1218200,6091,6091\n",
+      "200,32,408,0.0077,1218200,6091,6091\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,412,0.0076,1230200,6151,6151\n",
+      "200,32,412,0.0077,1230200,6151,6151\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,416,0.0077,1242200,6211,6211\n",
+      "200,32,416,0.0080,1242200,6211,6211\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,420,0.0077,1254200,6271,6271\n",
+      "200,32,420,0.0078,1254200,6271,6271\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,424,0.0078,1266200,6331,6331\n",
+      "200,32,424,0.0079,1266200,6331,6331\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,428,0.0078,1278200,6391,6391\n",
+      "200,32,428,0.0080,1278200,6391,6391\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,432,0.0080,1290200,6451,6451\n",
+      "200,32,432,0.0081,1290200,6451,6451\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,436,0.0079,1302200,6511,6511\n",
+      "200,32,436,0.0082,1302200,6511,6511\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,440,0.0081,1314200,6571,6571\n",
+      "200,32,440,0.0082,1314200,6571,6571\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,444,0.0081,1326200,6631,6631\n",
+      "200,32,444,0.0083,1326200,6631,6631\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,448,0.0082,1338200,6691,6691\n",
+      "200,32,448,0.0083,1338200,6691,6691\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,452,0.0082,1350200,6751,6751\n",
+      "200,32,452,0.0084,1350200,6751,6751\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,456,0.0084,1362200,6811,6811\n",
+      "200,32,456,0.0085,1362200,6811,6811\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,460,0.0084,1374200,6871,6871\n",
+      "200,32,460,0.0085,1374200,6871,6871\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,464,0.0084,1386200,6931,6931\n",
+      "200,32,464,0.0087,1386200,6931,6931\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,468,0.0085,1398200,6991,6991\n",
+      "200,32,468,0.0086,1398200,6991,6991\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,472,0.0085,1410200,7051,7051\n",
+      "200,32,472,0.0087,1410200,7051,7051\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,476,0.0086,1422200,7111,7111\n",
+      "200,32,476,0.0088,1422200,7111,7111\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,480,0.0087,1434200,7171,7171\n",
+      "200,32,480,0.0090,1434200,7171,7171\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,484,0.0088,1446200,7231,7231\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,484,0.0089,1446200,7231,7231\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,488,0.0088,1458200,7291,7291\n",
+      "200,32,488,0.0090,1458200,7291,7291\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,492,0.0089,1470200,7351,7351\n",
+      "200,32,492,0.0092,1470200,7351,7351\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,496,0.0089,1482200,7411,7411\n",
+      "200,32,496,0.0092,1482200,7411,7411\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,500,0.0090,1494200,7471,7471\n",
+      "200,32,500,0.0092,1494200,7471,7471\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,504,0.0092,1506200,7531,7531\n",
+      "200,32,504,0.0093,1506200,7531,7531\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,508,0.0093,1518200,7591,7591\n",
+      "200,32,508,0.0094,1518200,7591,7591\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,512,0.0092,1530200,7651,7651\n",
+      "200,32,512,0.0095,1530200,7651,7651\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,516,0.0093,1542200,7711,7711\n",
+      "200,32,516,0.0096,1542200,7711,7711\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,520,0.0094,1554200,7771,7771\n",
+      "200,32,520,0.0096,1554200,7771,7771\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,524,0.0094,1566200,7831,7831\n",
+      "200,32,524,0.0096,1566200,7831,7831\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,528,0.0094,1578200,7891,7891\n",
+      "200,32,528,0.0097,1578200,7891,7891\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
       "200,32,532,0.0097,1590200,7951,7951\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,536,0.0096,1602200,8011,8011\n",
+      "200,32,536,0.0098,1602200,8011,8011\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,540,0.0097,1614200,8071,8071\n",
+      "200,32,540,0.0100,1614200,8071,8071\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,544,0.0097,1626200,8131,8131\n",
+      "200,32,544,0.0099,1626200,8131,8131\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,548,0.0099,1638200,8191,8191\n",
+      "200,32,548,0.0100,1638200,8191,8191\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,552,0.0099,1650200,8251,8251\n",
+      "200,32,552,0.0101,1650200,8251,8251\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,556,0.0101,1662200,8311,8311\n",
+      "200,32,556,0.0102,1662200,8311,8311\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,560,0.0100,1674200,8371,8371\n",
+      "200,32,560,0.0102,1674200,8371,8371\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,564,0.0101,1686200,8431,8431\n",
+      "200,32,564,0.0105,1686200,8431,8431\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,568,0.0102,1698200,8491,8491\n",
+      "200,32,568,0.0104,1698200,8491,8491\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,572,0.0103,1710200,8551,8551\n",
+      "200,32,572,0.0105,1710200,8551,8551\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,576,0.0103,1722200,8611,8611\n",
+      "200,32,576,0.0105,1722200,8611,8611\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,580,0.0104,1734200,8671,8671\n",
+      "200,32,580,0.0108,1734200,8671,8671\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,584,0.0104,1746200,8731,8731\n",
+      "200,32,584,0.0108,1746200,8731,8731\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,588,0.0105,1758200,8791,8791\n",
+      "200,32,588,0.0109,1758200,8791,8791\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,592,0.0107,1770200,8851,8851\n",
+      "200,32,592,0.0109,1770200,8851,8851\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,596,0.0108,1782200,8911,8911\n",
+      "200,32,596,0.0109,1782200,8911,8911\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,600,0.0107,1794200,8971,8971\n",
+      "200,32,600,0.0111,1794200,8971,8971\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,604,0.0109,1806200,9031,9031\n",
+      "200,32,604,0.0111,1806200,9031,9031\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,608,0.0109,1818200,9091,9091\n",
+      "200,32,608,0.0112,1818200,9091,9091\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,612,0.0109,1830200,9151,9151\n",
+      "200,32,612,0.0112,1830200,9151,9151\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,616,0.0110,1842200,9211,9211\n",
+      "200,32,616,0.0114,1842200,9211,9211\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,620,0.0111,1854200,9271,9271\n",
+      "200,32,620,0.0113,1854200,9271,9271\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,624,0.0112,1866200,9331,9331\n",
+      "200,32,624,0.0114,1866200,9331,9331\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,628,0.0111,1878200,9391,9391\n",
+      "200,32,628,0.0114,1878200,9391,9391\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,632,0.0112,1890200,9451,9451\n",
+      "200,32,632,0.0116,1890200,9451,9451\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,636,0.0113,1902200,9511,9511\n",
+      "200,32,636,0.0116,1902200,9511,9511\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,640,0.0116,1914200,9571,9571\n",
+      "200,32,640,0.0117,1914200,9571,9571\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,644,0.0114,1926200,9631,9631\n",
+      "200,32,644,0.0118,1926200,9631,9631\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,648,0.0115,1938200,9691,9691\n",
+      "200,32,648,0.0118,1938200,9691,9691\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,652,0.0117,1950200,9751,9751\n",
+      "200,32,652,0.0121,1950200,9751,9751\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,656,0.0117,1962200,9811,9811\n",
+      "200,32,656,0.0121,1962200,9811,9811\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,660,0.0117,1974200,9871,9871\n",
+      "200,32,660,0.0121,1974200,9871,9871\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,664,0.0118,1986200,9931,9931\n",
+      "200,32,664,0.0121,1986200,9931,9931\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,668,0.0119,1998200,9991,9991\n",
+      "200,32,668,0.0122,1998200,9991,9991\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,672,0.0120,2010200,10051,10051\n",
+      "200,32,672,0.0122,2010200,10051,10051\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,676,0.0120,2022200,10111,10111\n",
+      "200,32,676,0.0124,2022200,10111,10111\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,680,0.0120,2034200,10171,10171\n",
+      "200,32,680,0.0123,2034200,10171,10171\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,684,0.0121,2046200,10231,10231\n",
+      "200,32,684,0.0124,2046200,10231,10231\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,688,0.0122,2058200,10291,10291\n",
+      "200,32,688,0.0126,2058200,10291,10291\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,692,0.0123,2070200,10351,10351\n",
+      "200,32,692,0.0127,2070200,10351,10351\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,696,0.0124,2082200,10411,10411\n",
+      "200,32,696,0.0126,2082200,10411,10411\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,700,0.0124,2094200,10471,10471\n",
+      "200,32,700,0.0128,2094200,10471,10471\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,704,0.0125,2106200,10531,10531\n",
+      "200,32,704,0.0127,2106200,10531,10531\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,708,0.0125,2118200,10591,10591\n",
+      "200,32,708,0.0128,2118200,10591,10591\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,712,0.0125,2130200,10651,10651\n",
+      "200,32,712,0.0129,2130200,10651,10651\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,716,0.0125,2142200,10711,10711\n",
+      "200,32,716,0.0130,2142200,10711,10711\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,720,0.0126,2154200,10771,10771\n",
+      "200,32,720,0.0130,2154200,10771,10771\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,724,0.0127,2166200,10831,10831\n",
+      "200,32,724,0.0131,2166200,10831,10831\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,728,0.0128,2178200,10891,10891\n",
+      "200,32,728,0.0131,2178200,10891,10891\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,732,0.0128,2190200,10951,10951\n",
+      "200,32,732,0.0132,2190200,10951,10951\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,736,0.0130,2202200,11011,11011\n",
+      "200,32,736,0.0134,2202200,11011,11011\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,740,0.0130,2214200,11071,11071\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,740,0.0134,2214200,11071,11071\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,744,0.0130,2226200,11131,11131\n",
+      "200,32,744,0.0134,2226200,11131,11131\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,748,0.0131,2238200,11191,11191\n",
+      "200,32,748,0.0135,2238200,11191,11191\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,752,0.0133,2250200,11251,11251\n",
+      "200,32,752,0.0136,2250200,11251,11251\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,756,0.0133,2262200,11311,11311\n",
+      "200,32,756,0.0136,2262200,11311,11311\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,760,0.0133,2274200,11371,11371\n",
+      "200,32,760,0.0137,2274200,11371,11371\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,764,0.0134,2286200,11431,11431\n",
+      "200,32,764,0.0138,2286200,11431,11431\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,768,0.0135,2298200,11491,11491\n",
+      "200,32,768,0.0138,2298200,11491,11491\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,772,0.0137,2310200,11551,11551\n",
+      "200,32,772,0.0139,2310200,11551,11551\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,776,0.0136,2322200,11611,11611\n",
+      "200,32,776,0.0139,2322200,11611,11611\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,780,0.0137,2334200,11671,11671\n",
+      "200,32,780,0.0140,2334200,11671,11671\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,784,0.0137,2346200,11731,11731\n",
+      "200,32,784,0.0141,2346200,11731,11731\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,788,0.0138,2358200,11791,11791\n",
+      "200,32,788,0.0142,2358200,11791,11791\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,792,0.0139,2370200,11851,11851\n",
+      "200,32,792,0.0142,2370200,11851,11851\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,796,0.0140,2382200,11911,11911\n",
+      "200,32,796,0.0144,2382200,11911,11911\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,800,0.0140,2394200,11971,11971\n",
+      "200,32,800,0.0144,2394200,11971,11971\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,804,0.0141,2406200,12031,12031\n",
+      "200,32,804,0.0144,2406200,12031,12031\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,808,0.0143,2418200,12091,12091\n",
+      "200,32,808,0.0146,2418200,12091,12091\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,812,0.0142,2430200,12151,12151\n",
+      "200,32,812,0.0146,2430200,12151,12151\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,816,0.0143,2442200,12211,12211\n",
+      "200,32,816,0.0146,2442200,12211,12211\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,820,0.0144,2454200,12271,12271\n",
+      "200,32,820,0.0147,2454200,12271,12271\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,824,0.0144,2466200,12331,12331\n",
+      "200,32,824,0.0148,2466200,12331,12331\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,828,0.0145,2478200,12391,12391\n",
+      "200,32,828,0.0149,2478200,12391,12391\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,832,0.0146,2490200,12451,12451\n",
+      "200,32,832,0.0149,2490200,12451,12451\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,836,0.0146,2502200,12511,12511\n",
+      "200,32,836,0.0150,2502200,12511,12511\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,840,0.0147,2514200,12571,12571\n",
+      "200,32,840,0.0151,2514200,12571,12571\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,844,0.0148,2526200,12631,12631\n",
+      "200,32,844,0.0152,2526200,12631,12631\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,848,0.0149,2538200,12691,12691\n",
+      "200,32,848,0.0151,2538200,12691,12691\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,852,0.0149,2550200,12751,12751\n",
+      "200,32,852,0.0152,2550200,12751,12751\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,856,0.0150,2562200,12811,12811\n",
+      "200,32,856,0.0153,2562200,12811,12811\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,860,0.0152,2574200,12871,12871\n",
+      "200,32,860,0.0154,2574200,12871,12871\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,864,0.0151,2586200,12931,12931\n",
+      "200,32,864,0.0155,2586200,12931,12931\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,868,0.0151,2598200,12991,12991\n",
+      "200,32,868,0.0155,2598200,12991,12991\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,872,0.0151,2610200,13051,13051\n",
+      "200,32,872,0.0156,2610200,13051,13051\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,876,0.0152,2622200,13111,13111\n",
+      "200,32,876,0.0156,2622200,13111,13111\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,880,0.0155,2634200,13171,13171\n",
+      "200,32,880,0.0157,2634200,13171,13171\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,884,0.0154,2646200,13231,13231\n",
+      "200,32,884,0.0158,2646200,13231,13231\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,888,0.0155,2658200,13291,13291\n",
+      "200,32,888,0.0159,2658200,13291,13291\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,892,0.0155,2670200,13351,13351\n",
+      "200,32,892,0.0159,2670200,13351,13351\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,896,0.0156,2682200,13411,13411\n",
+      "200,32,896,0.0160,2682200,13411,13411\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,900,0.0157,2694200,13471,13471\n",
+      "200,32,900,0.0160,2694200,13471,13471\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,904,0.0159,2706200,13531,13531\n",
+      "200,32,904,0.0162,2706200,13531,13531\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,908,0.0160,2718200,13591,13591\n",
+      "200,32,908,0.0162,2718200,13591,13591\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,912,0.0161,2730200,13651,13651\n",
+      "200,32,912,0.0163,2730200,13651,13651\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,916,0.0162,2742200,13711,13711\n",
+      "200,32,916,0.0163,2742200,13711,13711\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,920,0.0161,2754200,13771,13771\n",
+      "200,32,920,0.0164,2754200,13771,13771\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,924,0.0162,2766200,13831,13831\n",
+      "200,32,924,0.0165,2766200,13831,13831\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,928,0.0163,2778200,13891,13891\n",
+      "200,32,928,0.0166,2778200,13891,13891\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,932,0.0165,2790200,13951,13951\n",
+      "200,32,932,0.0168,2790200,13951,13951\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,936,0.0165,2802200,14011,14011\n",
+      "200,32,936,0.0167,2802200,14011,14011\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,940,0.0165,2814200,14071,14071\n",
+      "200,32,940,0.0169,2814200,14071,14071\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,944,0.0166,2826200,14131,14131\n",
+      "200,32,944,0.0169,2826200,14131,14131\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,948,0.0166,2838200,14191,14191\n",
+      "200,32,948,0.0169,2838200,14191,14191\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,952,0.0168,2850200,14251,14251\n",
+      "200,32,952,0.0170,2850200,14251,14251\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,956,0.0167,2862200,14311,14311\n",
+      "200,32,956,0.0170,2862200,14311,14311\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,960,0.0168,2874200,14371,14371\n",
+      "200,32,960,0.0171,2874200,14371,14371\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,964,0.0173,2886200,14431,14431\n",
+      "200,32,964,0.0175,2886200,14431,14431\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,968,0.0172,2898200,14491,14491\n",
+      "200,32,968,0.0175,2898200,14491,14491\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,972,0.0172,2910200,14551,14551\n",
+      "200,32,972,0.0176,2910200,14551,14551\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,976,0.0173,2922200,14611,14611\n",
+      "200,32,976,0.0176,2922200,14611,14611\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,980,0.0175,2934200,14671,14671\n",
+      "200,32,980,0.0178,2934200,14671,14671\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,984,0.0176,2946200,14731,14731\n",
+      "200,32,984,0.0178,2946200,14731,14731\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,988,0.0176,2958200,14791,14791\n",
+      "200,32,988,0.0179,2958200,14791,14791\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,992,0.0177,2970200,14851,14851\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,992,0.0178,2970200,14851,14851\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,996,0.0178,2982200,14911,14911\n",
+      "200,32,996,0.0181,2982200,14911,14911\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,1000,0.0177,2994200,14971,14971\n",
+      "200,32,1000,0.0180,2994200,14971,14971\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,1004,0.0179,3006200,15031,15031\n",
+      "200,32,1004,0.0181,3006200,15031,15031\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,1008,0.0179,3018200,15091,15091\n",
+      "200,32,1008,0.0182,3018200,15091,15091\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,1012,0.0180,3030200,15151,15151\n",
+      "200,32,1012,0.0183,3030200,15151,15151\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,1016,0.0180,3042200,15211,15211\n",
+      "200,32,1016,0.0183,3042200,15211,15211\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,1020,0.0182,3054200,15271,15271\n",
+      "200,32,1020,0.0184,3054200,15271,15271\n",
       "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,1024,0.0178,3066200,15331,15331\n",
-      "mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vst.bin.csv .\n"
+      "200,32,1024,0.0182,3066200,15331,15331\n",
+      "mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.vst.bin.csv .\n"
      ]
     }
    ],
@@ -2815,12 +2939,12 @@
    "source": [
     "Let's plot it again, as soon as the run finishes! Non-interactively, call `graph_task2b`.\n",
     "\n",
-    "*We need to read in two CSV files now, which we combine to one common dataframe `df_vldvst`.*"
+    "*Because we couldn't measure the two vector counters at the same time, we now have two CSV files to read in. In the following, we combine them into one common dataframe `df_vldvst`.*"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2831,7 +2955,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [
     {
@@ -2865,8 +2989,7 @@
        "      <th>PM_VECTOR_ST_CMPL (total)</th>\n",
        "      <th>PM_VECTOR_ST_CMPL (min)</th>\n",
        "      <th>PM_VECTOR_ST_CMPL (max)</th>\n",
-       "      <th>Vector Loads / Loop Iteration</th>\n",
-       "      <th>Vector Stores / Loop Iteration</th>\n",
+       "      <th>Grid Points</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
@@ -2882,8 +3005,7 @@
        "      <td>200</td>\n",
        "      <td>1</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.007812</td>\n",
+       "      <td>128</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
@@ -2897,8 +3019,7 @@
        "      <td>18200</td>\n",
        "      <td>91</td>\n",
        "      <td>91</td>\n",
-       "      <td>2.226562</td>\n",
-       "      <td>0.355469</td>\n",
+       "      <td>256</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
@@ -2912,38 +3033,35 @@
        "      <td>30200</td>\n",
        "      <td>151</td>\n",
        "      <td>151</td>\n",
-       "      <td>2.265625</td>\n",
-       "      <td>0.393229</td>\n",
+       "      <td>384</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>16</td>\n",
        "      <td>200</td>\n",
        "      <td>32</td>\n",
-       "      <td>0.0013</td>\n",
+       "      <td>0.0012</td>\n",
        "      <td>234000</td>\n",
        "      <td>1170</td>\n",
        "      <td>1170</td>\n",
        "      <td>42200</td>\n",
        "      <td>211</td>\n",
        "      <td>211</td>\n",
-       "      <td>2.285156</td>\n",
-       "      <td>0.412109</td>\n",
+       "      <td>512</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>20</td>\n",
        "      <td>200</td>\n",
        "      <td>32</td>\n",
-       "      <td>0.0014</td>\n",
+       "      <td>0.0013</td>\n",
        "      <td>294000</td>\n",
        "      <td>1470</td>\n",
        "      <td>1470</td>\n",
        "      <td>54200</td>\n",
        "      <td>271</td>\n",
        "      <td>271</td>\n",
-       "      <td>2.296875</td>\n",
-       "      <td>0.423438</td>\n",
+       "      <td>640</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -2954,8 +3072,8 @@
        "0   4   200  32   0.0010                          0                        0   \n",
        "1   8   200  32   0.0011                     114000                      570   \n",
        "2  12   200  32   0.0012                     174000                      870   \n",
-       "3  16   200  32   0.0013                     234000                     1170   \n",
-       "4  20   200  32   0.0014                     294000                     1470   \n",
+       "3  16   200  32   0.0012                     234000                     1170   \n",
+       "4  20   200  32   0.0013                     294000                     1470   \n",
        "\n",
        "    PM_VECTOR_LD_CMPL (max)  PM_VECTOR_ST_CMPL (total)  \\\n",
        "0                         0                        200   \n",
@@ -2964,52 +3082,109 @@
        "3                      1170                      42200   \n",
        "4                      1470                      54200   \n",
        "\n",
-       "   PM_VECTOR_ST_CMPL (min)   PM_VECTOR_ST_CMPL (max)  \\\n",
-       "0                        1                         1   \n",
-       "1                       91                        91   \n",
-       "2                      151                       151   \n",
-       "3                      211                       211   \n",
-       "4                      271                       271   \n",
-       "\n",
-       "   Vector Loads / Loop Iteration  Vector Stores / Loop Iteration  \n",
-       "0                       0.000000                        0.007812  \n",
-       "1                       2.226562                        0.355469  \n",
-       "2                       2.265625                        0.393229  \n",
-       "3                       2.285156                        0.412109  \n",
-       "4                       2.296875                        0.423438  "
+       "   PM_VECTOR_ST_CMPL (min)   PM_VECTOR_ST_CMPL (max)  Grid Points  \n",
+       "0                        1                         1          128  \n",
+       "1                       91                        91          256  \n",
+       "2                      151                       151          384  \n",
+       "3                      211                       211          512  \n",
+       "4                      271                       271          640  "
       ]
      },
-     "execution_count": 9,
+     "execution_count": 32,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "common.normalize(df_vldvst, \"PM_VECTOR_LD_CMPL (min)\", \"Vector Loads / Loop Iteration\")\n",
-    "common.normalize(df_vldvst, \"PM_VECTOR_ST_CMPL (min)\", \"Vector Stores / Loop Iteration\")\n",
+    "df_vldvst[\"Grid Points\"] = df_vldvst[\"nx\"] * df_vldvst[\"ny\"] \n",
     "df_vldvst.head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 33,
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "image/png": "\n",
+      "image/png": "\n",
       "text/plain": [
        "<Figure size 1008x432 with 2 Axes>"
       ]
      },
-     "metadata": {},
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n",
+    "df_vldvst.set_index(\"Grid Points\")[\"PM_VECTOR_LD_CMPL (min)\"].plot(ax=ax1, legend=True);\n",
+    "df_vldvst.set_index(\"Grid Points\")[\"PM_VECTOR_ST_CMPL (min)\"].plot(ax=ax2, legend=True);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here, too, there seems to be a linear correlation. Let's do our fitting and plot directly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Counter PM_VECTOR_LD_CMPL (min) is proportional to the grid points (nx*ny) by a factor of 2.3439 (± 0.000111)\n",
+      "Counter PM_VECTOR_ST_CMPL (min) is proportional to the grid points (nx*ny) by a factor of 0.4688 (± 0.000012)\n"
+     ]
+    }
+   ],
+   "source": [
+    "_fit, _cov = common.print_and_return_fit(\n",
+    "    [\"PM_VECTOR_LD_CMPL (min)\", \"PM_VECTOR_ST_CMPL (min)\"], \n",
+    "    df_vldvst.set_index(\"Grid Points\"), \n",
+    "    linear_function,\n",
+    "    format_value=\".4f\",\n",
+    ")\n",
+    "fit_parameters = {**fit_parameters, **_fit}\n",
+    "fit_covariance = {**fit_covariance, **_cov}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAF/CAYAAAB+GZmgAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzs3Xl8XHW9//HXTCaTfZ2Z7HuanNDSvYVWylqQpVRWuYBsCl52AbWi4BWuwgVxQVmEXuAKilzxCogKePGi/AREWQq4UKZps0z2mUwmmeyTmTm/P5pGapekJelkmvfz8eBBc873nPM5+ZAy75xzvsdimiYiIiIiIiKyZ9ZYFyAiIiIiIjLbKTiJiIiIiIhMQsFJRERERERkEgpOIiIiIiIik1BwEhERERERmYSCk4iIiIiIyCQUnERERERERCah4CQiIiIiIjIJBScREREREZFJ2KYyyDCMU4FvABa2h61b3W7304Zh1AKPAQ7AD1zkdrvrx7eZ9nUiIiIiIiKxMOkVJ8MwLMCPgQvdbvcS4ALgMcMwrMCDwP1ut7sWuB/Y+KFNZ2KdiIiIiIjIATelK05AFMga/3M20AE4gWXACePL/xu4zzAMF9uvTE3rOrfb7ZtCnUnAyvH6IlM8NxERERERmTsSgELgTWB0qhtNGpzcbrdpGMY5wLOGYQwCGcA6oBRoc7vdkfFxEcMw2seXW2Zg3VSC00rglamevIiIiIiIzFlHAq9OdfBUbtWzAV8BTnO73eXAeuBJIH1/K5xBHbEuQERERERE4sI+ZYep3Kq3BChyu92vAbjd7tfGrzyNAMWGYSSMXxlKAIqAFrZfOZrudVMRAfD7B4hGzSluMr1crgx8vv6YHFv2j3oWf9Sz+KOexR/1LL6oX/FHPYsdq9WCw5EO+/hoz1SmI28FSgzDMAAMwzgEKADqgXeB88bHnQe843a7fW632zvd6/blpERERERERKbTVJ5x6jQM40rg54ZhRMcXf9rtdvcYhnEF22fY+xoQAC760KYzsU5EREREROSAs5hmbG5pmyEVQKNu1ZN9oZ7FH/Us/qhn8Uc9iy/qV/xRz2LnQ7fqVQJNU91uqtORx7VIJEwg4CMcDs34sbxeK9FodPKBMmvMlZ7ZbHZyclwkJMyJH3sRERGJkUgkQnerh57merzBCG8PFTI2FuWL5y0hwTqVJ4VmpznxCSoQ8JGcnEpaWgEWi2VGj2WzWQmHD/4P4QeTudAz0zQZHAwSCPhwOgtjXY6IiIgcJEKhMdr8w3i6+kl1P09W/zZyI92kWsKkAiNjBfQmn8n8ipxYl/qRzYngFA6HDkhoEpmtLBYLaWmZDAz0xroUERERiVP9vb34tn3AQHsjloCHtJFOopEo/9F3GgCXZHSQkWSlLWsJNmc5WaXzWFBRzdIke4wrnx5zIjgBCk0y5+lnQERERKYiGo3S09GOv6meka4m/l9kGS2+AY4P/56PJW/FBfSZqfTZ8wk7SrjqmAWUFWTgzD4W60H8eWPOBCcREREREdnZ2FiITv8wHt8QI03vUex9hdywjxRLiCIgalr4jaWE6uJCUjKPpyvjOPIqDUocDkpiXfwBpuAUA2efvR673U5iop1oNMLFF1/K8cefyKZNb/G5z13BeeddyNVXXzcx/ppr/pV3393Eiy/+gdTU1F32193t47zzzuTpp58nIyNjYvmmTW9x553f4Mknf8G1115OV1cXaWlpE+u/8IUbWbhwMaZp8j//81N++ctnAJNoNMrixUs56aR13H33twAIBvsYGhqkoKAIgPXrT+ess87hvffe4cEH7yMQCBCJRFi6dBnXXHMDmZmZE7XvOO7o6AgnnngKl1xy2V6/P7fffit1dYdw1ln/stPyRx7ZyDPP/ByXy8Xw8Ajp6emceOIpnHnmJ0lISNjrPv/859d59NGHCAQC2Gw2ioqKufzya6iunsfpp69
jbCzE008/P7Gf5577JXfc8XVuuGEDZ531Lzz//K+4557vUFBQRDg8Rnl5BTfe+FUyM7O45pp/5bzzLuSII47caw0jIyNcffVnue++/yQlJWWvYz+su9vHv//7V7n33o17HRcKhbjyykv5/vcfID09fcr7FxERkblhaHCIrq0fMNDegOlvJnWoE4fp50f9J7AtnM+iZB+laWE6MuZjdZSTWVxFflUNN+7m8+dcpOAUI7fd9k2qquaxZcsHXHHFpaxYcTgAZWXlvPLKy1xxxTUkJCTQ3t7G6OjIXvfldLpYvHgpL730v5x++tkTy59//leccsr6iVu0rr/+i7v9cP/QQw/w7rubuOeeB8jNdRCNRnnllZdxOl08+ugTE/v64x9f4bbb7prYrrW1hZtv3sA3vvFNli5dTjQa5b777ubf/u3LfP/7P5gYt+O43d3dXHDB2axcuYoFCw7dr+/bSSet45prrgegra2Vb3zja7S1tXD99Rv2uM0bb/yJO+/8Bnfc8W3q6uYDsGXLB/j93VRXzwPA4XDyxhuvs3r1GgBeeOHXbH/X8z+sWHEYt912F9FolK997cs89tgjXHvt56dc+89//lOOOea4fQpNsL2/k4UmALvdzoknnsyTT/6ESy+9fJ+OISIiIgePaDRK0N+Nr2ELwx0NuEP5vB3IJLXfww2ZL+AEBs1kemx5eDIO5+Rlh5JfWUVBbkpcz3o30+ZkcHrtrx28+peOGdn30UuLWDW/YMrja2vrSE1NpaOjDYCUlFQqKionPsS/8MKvOemkdWze/P5e97Nu3Sd44okfTwSnoaFB/vCHl/nxj5/c63ZDQ0P89Kc/4dFHf0JurgMAq9XK0UcfN2ntP/rRf7Fu3WksXbp8YrurrrqOc845jffee4fFi5fuNN7pdFJaWk5XV+d+B6cPKy4u4Stf+RoXXfQvXHbZlXu8yvLDHz7ExRdfOhGaYPv3/cNOPnk9zz//a1avXjMRVquqqne7P6vVyrJlK3n99Vf3qd5f/vIZ7rnnwYmvzz57PR//+Mm8/fab+HxerrjiWnp7e/jtb39DMBjkpptuYfHipXR0tHPZZRfy3HMvAbBmzQr+9V+v4g9/eJm+vj6uvvpzHHPMWgCOP/5ELr30QgUnERGROSISjuDtDtASiNDa3s0hzU+SPeYl3TLCjk+kDdHDKM07kvL5S+lILMRZUUNeXj4FCkn7ZE4Gp9lk06a3CIVClJSUUV/vBuCUU9bz7LNPs2rVEbz00os88MAjE7fM7cmaNUfzne/cSWNjA5WVVfzud7/l0EMXkZ//jxD3ve99m4ceemDi67vvvo+Ojnbs9kTKyir2ufZt27Zy8cWX7rTMZrNRW2uwdeuWXYKTx9NMMNg3EbSmQ3l5BcnJyXg8Tcyfv/swtmXLB3z+81/a636WLVvBM8/8D8FgcCKsfvDB5t2ODYVCvPrqH6irO2S363enq6uTkZERCgp2ngp8bGyMjRt/yObNf+faay/nyis/x0MP/YiXXvotDz54Hw888Mhu95eWlsbDD/+Iv/zlXb72ta9MBKfcXAc2WyLNzU2Ul1dMuT4RERGZ/UJjETq3bibYUk+k20PKYDuOqJ+/h8p5YvAIEqxQlx3Gl1aDL7eMjKJK8qpqOD0z60N7qY1Z/fFuTganIxYWcsTCmXmXzVTfCfTVr96I3Z5EWloat9/+zZ2eTVq2bAXf+c6d/OEPL1NVVU1WVvak+0tMTOSEE07m+ed/xdVXX8dzz/2Ks88+d6cxu7tVr729fYpntivTNKc07nvf+zYPPHAvHk8T1133RXJyZt88/hYLHHfcCbz00osTYfWfg9Nbb73BJZecD8DChYu58MJPT3n/Pp+XnJzcXZavXXsCsP0K2MjICGvXfhyAurpDaGtr3eP+1q49EYAFCxbS3e1jdHSUpKQkABwOB15vl4KTiIhIHOvvDeDd5mawvYG+gRFeHFxAh3+IGzN/QXlCH8OmnZ4EF61ZS3EVHsI
txkqKnGkk2ia/a0j2z5wMTrPBjmecdsdisXDccSdw1123cdNNt055n6eeeho33HA169Z9Ao+niSOPPHrSbSorqwiFQng8zZSVlU/5WADz5tXw97//laOOOmZiWTgcZssWN+eee8HEsh2B7S9/eZcbbriaJUuWTTxb9FF5PE2MjIzsNSTU1tbx/vt/p6bG2Ou+Tj75VC6//BKWLFm227C64xmn/ZGUlEQoNLrLcrt9+3sNdkxKseNrq9VKJBLe4/7+ebtIJDKxLhT6R4gSERGR2S0ajdLT2UVzv40Wbz+Oxv+lcvhvZFkGyRsf02E6cOQuZ2mti9GUixgpdOEoKiFPt9odUApOs9Rpp51JSkoKhx++esrbVFfPw+XK47bbbuGEE06a+HC9N6mpqZxzzvncddftfOMbd5KTk4tpmvz2t79hwYKFFBfveaLJCy64hCuv/AyrVn1sYnKIH/zg+5SUlLJkybJdxi9atIQzzzyHhx9+kDvu+PaUz2tPOjraueOOb3DGGWeTlrbnWeQuvvhS7rrrdurq5mMY259tev/9v9HX18fq1UdMjCsuLuGzn71qj7f8fRRlZeX4/X5CodCU+rK/IpEI7e1tewzlIiIiEjvhSJQuTwt9DX/F7G0lIdBCbsSHnTE2Bs4nYkngEzlWslPK6MkpIbWwmrzKGmodDt1gNwsoOM1SLlcen/rUxfu83amnnsZ3vnMnN9548y7r/vkZp8suu5w1a47m8suv5sknf8K1126fUMA0TRYtWjoxw9yelJWVc9ttd7Fx4/309vYSiYRZsmQZt932zT1uc9FFn+Hcc0+nvt691ytADz30II8//tjE11/60k0A/OY3z/H2228wMjJCWlo6H//4SbtMW/7PVq36GBs2fIXvfveb9PX1jU9HXsQVV1yzy9jTTjtzr/vak//4j1ux2/9xledb3/o+8+bVTHydlJTMsmXLeeedt/cpDO+rv/71PebPP1TTkYuIiMTYUH8/XQ31DLRtw+zx8L+hZdT74cjEv3F66tuEzAT8VicdGQuwOsq48ZQllBTkkpSoW+1mK8tUn1OJExVAo98/QDT6j/Pq7GymoGDfbkPbX1N9xklmjwPVs7/+9T2eeOJH3HHHd2bsGLfeejPr1n2ClSsP3+36A/mzMJNcrgx8vv5YlyH7QD2LP+pZfFG/YicajdLn89EWCNHYE2GkdTOHBV4g1xKcGDNgJvN/aZ8gqbiWqmyTkmwrhyxdQKB376+ckZlhtVpwONIBKoGmqW6nK04iB8jChYv52MeOZHh4eJ/f5TQVoVCIJUuW7jE0iYiIyEcTjZp0+QL0fvAmIW8Tif1t5IxP/f364CpeH62lNhvqkgvozV5OSkEVrspa8l0uLvyn55FsiYmAglM8UXCKM5deeuFOEwEALFhwKBs23BSjivbP66+/ysaNP9hl+eWXXzXpLYK7U1/v5vbb/32X5WeddQ7r15++XzXOhJmsxW637/QCZBEREdl/o8NDdDVsI9i2lUi3h20jObwYKCMhPMydOU8SNq34Lbl4x6f+PrZyMZ+qrCI12QacHOvyZQYoOMWZRx75caxLmBarV6/Zr4C0JzU1Bo8++sS07U9ERETmjv5AD+3tXTQOpOHpDHJ0148oMH3kWExygGHTjt++mKMWFVGWn0FfSjX55RXkzOCETzL7KDiJiIiIyJxgmibdfSN0b36bULsbW7CN7FAXmZZBQmEnPwueQk5GEoemlzKcXkdKQQW5FbU4C4s4bqdb7WbmfaAyuyk4iYiIiMhBZywUwtu0jd6WbYR9zUQHAzwcPIrh0TCfSX+ZhYkt+C3Z9KSU4c8pJbV4Ht+rW0Jmqh04YtL9y9yj4CQiIiIicW371N9b2Bpy0OIdpqDjZVZH3iTTEiUTCJk2/FYHqw9xUJKfTUlWDUmFTqpmYLImOXgpOImIiIhIXDBNk96BEO2NDYQb3iSht4WM0S5yLUGcwEN9n2DA7uIIRwGepFUk5pWTWzY
PV2kFDluCXiIrH4mCUwycffZ67Hb7xAtTly1bzuc+9wUefvhBKiurWLv242za9BbhcJjDDls16f76+nr5xje+RltbK3a7neLiUjZsuImcnJxdxj722CP87ne/xWq1Yppw4YWXsHbtx3cas2nTW1x//VVcd90XdvtyWdM0sVgs3H77rdx8860TX8fKc8/9kp/97AksFivRaIT168/gk588d6cxHk8Tn/70pzjjjE9yzTXX77KPV155mR/+8GHGxkKYJqxb9wnOO+8CAH75y2d46qknJ87z/PMv4sQTTzkg5yYiIjJXRcIRfJ4melrqGfM2Yw+28cLQQv426GB+YiuXZ/yOHjOTYFIBvdkrSCmoZEPtIrJzMmP6uUQOXgpOMXLbbd+kqmreTssuu+yKiT+/887bDA8PTyk47fgwv2zZCgDuv//7PPjgvXzlK1/bZexZZ/0LF198KQDd3T7OP/9sVq5cRWZmJgBDQ4M88MC9rFr1sT0e7957v0td3XwikQhPPfUkg4ODXHTRZyY/6Ul0dLRz++23ct99/7lP2x1zzHGccsp6LBYLQ0ODXHjhv7B06XLmzasBIBKJcNdd/8GRRx6zx33k5jq56667cTpdDAwMcOmlFzB//gIWL15KSUkp9977n2RmZuL1dvHpT5/PokVLKCws+iinKyIiIuNGhofoaqinPQj1wWT6OzycO/pT0iwR0mB86m8HRlEqi8prKHMuxJp7BuWZGbEuXeYQBadZ5Pbbb6Wu7hCWLFnOs88+TTQa5a233mDt2o9z4YWX7HG7zMysidAE29/r9MwzT+12bHp6+sSfh4aGsFjANKMTy+69927OP/9C/vjHV/d4vM997gts3Hg/L774AosXL91taPrNb57jqad+xgMPPILVauWGG67m2GPXzsh7htLS/nFOIyMjhMPhnX7T9Pjjj46/eHaI4eHh3e5jwYJDJ/6cnp5OeXklnZ0dLF68dKfvbV5ePg6HE6/Xq+AkIiKyH/qHQng6+4j8/UUsgVbSRzrJNXvJtZi8N3IIf46spsKVRWvyMmx5FWSXVJNXWUVOop15k+9eZMbM2eA09Ks7dllmqzoM+4K1mOFRhl/47i7rE2vXkGgcSXSkn5Hf3rfr+vnHYTNWT+n4X/3qjRO36l155bUcfvg/tquunsdpp53J8PDwTreVffGLn+Oyy66grm7+HvcbjUZ55pmnWLPmqD2O+cUvfs7PfvbfeL1dfOUrXyMrKxuA119/jf7+fo499vi9Bqf77vseNTW1fPzjJzM2FuLxxx/lggsu2WnMSSet45133uaBB+4lPT2dzMysGX0566uv/j8efPB+2ttbufzyq6mu3v5X69at9bzxxp+4554HefTRh6e0r+bmJt5//6986Uu7vlR406a3GBgYoK6ublrrFxEROdhEo1H87a30NNUz0tWELdhKx0gqT/QtB0xuy36FiMVG0J5PMHMhSQWVrK6sY31hwfgvQA+L9SmI7GTOBqdY292tepP59rfvmXTM3Xd/i9TUFM4665w9jjn99LM5/fSz2bZtK1//+ldZseIwrNYEHnzwPr73vfsnPcbVV1+HxWLhrbfe4Oyzz8U0zd2O+/znv8Sll15IOBze44t7A4EebrjhGgDC4TG6ujq55JLzge1XgTZs2DW87M6aNUezZs3RdHZ2ctNNX2D16iMoKirhm9+8jZtuuoWEhIQp7ae7u5svf/nz3HDDjTidrp3WNTY2cNttt3DLLbeTlJQ8pf2JiIjMBTum/u7u6OTvoWI83gHWBX9KRYKXIiBqWvBbssnIdHHOsnmU5aeTkXsYGbrVTuLIlIKTYRjJwN3A8cAI8Lrb7f5XwzBqgccAB+AHLnK73fXj20z7uumUuv4re1xnsSXtdb01OWOv62Plvvu+R2urh29+826sO72kbfeqq+fhcLh45523yclx4Pd389nPXgxsn3DitddeIRgM8ulPf3an7XbcBnfzzbfu9PU/8/v947cDWhg
cHNzplrodcnJyefTRJ4CpPePU19fLddddBUBZWTlf//rOVw4LCgo45JAFvPbaqxx77PG0t7eyYcN1AAwM9GOaJoODg9x448277DsQ6OH666/i/PMvYu3aE3Za19LiYcOG69iw4SYWL16yx/pEREQOdkMjYVq8/QS3vEVS53ukDXWSa/aQaYliiybywOD5lLoy8OcdhiUjgcySeeRXzqMqJYWqWBcv8hFM9YrTXWwPTLVut9s0DCN/fPmDwP1ut/txwzAuADYCx83gujkjLS2N7m7flMdv3Hg/bvdmvvWt72O32/c4rqmpkYqKSgDa29uor3dTUVFFRUUlv/71byfG7Xjeanez6k3F2NgYt9zyFa666nOMjo5yyy03ce+9G7HZPtpFzqys7ImgtUNzcxPl5RUA9Pb2smnTWxx99HEUFBTw3HMvTYx75JGNu9z+uENfXy/XX381Z511DuvXn77Tura2Vj7/+Wu5/vovsnq1XognIiJzQzQapc/nxde4hZHORqy9rWSMdnFHYB0hEjk15T1WJ28lkJiPJ6MWe14FOeXzuL+0goQEK7Bi0mOIxJNJP8UahpEOXASUuN1uE8DtdncZhpEHLAN2/Gr+v4H7DMNwAZbpXud2u6eeIg4CRx11LDffvIFLLjl/YnKIPT3j1NCwjR//+IeUlpZxxRXbJ2ooLCzijju+DcAll5zPt7/9fZxOF488spHGxgZsNhtWq5Xrr//iRJCaTj/4wT3Mm2dw/PEnArBp05s89NADXHnltdN+rGeffYo33vgzNpsN0zQ566xzpjQb4cMPP4jT6eTss8/h8ccfo6XFw7PPPs2zzz4NwCc/eS7r1n2CBx64l2Cwl4cf3sjDD28Edn0uTUREJJ7tmPo74KnHPVbIVn+U/O63ON3+GoXjY/xmFsHkAs5cXUhBSRGlzlVkZSRTOIW7XEQOBpY9PZ+yg2EYi4Gnx/85FhgAvgoMAz9yu90LPjT2feACtgegaV3ndrs3TeF8KoBGv3+AaPQf59XZ2UxBQfkUNv/obDYr4XB08oEya8ylnh3In4WZ5HJl4PP1x7oM2QfqWfxRz+LLvvRrdCxCq2+ATo+HjIaXSBnsIDfajd0SAeDRwWPoyV7AIbmjHJLYQXpJNflVtaSm73rLvew//YzFjtVqweFIB6gEmqa63VTum7IBVcA7brd7g2EYhwO/Aj65H3UeEOPfiAlerxWb7cD9NuRAHkumx1zpmdVqxeU6OB7EPVjOYy5Rz+KPehZfdtevgNdH6wfv09tcT6S7mdShDn4/WMurowYOaz8bst4nkJhPZ+5KUgqrcFXX8dWaGuxJe77tX6aPfsbiy1SCUzMQZvttc7jd7j8bhtHN9itOxYZhJLjd7ohhGAlAEdDC9itH071uyv75ilM0Gj1gVxTm0tWLg8Vc6lk0Gj0ofrul39LFH/Us/qhn8SU3NxX3e5vxN22lcyDKuwN5dHkDfNn2KGkWSAN6zXT67PnUGhUsrl5IeV4auVnryfunW+36gqPAaEzOYy7Rz1jsfOiK0z6ZNDi53e5uwzB+z/bnjl4cn/UuD9gCvAucBzw+/u93djyLZBjGtK8TERERmevGwmHau4fxePtJ3fxr0vubyI34SLaMUQz0jRXjT/oElWUuGhJOIzO/iLxqg9LsbEpjXbxIHJvqFGdXAP9lGMZ3gDHgQrfb3WsYxhXAY4ZhfA0IsH0SiQ9vM93r9ptpmnucNltkLpjseUYREZl9BoNBvA1uBtoaMXs8pA13MBy28N3gKQBcnunBYo/SlbMYskvIKp3HkopqDk9JGd/Dgj3vXET2yaSTQ8SZCnYzOUR3dwfJyamkpWXOeHiaS7d9HSzmQs+2v78qyMjIEE5n4eQbzHK6vSH+qGfxRz07sKLRKL3eLrob6xn0evhjZDHNXf2cEn6R5UlNAAyYyQQS8xnNLGNswamU5qWTn5OK1WpRv+KQehY7Mzk5RNzLyXERCPgYGOid8WNZrVai0YP
7Q/jBZq70zGazk5PjinUZIiJzXjgcpiswjMc7yGjDJvK9r5MT9pFmGZmY+vvXZhEVhXlY09fSmWHBVVVLoSuP+P/Vl0j8mhPBKSHBdsB+y67fHsQf9UxERGbKyNAwXQ1b6G/bRtTvmZj6e2PwJFojDlYkd1KcNoI3rRaLo4yM4iryqmr5N039LTLrzIngJCIiIjLTgn4/3kY3Q+2NbA4V8JdAGpnBrVyV8X/kAkOmnYAtj9bMFaxfWkdhRSUFjmOwJcyNV2KIxDsFJxEREZF9EIlG6PYHafGP0dHeRXXT02SNdZFpGSJ/fMyWyCryXR+jsmY57fZiHBU1uAqKyLcqJInEKwUnERERkT0YC0fp2vYBfZ6thP0ekgfayI1089ZoFU8NHY7NYrIhZwB/agX+nDLSiypxVRmcnZ39ob0cErP6RWT6KDiJiIiIAIPBPrzbttDf3kCgP8RLgwYd/kH+LeN/KEsYYtS04U9w0Z65kOLCQ/k3YwXFzjTsiWtjXbqIHAAKTiIiIjKnRKNRerv9ePoseLr6cTa+QMnQZnIs/TgBJ2CLusjOXciiagd9yReRVJiPs6QMZ0JCrMsXkRhRcBIREZGDViQapcvTQm/j+4z5mrH3t5ET9mIjwr2BczGxclZOlIzkInqzS0gtrMJVWcN8Vx7zJ/ZSHcMzEJHZQsFJREREDgojg0N0NdbT37qViL+F/x1bzrbuCGsTN3Fyyl8YM634rU686QaW3DK+fNJiSgpySEk6Ltali0gcUHASERGRuBP0d9PiH6W5J8yo528s7f0tOWYvuRYmpv52pB5C2dIqKrPz6c86FVd5BbmJ9liXLiJxSsFJREREZq2oaeLzBQi4NzHa1Ygt2E72+NTfvx84kk2hSg7JjFKT4qAvazHJBZXjU38Xcqmm/haRaaTgJCIiIrNCaHQUb+M2elu2EuluZsuoi9/3FJI0FuTrOU8RMS30WHLwp1bizynlxKplXFJZSXpKIrA+1uWLyEFOwUlEREQOuIG+XjrafTT2J+Hp6ufojkdxmd1kWUyygFHThs++jI8dupSyvHn0JleRX1FNdnJyrEsXkTlKwUlERERmjGma9ARH8X3wNqPtW0jobSUz5CXH0k/fWAE/7f84WWl2FmUtXub5AAAgAElEQVQUMpReiz2vHEd5LY7SUo63fnjq7+KYnYOICCg4iYiIyDQJh8fwNjfR69nKmLeJ0GA/PwquYnAkzJUZv6XW1kHAkkXf+NTfWcW13F23hKz0JGBNrMsXEdkrBScRERHZZ8ODg3gb6tkWcuDxDlLU9hIrIpvIsETIAMZMKz6ri+W1DsoKsnBkVGIvclGRlh7r0kVE9ouCk4iIiOxV32CI9oZthBrfxhpoIWO0a2Lq73t6zyRkzyY9N4+WpBUk5lWQXTaPvPIKcm2JGLEuXkRkmig4iYiICACRaAR/Sws9nq283+MBv4dfDy3hg4FMltobuST9FQJmBkF7Hn3ZS0guqOCmmiXkOrKwWCyxLl9EZEYpOImIiMxBodERuhq30ha0sLXXxlBnI2cO/5wUS5hiIGJa8FtyqStMYknpPMod87E4z6IsKzvWpYuIxISCk4iIyEFucGQMT0cv4b+/hBloIX24g1wzQLbF5E/Di3gtspwqVxZtmYtJcJaTXTqP+SsWkz0wRnWsixcRmSUUnERERA4S0WiUns4Oepq2MNzVhK23lebRDH7euwgLJnfm/B8h7PQl5tGceQhJ+ZUcXVXHWUXFWC0WYPXEvpJSkmFgLHYnIyIyyyg4iYiIxKGxsTF8nkZ8nV42j+Tj6epnfd8TlCR0UwhETQhYssnJcHH2kmrK8tKxZy/DlZujNyKJiOwHBScREZFZbiQUptU7SJ/7DRI7/0bKUAeOaDcZliiRaCq/HziHElcaXudyIhl2MkqqyaucR0VaOhWxLl5E5CCh4CQiIjKL9Pl8+Bq3MNTRgKW3ldQRH3cGTiGClTNT32ZlUiM9tjxaslaS6Ko
gp2weP6isJsFqBVbGunwRkYOWgpOIiEgMRKIRulta6GnegnusmAb/GMXdf+IU25/IHx8TMDMIJuVz5qoCCosLKHOuJDsrjXyrNaa1i4jMRQpOIiIiM2wsHKHVN4jX00Ryw8skD3bgiPhItYRJBZ4bOJ5gVi1lhXU02nPIKK4ir6qWsswsAA6NbfkiIsI+BifDMG4BbgUWut3uvxmGsQrYCKQATcAFbrfbOz522teJiIjMdv29vfga3Ay0N0CghfThTl4cnM8bo1UUJfRwXeZf8Se4aMtags1RRlbZPD5XUY09KSnWpYuIyF5MOTgZhrEMWAV4xr+2AI8Dl7jd7lcNw/gqcCfwmZlYN10nLCIiMh22T/3djr+pno5++Eu/g26vjw22H+MCXEDQTKUvMZ/5Rgkrqg6lNC+V3OwzcFkTYl2+iIjsoykFJ8MwkoD7gfOB348vXgGMuN3uV8e/fpDtV4g+M0PrREREYmIsHKGrZ5jmrn5SN/+KlH4POWEvqZYQRUDXaDldySdTWpzPtoSTSS8oJa+ylmKHU1N/i4gcJKZ6xenrwONut7vRMIwdy8qA5h1fuN3ubsMwrIZh5M7EOrfb3bOf5ygiIjJlQwMDeBvr6W/dRrTHQ+pQJ30hGz/oPx6A6zK3kpAIXRnzseaWklEyj8MqazgqLXV8D3oiSUTkYDRpcDIMYzXb5zf98syXMz0cjvSYHt/lyojp8WXfqWfxRz2LP7OxZ90dnbR98D49bS38OTqfxrY+1o38ikPtrTiAITOJXns+Sa4qvrBsGVXFWRQ5T8Vmmxu32s3GnsmeqV/xRz2LL1O54nQ0UAfsuNpUAvwvcA9QvmOQYRhOwHS73T2GYXime92+nJTfP0A0au7LJtPG5crA5+uPybFl/6hn8Uc9iz+x7lkkGsHbM0yLb5DhrW/i8r1J9piXDMswGUCaCT+N5lKY72AsfS0dGVaclbW48vJ3mfo7EBiKzUkcYLHumewb9Sv+qGexY7Va9utCy6TBye1238n2CRoAMAyjCTgVeB/4V8Mw1ow/k3QF8LPxYW8DKdO8TkREZFKh0RG6GrbS17KViN9D8mA7jkg33+s7FV80kyOS2yhJHaA7tRp/bhnpxZXkVdVy2/jU3yIiIruz3+9xcrvdUcMwLgQ2GoaRzPjU4TO1TkRE5J/19/bi2+ZmoKOBzaEi/taTjCP4AZem/55sYMRMpGd86u8zl86joKycIucxJNr0AlkREdk3FtOMzS1tM6QCaNSterIv1LP4o57Fn4/as2g0Sk9gAI9/lM7Wdsqbf0lmqItsy8DEmF9F1uB1rqTaYaUm0YujYh65xSUkaOrv/aKfs/iifsUf9Sx2PnSrXiXbL9RMyX5fcRIREZkJY+EI3sat9LZsJexrJmmgjdywj9dGavjV8HLsljBfyumhN6WUQHYJaUVVuCprOd/hiHXpIiJyEFNwEhGRmBkaGKCrYQsDbQ34BiK8MlBFW3c/X8/4KWXWEGNmAn6rk86M+ZTXLuRmYzklznSS7B+PdekiIjLHKDiJiMiMM02T3p4AnkAET9cAzm2/pmConhz6cFrACZiRfNJyajl+RRk++4VQWIizrJxcW2KsyxcREVFwEhGR6RWNmnjbWgk0bCbkbSKxv43sMS8WM8r3es8BLJybEyI9KY9g1lKSC7ffaneoK49FE1N/z4vlKYiIiOxCwUlERPbb6PAI3qbxqb+7PbwwdhhN3SOcZHuDtSnvEzat9FhyCWTWEM0q4caTFlNakE1q8nGxLl1ERGSfKDiJiMiU9PcGaPWP0twdYszzHgsCvyPX7CXbYk5M/Z2dOp+jFpVRlHUKwez15FVUk2O3a/YoERGJewpOIiKyE9M06e4O0LPlXUY6m7AF28gKdZJlGeSF/mP521gpCzPHqErJpjnzUJLyK3BU1JBbVMwVmvpbREQOUgpOIiJz2NhYCG9jA30tWxnzNbN5tIBXelxkjPm5OftZoqaFHks2gZQyenJ
KWHfESi6rrCAz1Q6cGevyRUREDhgFJxGROWJoYID29m6aggm0dvaypv1HOEw/mZYomcCYmUBH4koOn38oZa5KepKqyK+qoTIlNdali4iIxJyCk4jIQcY0TXoHQng3b2KkvR5rXysZI13k0EfXWClPDBxLekoiizJzGUyfh91VTk5ZDa6yck6yffh/C6UxOwcREZHZRsFJRCSORcIRfK3N9DTXM+ZtZnBwiCeDy+kfGuP6jBeoSvQRMDMJJuUTzF6Gs9jgO3VLyU63Y7EcGevyRURE4oaCk4hInBgZHsbb1EDDSDYe7wAlrS+yKPweaZYwaUDYtNJlzWNxlYPSggzS0suwFuVRlpkZ69JFRETinoKTiMgs1D8Uor1xG6MN72AJtJA20kmu2UuOxeTngXMw7elk5DpoS1uKzVlOdln1xNTfdbEuXkRE5CCk4CQiEkPRaBR/exs9zfWMdDZiC7bx7NBytvansMpez3npr9NnptFnz6M581CS8yu4pW45TkcWFosl1uWLiIjMGQpOIiIHyPapv7fRErTQ0GtltL2edcPPkmIJUQQTU3/X5dtYtmIeZY46yD2bklwHJbEuXkREZI5TcBIRmQFDI2FaO/yMbP4D9HhIHerAYfaQaYmydWgZr0YWUevMoCNjAQnOMjKL55FfNY/KlBQqY128iIiI7ELBSUTkI4hGo/T5fPgatzDc2UBCbyvbRnJ4treOBCLclfMCo9gJJObhyajBnlfB8VXzOb+4BKvVAhwV61MQERGRKVBwEhGZokg4gq+lia6ubtzDDlq6+lnf+2MKrT0Ujo/pMTNxpDs5Y1EV5fnpkLmEPIeDAqs1prWLiIjIR6PgJCKyG6NjEVp9A/R/8GcsnZtJGWzHEfVvn/o7kslv+8+g2JVGV84SRjNSSC+uIr+qhvKMTMpjXbyIiIhMOwUnEZnz+nv8dDXUM9TRgCXQgn3Ez12BEzFNC+el/ZnF9hZ6Ely0Zi0l0VVOVlkND1TVYEuwAofFunwRERE5ABScRGTO2DH1t79pCx+Ei2nyjVLe/SrHJ7xJ/viY7VN/53P64YUUFboodywjNzeTPN1qJyIiMqcpOInIQSkcidLePUhn0zaSGl8jaaCN3Eg3KZYQxcDPgycxml1JaV4tjfYc0oqqyKuqpSQnlxJgQaxPQERERGYVBScRiXuDwX68DW4G2howAy2kDnXw3OBC3hstpdLm5aqMd/AnOCem/s4qnceGyhqSkpNjXbqIiIjECQUnEYkb0WiUXm8XrX/5E/XeMO8Hswh0dXBdwhM4AScwYCbTm5jHotoCVlctoNS1kuzsT+KwJcS6fBEREYljCk4iMitFoyYdPUO0dAZJfv9X2PtbyQ17SbOMkAWER6tpsR9PWUEhDda1pBZU4qqsId/potBq5ZBYn4CIiIgcVBScRCTmRoaH6GrYSrB1G1G/h5TBdnyhZB7pPxqAG7M2k5CQgDetBqujjAJjPke4yjghI2N8DwtjV7yIiIjMCQpOInJABXv8eBu20Oft4s2xebR4Bzgz9AtqEzvJBYZNOz0JLpLyyrn0qEMoy8+gIPcoEm3/+OvK5crA5+uP3UmIiIjInDNpcDIMwwH8GKgGRoGtwOVut9tnGMYqYCOQAjQBF7jdbu/4dtO+TkTiRyQaxd87jMc7yMjWP5Pj20RWyEuWZZB8wGFa+VnkEorzshlMO472TBuOilqchUWa+ltERERmnalccTKBu9xu98sAhmF8C7jTMIzLgMeBS9xu96uGYXwVuBP4jGEYluleN50nLSLTKzQawtu0jb6WrYS7m0kaaMcR8fEfvacRNFM5NrmZotReAqnlBHJKSSusxFVt8M3snPE9LIpp/SIiIiKTmTQ4ud3uHuDlDy36E3AlsAIYcbvdr44vf5DtV4g+M0PrRGQWGAz24W2oZ6Ctgb+PlbC5x0Zh31/4VNqrZAEh04Y/wUl75kLOWVJJYVkpxc6jsSdqVjsRERGJX/v0jJNhGFa
2h6ZfAmVA8451bre72zAMq2EYuTOxbjzAicgBEo1G6Q0O4/EN09XaQnHz82SMdpJr6Z+Y+vvtsaPIci2juGwFrfYycstrcBaXaepvEREROejs6+QQ9wIDwH3AGdNfzvRwONJjenyXK2PyQTKrzPWehcNh2urr6drqZqi9gYS+FrJDXl4eruPFkUWkWUbYkNPFQGoRw85yMkvnUVI3ny8WFGCxWGJS81zvWTxSz+KPehZf1K/4o57FlykHJ8Mwvg3UAOvdbnfUMAwPUP6h9U7AdLvdPTOxbl9Oyu8fIBo192WTaaPZvuLPXOvZyNAQXQ31BNu24R2APw6U0u7r57aMn+C0RAibVvxWB960GubVLGJ57TJKXOmkJJ2y036iQHf3QEzOYa717GCgnsUf9Sy+qF/xRz2LHavVsl8XWqYUnAzDuB1YDqxzu92j44vfBlIMw1gz/kzSFcDPZnCdiOyjYF8/np4Qnq4BnFt/iWuogVyzl1yLSS4wFikiKauCo5eW0p74KRyFxbgqKslJtMe6dBEREZFZZSrTkS8AbgK2AH80DAOg0e12n2EYxoXARsMwkhmfOhxg/IrUtK4TkT2Lmibd7W30NLgZ8TaSGGwjK9RFOGrlu31nAnBh9iBpSbkEsxaSkl9JbmUNSwqKWDYx9XdN7E5AREREZJazmGZsbmmbIRVAo27Vk30Rbz3bPvX3VvpatjHW7eE3Y4fh8Q2x3vYaa5K3EDEt9FhyGEwpwMwpJfHQEynNzyA9JTHWpU+beOuZqGfxSD2LL+pX/FHPYudDt+pVsv1CzZTs6+QQInIADQb7aPGH8HSPEG7ahBH4A7lmgCxLlCxg1LSRljyfjx1aRG7mKfRmf4L8imqyk1NiXbqIiIjIQUXBSWQWME2TQE8vvi1/YaSzEWtfK1mjXeRY+nkmeAL14UKWpo9QkZqOJ9PAnlexferv0jKutWrqbxEREZGZpuAkcoCFw2P4mpsIeLYy5mvm76OFvO7PwTHWyReyngfATxbB5EJ6s1dy+sdWUlRRTlZ6UowrFxEREZm7FJxEZtDw0CAdnT009Vpo7/BzePtPcET9pFsipANh00qLbTXLjVrKXWX02GvIq66hIi227yITERERkZ0pOIlMk77BEL7Nmxhs34q1t5X0kU5yzT6aQxU8PngkackJLMpMpyWzArurnKyyebjKKzl1p6m/K2JVvoiIiIjshYKTyD6KRCP4W1vpad7KaFcjwcFRnu5fTN9AiBszf0mVrZdeM52gPZ/+rMUUFtfxrbql5GYmYbEcHevyRURERGQ/KDiJ7EVodJSu5iYahzPwdPVT2vob5o/9jRTLGMVAxLTQbilgQcWxlOWlY027AktJAaVZ2bEuXURERESmkYKTyLjBkTHat21juPFdCLSSNtyBw+whE/hJ4Dys9iROysmmI3MRCc4yMkurya+Yx/zkZObHungRERERmVEKTjLnRKNRAl2ddDfWM9LVSGJ/G08NrKQxaOPopM2cmfYm/WYKvYn5NGfWkZRfwdfrVuJyZmHVrXYiIiIic5KCkxzUwuExvM1NtPZZaAiYRNo/YO3Qr0mzjFI0PqaHLOryLCxbWkVlbg1RxzkUOV0T60VEREREFJzkoDESCtPW7mfog1cx/R5ShjpwRP1kWCL8dfBw3ogcwnxnKt50A6ujjIziavKqalhRUUi5rz/W5YuIiIjILKbgJHGpr9uHt7GeoY4GrIFW3KNOXuipwk6Iu3J/xZBpJ2DLoyVrBXZnOSdXH8olpSUkWK3A2liXLyIiIiJxRsFJZrVINEJ3awtdnX7qh7Np7gxyeuBRXNY+CsbH9JrpuFIdnLamktL8dMYyFuPKyyffao1p7SIiIiJy8FBwklljLByhrXuQvs1/hs4PSB5sJzfSTaplDFs4l98MrKfQkUZn1kIGMzNIK6wkr7qW0qxsSmNdvIiIiIgc1BScJCb6+/rwNWxhoG0bBFqxDAf4fuB4oqbJJWmvMt/ehj/
BNT71dzlZZfP4QXUdiTYrcFisyxcRERGROUbBSWZUNBqlp6sDf2M9H4SL8fiGqfS9zNHWTbgAFxA0U+lLzGPdYUWUFuZQmruYHGc2TmtCrMsXEREREQEUnGQahSNROv1DdDVtxdb4OvaBNnLDPlLHp/5+vO9UIlklFDtqaEzKJbWwElelQbHTSTHoJbIiIiIiMmspOMl+GR4coKuhnv7WBqI9HlKHOvhF/xI2hwqoS2zjsvS38FsddKXXbZ/6u6Sam6vqSE5NjnXpIiIiIiL7TMFJJtXX7cPbsIW2gQTeD6Yz1OXhs/wMhwUcwJBpp8eWx7JaF2sq51PmWkFm7rnk2hJjXbqIiIiIyLRQcJIJUdPEGxjG09mH/f1fkxhsI3vMS6ZliALAPVJHU+JRlOcV0Ww9muSCKpwVNbjyC8i3Wjkk1icgIiIiIjJDFJzmqNDoCJ0N2wi2biXS7SF5sJ220XQe718NwC3Z7xKxJtGTWklPbhnpRVUcW1XDqVnZ43tYErviRUREREQOMAWnOaC/txdfwxZ6vD7eGavA4x3gnNH/ocLWTQ4watrwJ7hIzyvi00fWUZaXQWHuUdjt+s9DRERERAQUnA4q0WiUnuAoHu8AI/V/ItP7LpmhLnIsA7iADNPGf49dTGl+Jr2px9KWmYSjYh65xaWa+ltEREREZC8UnOLU2NgY3uZGej1bGfM1kzTQRk7Yx9d7z2DEtHNS8lYOS/XTl1JCb3YJaUXVuCpq+I7TOb6HxTGtX0REREQknig4xYGhge1Tfw+0beNv4TK2dFsoC77N2Sl/IhMYMxPwWx14Mw7h/CXlFJYUUew8iuQktVdEREREZDrok/UsYpomff0jeHxD+DxN5HleJGOkkxz6cFrACbweOpY05yKcxctpTSwnu7wGV1m5pv4WEREREZlBszI4GYZRCzzG9tcE+YGL3G53fWyrml6RSARfi4eAZyuj3kYSg+1kj3n5v6H5/L/R+eRYB7g+q51gUj7BrKUkF1TirKjhsvwCrFZrrMsXEREREZlTZmVwAh4E7ne73Y8bhnEBsBE4LsY17bfQ6CidDfUEW7fRMWDljYFCOr193Jb+OGkWk4hpwW/JpSe1kkNq5rNq3lJK8zJITf5ErEsXERERERFmYXAyDCMPWAacML7ov4H7DMNwud1uX+wq2zemadL54n8Rbv+AjFA3ORaTHGAwXEpC1hmsXlRKs+1cHEWl5FVWk21PinXJIiIiIiKyB7MuOAGlQJvb7Y4AuN3uiGEY7ePL4yY4/fFvnYS2tJKWkERb9iqyS6txVNSworiEwyem/q6NaY0iIiIiIjI1szE4fWQOR3pMj+9yZXDsYXa2FHyRhdVOzW4XB1yujFiXIPtIPYs/6ln8Uc/ii/oVf9Sz+DIbP9G3AMWGYSSMX21KAIrGl0+J3z9ANGrOWIF743Jl4PP1A1DhSqM/OEx/TCqRqfpwzyQ+qGfxRz2LP+pZfFG/4o96FjtWq2W/LrTMuunZ3G63F3gXOG980XnAO/H0fJOIiIiIiBxcZuMVJ4ArgMcMw/gaEAAuinE9IiIiIiIyh83K4OR2uz8ADo91HSIiIiIiIjBLg9NHkADb71uMpVgfX/adehZ/1LP4o57FH/Usvqhf8Uc9i40Pfd8T9jbun1lMMzaTKMyQNcArsS5CRERERERmvSOBV6c6+GALTknASqADiMS4FhERERERmX0SgELgTWB0qhsdbMFJRERERERk2s266chFRERERERmGwUnERERERGRSSg4iYiIiIiITELBSUREREREZBIKTiIiIiIiIpNQcBIREREREZmEgpOIiIiIiMgkFJxEREREREQmoeAkIiIiIiIyCQUnERERERGRSSg4iYiIiIiITELBSUREREREZBIKTiIiIiIiIpNQcBIREREREZmEgpOIiIiIiMgkFJxEREREREQmoeAkIiIiIiIyCQUnERERERGRSSg4iYiIiIiITELBSUREREREZBIKTiI
iIiIiIpNQcBIREREREZmEgpOIiIiIiMgkFJxEREREREQmoeAkIiIiIiIyCQUnERERERGRSdhiXcA0SwJWAh1AJMa1iIiIiIjI7JMAFAJvAqNT3ehgC04rgVdiXYSIiIiIiMx6RwKvTnXwwRacOgACgUGiUTMmBTgc6fj9AzE5tuwf9Sz+qGfxRz2LP+pZfFG/4o96FjtWq4WcnDQYzw5TdbAFpwhANGrGLDjtOL7EF/Us/qhn8Uc9iz/qWXxRv+KPehZz+/RojyaHEBERERERmYSCk4iIiPx/9u47PI7y3P//e1errpW0Tb1b0rjigjE2NVTTHAgYAoQAARIINSThkB+QE5IAAUIOocMXkkAghTQIhBpIOECSE3qHcVGvu9pV77s7vz8sFBs3WZa8Kp/XdemyZp6ZZ+7Zm0G6Nc88IyIiO6HCSUREREREZCfG9IyTYRi3ACcBJcAi0zQ/GFlfAwyMfAFcaZrmcyNtK4H7gGSgBjjDNE3/7rSNVyQSpr09QDg8tDvdjInfbycajU76cWTizIacORwJuFw+4uJm2mONIiIiMtX0dnXhrzLpaaqmuTeOf3YXMhSO8oNzV+CIm773bcb6W9TjwG1se6rvtZ8WUp8yDMMGPAKcbZrmq4ZhXAPcCJwz3rbxnNyn2tsDJCWlkJqag81m252udsrhsBMOz+xfwmeamZ4zy7Lo7e2ivT2A15sb63BERERkhohGo3S2d1HbHqa+tZv8jX/C21+Dy9aNF/ACvZES0t1zmFfiwm6f3N/DJ9uYCifTNF8FMAxjrP0uBwY+3Q+4l013j87ZjbZxC4eH9kjRJDIV2Ww2UlPT6enpiHUoIiIiMk1FoxatdbV01HzCUKCWhK4GXOEAXZFEbu86HoCvuIaJT8qlPXMFKTml+MoqWenLYmWMY58oEzFu51cjd4peBa4yTbMDKAJqP93ANM02wzDshmG4x9tmmmZod4JU0SSzmf77FxERkbEa6O/DX7WBroYNhEON/GVoJQ2BXk5OfJkViVWELTttdg+tqZXYvcV8Z8EyCrPSSE48NNahT6rdLZwONE2z3jCMROCnwJ3AGbsf1u7xeNK2WPb77Tgce2485Z48lkyM2ZAzu92Oz+eMdRgTZiady2yhnE0/ytn0onxNP1MhZx1tbdS0DVPV3MvQun8yp+1lXFYHLpuFC+i3EvBk7s3C/UrIz8gBbwqllRVUJiTEOvQ9brcKJ9M060f+HTQM427giZGmOqD40+0Mw/AClmmaIcMwxtW2K3EFgz1bvFAsGo3usWdYxvK8zNq1a0hISCA+PoFoNMJZZ53L4Yev5q233uDSSy/gtNO+zEUXXTa6/cUXf4133nmL559/mZSUlK36a2sLcNppJ/KnPz2N0/mfC/Ctt97gxht/yKOPPs4ll5xPa2srqampo+3f+taVLFq0GMuy+P3vf8sTTzwGWESjURYvXspRRx3Lrbf+GICurk76+nrJyckDYM2aEzjppFN49923uffeO2lvbycSibB06TIuvvhy0tPTR2P/9LiDgwOsXn0MZ5993g4/n/Xr13HbbbfQ09NDODxMWpqTG274Mb/4xQO8//67ANTUVJGXl09CQiIAP/vZw8TFxW2zv48//pD77ruLxsZGkpISycx0ce6557NkyTIuvvhrfPTRBzz++DOkp2eMfm6XXnoBp556Bhdf/A3eeusNrrjiMgoLi4lEwng8Xq688hpyc/O4/vprmTt3Hied9MUdnlM0GuWii77K979/A1lZ2Tvc9rPOPvt07rvv5yQmJu1wu4sv/hpXXfU98vLytxtDINC9S8eeqnw+54w5l9lCOZt+lLPpRfmafvZ0zizLoi0QJLTuPQZaq3F0NZI51Eq6rZcHOo+mNuJj34wBcpLcdGYsIjG7FG9JBd7cPL5u3/IPzB2dg8DgHot9otnttq1utIzFuAsnwzBSAYdpmp0jQ/VOBd4ZaX4TSDYM44CR55UuAH63m20zynXX3URZWTn
r1n3CBRecy/Ll+wJQVFTMK6+8xAUXXExcXBxNTY0MDg7ssC+v18fixUt58cXnOOGEtaPrn376SY45ZlY/KA4AACAASURBVM3oMK1vfOPb7L//gVvtf//99/DOO29x++334HZ7iEajvPLKS3i9Ph588Nejff3zn69w3XU3j+7X0FDP1VdfwQ9/eBNLl+5NNBrlzjtv5bvf/Q633Xb36HafHretrY0zzljLPvusZMGChds9n+9//xq+/vVLRmOtr68jKSmZb33rytFt1q5dM/oZ7sjGjRu44opv8N3v/oB99101GveGDetGtykpKeOFF57nxBNPHj1Xw5i3RT8lJWX87GcPA3DHHf/DHXfcyg03/HiHx97c3//+AqWlZbtcNAGjOdiZU045nZ///P9xzTXf3+VjiIiIyMwxPDSEv7aKzroNDAdqeXeoiH8HM8iLNHJp+nNELRtBWyah5CKCrkJOPWg5eUWFOFNm312kXTHW6chvB04EcoAXDMMIAmuAPxqGEQfEAR8BFwKYphk1DOPLwH2GYSQxMq347rRNpH+838yr7zVPdLcAHLw0j5Xzc8a8fWXlXFJSUmhubgQgOTmFkpJSXnvtX6xadQDPPPMXjjrqWD7++KMd9nPssZ/n179+eLRw6uvr5eWXX+Lhhx/d4X59fX389re/4sEHf4Xb7QE2Dek6+OCdj1H95S9/zrHHHs/SpXuP7nfhhZdxyinH8+67b7N48dIttvd6vRQWFtPa2rLDwikQaMXn840uFxYW7TSW7fnVrx7iuOOOHy2aAAoKCikoKBxdPvbYNTz77FOceOLJ9PX18f7773LYYUcyNLTt6euXL1/BXXfdvktxPPHEY1vcabv44q9hGPP4+OMPaWlpZu3aU/H5fPzxj7+jrS3AhRdexqGHHg7AAQcsH73buHbtGo466lhef/3fBINtnHbaGaN3u/bb7wBuvvl6+vp6SUlJ3WYcIiIiMrP0dXfT1NpObaed5qZWVjX/BrcVIt0WJR0YshzUxKey7/xKir2FtCcvJLu0nLLk5FiHPu2MdVa9S4FLt9G0dBvrPt3nn8CiiWybid566w2GhoYoKChi/XoTgGOOWcOf//wnVq7cnxdffJ577vnZ6JC57TnggIP5yU9upLq6itLSMv72t7+ycOFeZGf/p4j76U9v4f777xldvvXWO2lubiIhIZ6iopJdjn3jxg2cdda5W6xzOBxUVhps2LBuq8Kprq6Wrq7O0UJre8488xwuuuirLFy4FwsX7sXhh6+muHjX4wNYt+4TPve5w3a4TX5+PgkJCdTUVPPhh+9z4IGf2+6wv2g0yksv/Y3KyjHPMEk4HOb9999j/vwFW6wPBPzceef/IxQK8sUvnsApp5zOvff+nI8++oCrr/6v0cLpswYGBrjvvl/Q3NzEmWd+kaOPXkNKSgoOh4Oysjm89967rFy535jjExERkanPsiw6e4cIfPQ6/U1V2DsacA624LZ1sXGgnEf79sOZ7GCJ00mds4L4rBLcReX4Cks43rHt32tk18zKt2HuvyiX/RdNzvtsxvpOoGuuuZKEhERSU1O5/vqbtng2admy5fzkJzfy8ssvUVY2h4yMzJ32Fx8fzxFHHM3TTz/JRRddxlNPPcnatadusc22huo1NTWN8cy2ZlnWzjdiU8F2zz13UFdXw2WXfRuXy7XD7b/0pbNYvfoY3nzzdd544zXOPfcMbrnldpYsWTZpMR511LE888xf+PDD9/nmN/+Lv//9xS3aa2qqOPvs07Esi/Lyci655PIxx9DR0UF8vGOrZ5QOOeQw7HY7Xq+PjIxMDj74EAAMYx6BgJ/BwUESExO36u/ww48EIDc3D6cznUDAP1pYut0eAoHdel+0iIiIxFgkEiFQX0tH7XoG/TV09EV5rHMhXX3DXJPxGKVx3YSsdLoSc+jI3Jvigvn8xNiLzLQEbLaDYh3+jDUrC6epYEfP59hsNg499Ahuvvk6rrrq2jH3edxxx3P
55Rdx7LGfp66uhgMPPHin+5SWljE0NERdXS1FRcU73X5z5eUVfPjh+xx00OdG14XDYdatMzn11P+MsPy0YHvvvXe4/PKLWLJkGXPm7PjZJK/Xx+rVx7B69TEkJiby0ksvjqtw+nQ43OYxbsuhhx7BGWecjMfjoaysfKvCafNnnHZVYmLiNof9fTqxBWwa5pgwMjvNp3e7IpHINvtL2GwWG7vdTiQSHl0eGhraZrElIiIiU9PQQD8ttbVU9zup8/dQVv8kc8OfkGoLkwqELTvYC1g052CKsp1YKRdhL8inOD32M/LNNiqcpqjjjz+R5OTkLZ7N2Zk5c8rx+bK47rrvccQRR23xC/b2pKSkcMopp3Pzzdfzwx/eiMvlxrIs/vrXZ1mwYBH5+QXb3feMM87m618/h5Ur9xudHOLuu2+joKBwm0XOXnst4cQTT+GBB+7lRz+6Zbv9vvzyS+y33wE4HA4GBwepqakeUxG4Laed9mUuv/xCli1bzj77bJqAo66uhnXrTA4/fPUWn8NFF12Gx+Md13F2xOl04nK5aW5uIjc3b8L731xtbTXl5RWTegwREREZn57+YZo3mAzUvo+9s5Hk3kbcVgdObDzcfjqJifG4XR4aUpfi8BaTWTSHrJI5LE5IYHGsgxcVTlOVz5fFl7501i7vd9xxx/OTn9zIlVdevVXbZ59xOu+88znggIM5//yLePTRX3HJJecDm4a37bXXUlatOmCHxyoqKua6627mvvvuoqOjg0gkzJIly7juupu2u8+ZZ57DqaeewPr1JhUV235O6KWXXuSee24nISGRSCTM8uX77nS67+2pqKjkxhtv5f777+bHP76BpKSkkenIL9hq28MOO3Jcx7j//nt55JGHRpf/67+u2uqzO+igQ/j3v//FCSecNK5jjEVLy6YJT3Y206CIiIhMrmg0SqiliVD1evpbq3F0NvL7/pXUdcVxeNL7rEl5my4rlY6ELGrTF5KYVcIN81fgdadht43vj8Uy+WxjfQZkmigBqj/7HqeWllpycnZtGNp4jfUZJ5k69kTOmpoaufbaq7nvvl+MTg8/0e69904KCgo47rgTttm+J6+Dyab3lUw/ytn0o5xNL8pX7AwPDxGoraa+O47qkIXV+D6H9T9Lsm3TMP2oZSNky+Q1z+dJzy+jxGUn35vCnLklylmMbPYep1I2zeI9JrrjJLIH5OXlc+qpZxAMtuH1+na+wzh4vV6OOebzk9K3iIiIwMBQmMbGAP3mP4gG60jua8YTDeK0RXm7Zz/ejlay0OOkxTkfu6cIZ/4csssqKE1JoTTWwctuU+E0zZx77pe3mjRgwYKFXHHFVTGKaHz+9a9Xue++u7daf/75F+50iOBntbeHuPzyi7daf/DBh/CVr3x13DFOtO1NLz5RPjuLooiIiIxfZ1uAQNU6+po3Tf39wWAOfw0Vk2rr53rXX+i1Eml3ZFGfsYL4rBI+P2cRX83PJc5uB46IdfgyCVQ4TTPjndltqlm16oBdLpC2x+Vy8+CDv56QvkRERGR2iUQjBOvraQm0s743nbqWbk7qeACXrYfskW3aLSfZqdkcP7+Uomwnw85lZGVlk2O3xzR22bNUOImIiIjIrDAcjtLU1kvnR/+E1nUk9jbhiQRItoWJDGfxbO/R5HpSaXIuoiMjk7S8MrLnVFKUnkFRrIOXmFPhJCIiIiIzTm9nB60b19HbXI0VqiPa38NdHZ8jErX4atorVMS3EIzz0Zi+GIe3GF9RBXfPMYh32IEVsQ5fpiAVTiIiIiIybUWjUTpaW2irWc8n4ULqAr1U+F9gP/u7+AAf0GWl0BGfzdErCijMyaDItRcunwuvPS7W4cs0osIpBtauXUNCQgIJCYkALFu2N5de+i0eeOBeSkvLOOywI3nrrTcIh8OsWLFyTH3W1dVy/fXX0tnZSUZGBtdc830KC7d/U7muroavfOVLfOELJ3Pxxd8A4LLLLqSzswOASCRMdXUVDz74m61
eqGpZFjabjeuvv5arr752dDlWnnjiMf74x0dH4zj99DNZvfoYAILBNn784xtobm4iHA5z5pnnjLZti2VZfOMbF7JhwzqeeupFAH7/+9/y1FNPjG7T1NTImjXHc8kl35zcExMREZEtRKJRWkL9+KtM7DWvkdDdiCvsJ9U2SC7ws44TsGfmUOCuoDrRS0peGb5Sg3yvl3xgQaxPQKY1FU4xct11N231otLzzvvPS1nffvtN+vv7x1w43XLLjzjxxJNZvfoYnnvuaX784xu4/fZ7t7ltJBLh5ptv4MADP7fF+ttu+88sdy+//BL333/3VkUTwMMP/4KUlBQikQjPP/8Mn3zyEZde+q0xxbkza9eu4Q9/eHKX9ikoKOSOO/4f6enp+P2tfOUrp7PXXkvIzc3jjjtuZe7c+dx44//Q3t7OueeewZIly8jOztlmX3/846Pk5OSyYcO60XUnn3wqJ5+8aca6cDjMCScczRFHHDX+kxQREZGdGujto7V6Pd2NGzdN/d3bzB96lrF+0Mde8XWcmfYaQbsXf5qBzVNMev4cri03SE5OjnXoMkOpcJpCrr/+WubOnceSJXvz5z//iWg0yhtvvMZhhx3Jl7989nb3a28PsW7dJ9x6610AHH74am699Wba29txuVxbbf/IIw+y334H0t/fR39//zb7fOqpP3Pssdt+J9CZZ57D44//keeff4bs7JxtFk1vvfUGN998PQ888DBpaWlcf/21uN0evv71S8bwSeyaZcuWj36flZWNx+PF7/eTm5vHhg3r+eIXTwfA5XJRUVHJ3/72AqeddsZW/dTX1/Hii89z1VXX8uqr/7vNY/3jHy/j8XiYO3f+hJ+HiIjIbNUVasNftY6GngQ+6UxisLWGs60/4LaBG+izEgg5slhW4eXgknkU+ZbhdJ+OOz4h1qHLLKLCKUauuebK0aF6X//6Jey776rRtjlzyjn++BPp7+8fHUYH8O1vX8p5512w1S/tra2teL1ZxMVtGqcbFxeH1+vD72/dqnDasGE9r732f9x++708+OAD24wtFAryxhuv8Z3v/Pc22x9++EGSkpI48sijKS0t4847f7pFnLCpmDnqqGO58cYfsP/+B1FfX8eVV14zxk9n/N566w16enqYO3cuAIYxlxdeeJ65c+fT3NzEBx+8R25u3lb7RaNRbrrpOr75zStxOLZ/WTz11BPbLShFRERkxyzLItDRT31zB46Pn8bR1UjmUCvptj6ygff7F7Axfj9Ks3KpsR9Mck4J7pJKfDm5ZNvtzIv1CcisNmsLp74nf7TVOkfZChIWHIYVHqT/mf/Zqj2+8gDijQOJDnQz8Nc7t26ffygOY9VW67dlW0P1duaWW27fpe0/KxwOc9NN13HVVd8bLbK25Zln/sK+++63zbtVAGeccdboM05HHrn9YWtnnnkO3/jGhdx110954IFHtluQbP5S37a2AGefvekOUXZ2NjfddOuYz6+6uorrrvse3/ve9SQmJgFw8cWXc8cd/8PZZ59OdnYOy5btQ1zc1nH85jcPs2TJMioqDJqbm7bZf1tbG2+++TpXXXXtmGMSERGZrYaHBvFXb6SjfiPhtlqSepqoGUzn0e4VgMV1mW8wYE8hmFJC0FVIWm4Zh88x+EJm5kgPe8cyfJGtzNrCaSbJzs6mrc1PJBIhLi6OSCRCW1uArKzsLbZra2ujqamBK664DICenm4sy6K3t5crr7x6dLunn36SCy+8bLvH+3QiiKuvvnaL5c/q6emhtbWF+PgEuro6yMnZ9nNFm7/Ud+3aNTt9me1Xv3oWw8PDpKSkcPfdm+6a1dfXccUVl3HFFVexePGS0W1dLhf//d8/HF3+9rcvpaSkdKs+3333bTZsWM+zzz5FJBKhu7ubtWvX8NBDvyE1NQ3YVFCuWrU/maP/QxcRERGA3q4u/FXrCAbaeHeoiPrWHk4d/A35ce2kA4OWg2CclwxfDmftb1CU7cTjPpDERA21k+lj1hZ
OKWv+v+222RyJO2y3Jzl32D4RUlNTaWsLjGlbl8tNeXklL7zwHKtXH8MLLzxHRYWx1R2jnJyc0ZniAH72s/u2Gg74/vvv0tPTw8qV++32OfzoR9/nuONOYN68+Vx77dU88MAvSUlJ3e1+77//oS2WGxsb+OY3L+Eb3/g2q1btv0VbZ2cHqalpOBwO3nzzdaqqNnLddTdv1efNN/909Pvm5ibOO+/LW01S8cwzT3LJJZfvdvwiIiLTlWVZdPQMUdfazYD5T1ID75M+2ILL1o0XSIom8sjQlyjKTifg/RxWZjKuonJ8BcV4HZr6W6a3WVs4TXUHHXQIV199BWefffro5BDbe8YJ4IorruK6677HL37xAE6nk+9+9/ujbTva77OefvpJjjrq2B0O5RuL3/3u1wwODo4O6zvkkMO56abr+f73b9itfrflnnvuoKurgwceuI8HHrgP+M9zYx999CG33XYLdrudjIxMbrrpf0hK2jSM7/HH/0BbWxsXXHDhTo/x3nvv0NfXx4oVYxuKKSIiMt1FwhEC9TWE6tYz7K8loauR9HAbP2g/gWEcfD7ZZGlSK51JuXRk7ENybhm+skp+6ssa6WHJDvsXmW5slmXFOoaJVAJUB4M9RKP/Oa+Wllpycor3SAAOh51wOLpHjiUTY7bkbE9eB5PN53MSCHTHOgzZBcrZ9KOcTS+7m6/B/j5aqzbS1bCBDyIlbGiLUNrxbz6f9DoAYctO0OahLyWH9vLjyM3PpsCXSkpS/ESdwqyjayx27HYbHk8aQClQM9b9dMdJREREZBbp7huizt9DW2017toXSBtowW114LJZuICXBg4n3jMfZ/ne1CWVkFlYTlZJKS5N/S2znAonERERkRkoGo0SamokWLuegZZqHF2NZAy18kzfIv41WEmWvZOLM+roTMimK30hSTmleIoruDAvH7vdHuvwRaYcFU4iIiIi09zw0BCN600669bT1Ovgze4s/P4g16b+ijwgatkI2TJpTy5i4ZxK9itfQlFWGs6UL1AQ6+BFpgkVTiIiIiLTSF//IA1t/dT7e3CZj5HeU4vHCpFui5IOtA8XM+xcw+L5RVTb15KZV0xWWTmlycmxDl1kWps1hZNlWdt935DITDfDJoEREZk1OgJ+AlXr6G+uwtbZQPpAC12RBG7tOgaACzICDCek0Ow2wF2Iu6iClYUl7D869bcRu+BFZphZUTg5HAn09naRmpqu4klmnU0vOe7C4dBDvSIiU1UkGqGtvo5Q7Xr6A028FF5Cnb+Hk2zPsyShDoB2y0lXYg4RVwmXHbkXRdlOMtMOwWazaYY2kT1gVhROLpeP9vYAPT0dk34su91ONDrzp7aeSWZDzhyOBFwuX6zDEBERYGhggMbQAHX+XsLVr1MY/D/ckTZSbGFS2DT197OOOSwq8xCXdjRtGfFklVVSlJ4e69BFZrVZUTjFxTnwenP3yLH0F5/pRzkTEZHJ0t3VTWD9R/Q0V2Frrye1vwW31c4DXcfRHHGxMiVEQYqdxowlOLzFZBaVk1VSxlUJibEOXUQ+Y1YUTiIiIiKTKRqNEmppIlSznv6Waj4YLuKdYDLZ/Rs53/k3fECnlUJnQja16fM5ed+F5BUX4s08BLseIxCZFlQ4iYiIiOyCcHiYFn8HdaEwrY3NzGv4E66wnxTbELlA1ALTdhBz8ldQ5l1Ba2I5WaWVFHi8mvpbZBpT4SQiIiKyHf2Dw/jXfUhX40asYC3JfS14om28PWjwWN8+JDksFmQM0+qcj91diLOgnOzSCk5JTYl16CIywVQ4iYiIiACdbW0Eqkz6mqsI9lk8312JP9TH9zN/T4m9nz4rkZAji/qMFRTnL+CHlUvJ8aQQZz8s1qGLyB6gwklERERmlUg0QrC5ldpuB3Wt3eRVP0nxoInT1k/2yDaWlU++bymr5mfTnXQeKfm5+LJzyLbbYxq7iMSOCicRERGZsYbDUVpqqums/pBIWy1JvU24I21gwT0dpxJ
nt3OSK562lDLa3EWk5ZWRVVbB4oxMFo/2UhrDMxCRqUKFk4iIiMwIvV2dtG5cR09TFYTq+fPgSupCwxyV+BZHJr/PoOUgGOejKX0v4rzFfG/B3uRlOYl3HBLr0EVkGthp4WQYxi3ASUAJsMg0zQ9G1lcCDwEeIAicaZrm+slqExEREYFNU393+Fupb49SGxwi0vAee3e+gMvWjQ/wAd1WMgXpS5lfUUJZZj79mSfhKSzEa4+LdfgiMk2N5Y7T48BtwCufWX8vcJdpmo8YhnEGcB9w6CS2iYiIyCwTiUZpaQ3R+cnrDAdqSehuxBX2k2ob5KXug3lvuJhFbjvlSXl0ZBaQkluGr7SCPF8WX4l18CIyo+y0cDJN81UAwzBG1xmGkQUsA44YWfUb4E7DMHyAbaLbTNMMjPcERUREZHoY6O+jdeN6uhs3Eg3W8dFADn8P5ZAe7eS/Mx9j2LITtHvxpxnY3EWsmbOM80uKSEpwAGtiHb6IzHDjfcapEGg0TTMCYJpmxDCMppH1tkloU+EkIiIyg3SFgjQ3B6nqTqS+tYvDW3+Ox+rAbbNwA/1WAi2JqRyydG+KsirpTpqLr6QUd3xCrEMXkVlqRk4O4fGkxfT4Pp8zpseXXaecTT/K2fSjnE0/E5Ezy7JoDfXR8NY/6a37BFuojvTBFtJtfXQM5fH7nsPxuZJZnlrMYMZS0grLyaucR0lhAQs09fcu0TU2/Shn08t4C6d6IN8wjLiRO0NxQN7IetsktO2SYLCHaNQa56ntHp/PSSDQHZNjy/goZ9OPcjb9KGfTz3hyNjw0hL9mIx11Gwi31TLQ18cvO/elfzDCxc7nKHP4CdlchFJKCLoK8RZUcruxF2nJ8cCqLfoKBnsn8GxmPl1j049yFjt2u21cN1rGVTiZpuk3DOMd4DTgkZF/3/70WaTJaBMREZGpo6+7m9aqdWwY8lHn76G46a/sHX2XdFuUdGDIcuCPy2Ll/GyKsp140ueQnJtFWXJyrEMXERmXsUxHfjtwIpADvGAYRtA0zQXABcBDhmH8N9AOnLnZbpPRJiIiInuYZVl09AzRvHE94Zo3sXc04BxsxW3rwgvc1r4WkjNwu/OoS0wmMauYzOIKfAXFeBxxzIv1CYiITBCbZcVmSNskKQGqNVRPdoVyNv0oZ9OPcjY9RMIRAvU1hOrWQ6gBW6iOx/r2ZkOvk+UJG/ly2j8IWel0JeVgZRSQnFuKz1hCRkYaNpst1uHParrGph/lLHY2G6pXCtSMdb8ZOTmEiIiI7Nhgfz+tVRto6LGxoSOeoeYNfGHwMVJtYVKBsGUnaHOzsCCZFYUVFHsWEOc7jeK02E7AJCISKyqcREREZrie/mHqm4IMf/R3bO11pA204LY6cNks/tm3hH9by6jwuWhIWkq8r5iMwjnMX74YV9cQ5bEOXkRkilDhJCIiMkNEo1FCTY0Ea9cz0FqDo7OB6sFM/tSxEDtRbnK9QD9JdCZk05W+kKTsEg6bM58v5uSMDLXbb7SvhMREYChm5yIiMtWocBIREZmGhoeH8NdUE2gN8PFANnWt3Xyh+2Fy7e3kAVHLRsiWiSs9l1OWllOYnUZC5nI8mZkUxDp4EZFpSIWTiIjIFNc/GKbe30OP+W/iWj4kta8ZjxUk3RZlOJLG//aupSArjVbvCoacSaQXzCGrrJzS5BRKYx28iMgMocJJRERkCukI+AlUraO/uQpbZwPJA238qP0YLOycnPImSxPraHdkUZe+L/FZxbgLK7i7pAy73QYsj3X4IiIzlgonERGRGIhEI7TV1xGqXc8nwwVUtw1RFPwHqx2vkzOyTbvlpCsxh7X75ZGbl0WxdwUZ6Snk2O0xjV1EZDZS4SQiIjLJhsMRGgK9+GurSar+X5J6m/FEAqTYwqQAT/QcSV9mObbc+dQk+EjLLyOrrJKi9HQAFsY2fBERQYWTiIjIhOru6CBQtY6epo3Q3kBafzNP9yzkraESCuPauDj9fUJxPho
zluDwFpFRWM7lJeUkJCbEOnQREdkBFU4iIiLjEI1GaW9poa1mHU09dj7oziTU6uebjkfwAT6gy0qhMz6bRXPzWVm2kKKsFNyZJ+Gzx8U6fBER2UUqnERERHYiHInQEuqnrrWb5I+eIKm7Hnc4QIptkDygabCUlqTVFOZlUxW3mtTcYrJKK8n3eMmPdfAiIjIhVDiJiIhspr+3F3/VeroaN2IF60juayY0lMB93YcC8K2M9Tgcdlqd87C7C3EWlLOqtIJDUlNGelgUu+BFRGTSqHASEZFZq7OtjUC1SVdrM69F5lLb2sMXhp5gXkITbqDPSiTk8OHILuWrn5tPUVYa2e6DccRpqJ2IyGyjwklERGa8SDRCW8cAdf5eBja8htv/JpnDraTb+sgGfJaNP0SzyMvKZDD1MJoy4vCVVOLLziFbU3+LiAgqnEREZIYZGhzEX72RzvoNhNtqSeptwh1p48edawhF0zgwqZ6DUzoJpZQSchWSmldG1pxKbsjIjHXoIiIyhalwEhGRaau3qxN/1Tq6G6v5eDiPD4MJZHV9yFmp/0sGMGg5CMb5aEpfxElL55BbVEi+92DiHRpqJyIiu0aFk4iITHnRaJSOzj7q2gZobWwkv+ZJ0gdbcdm68QJe4N3hA3D59qG4ZBkN8fl4iivxFBbi1dTfIiIyAVQ4iYjIlBKORAjU1tBRt54hfy0J3Q24wgFeHajg6f6lJDHEf7kDdCbl0ZFZQEpuGb7SCr7sy4p16CIiMoOpcBIRkZgZ6O+jdeN6uhs34u+J8o+eEpoC3fzA+WsKbMOELTttdg/+tEpKKhZzVeXe5PtSSU48Ktahi4jILKPCSURE9oiuzi7qQ2HqWrtxb/wLWb0bcFsduG0WbiASySExo4yDlxbQnPAlvDm5+EpKccUnxDp0ERERFU4iIjKxLMsi0NxMaOPHDPprcHQ1kDnkx7Ki/KTjZAC+lNlLWqKbroxFJGWX4imtYFFOHotHp/6uiN0JiIiIbIMKJxERGbfhs9LNdAAAGgdJREFUoSH8NRvpqNtIuK2W58IrqA30cXTcvzg46RMilo2QLZOOtFIi6flcsXovCnMySEs+NNahi4iI7BIVTiIiMiZ93d00BAepbetnuPZdKoMv4bFCpNuipANDloOUJIOV8wvxph9NR+axZJWUU5acjM/nJBDojvUpiIiIjJsKJxER2YJlWXS0dxFY9x79LdXYOxpIH2zBbevi8e7D+GQ4n0WpfZSkplDnrCAhqwRXUTm+whIu1vuRRERkhlLhJCIyi0XCEQL1NYTqNjDsr+HjwWz+FfLgHPRzZcaTAISsdLoTc+jMXM5xq/bha6WlZKQmYLOdEuPoRURE9hwVTiIis8Rgfz9NzUFqu2w0NLezouERPNE2Um1hUoGwZacxbgWLyw2KfYW0JZSQXVZJsdMZ69BFRERiToWTiMgM1NM/TOvH79DbuAE66knrb8ZtdVA/XMQvew4mOdHBwvRU+p2FxPuKySicQ1bJHI5J2Hzq75JYhS8iIjLlqHASEZnGotEooeYm2mrWM9haTU9PP4/1LCHUNci30p+i1BGky0qlIyGL2vSF+PIMbpq3N96MJGy2g2IdvoiIyLShwklEZJoYHh7CX1dHTV8qda095NY/x/yh90i2DZEPRC0bLTYflQWfozA7DUfqeVh5WeS7PeTHOngREZFpToWTiMgU1D8Yprmmmt6qd7GCdST3N+OJBkm3RflV+6lE45JY7XbS4pyP3VOEs2AO2WUVGMkpGLEOXkREZAZS4SQiEmOdAT/+6vX0N1dh66jn6YFlrGtPYL9Ek1NS/02vlUi7I4v6jBUkZJVwzdwV5GS5sNttsQ5dRERk1lDhJCKyh0SiEdrq62noiFDdDgNNJod2P4nT1k/OyDbtlpOyTIt5C0spcZcz7DqBrKxscuz2mMYuIiIy26lwEhGZBMPhCA3N7fR+8k8ibTUk9TbjiQRIsYX5sHc5rwwvoNKTRFvqHIK
uItLyS8kqq6QoPYOiWAcvIiIiW1HhJCKym3o6O/BvXEdPUxW017NxIJOnQuXEWcPc7HqMYeIIxvlozFiCw1vEIaWLOK24mHiHHTg01uGLiIjIGKhwEhEZo2g0SntrCy1Nfjb0Z1DX2s2Rbb8kz9aGD/ABXVYKHcmLOGZVEUVZTobS5uHOy8Nrj4t1+CIiIrIbdrtwMgyjBhgY+QK40jTN5wzDWAncByQDNcAZpmn6R/YZV5uIyJ4SiUZpDvbR/skbRJs+JqGnCVfYT6ptEEc4kye6Pk+2O4VAmsGgcwkpuaVklRrke72a+ltERGQGmqg7TmtN0/zg0wXDMGzAI8DZpmm+ahjGNcCNwDnjbZugOEVEttLf24u/ej1dDRuxgnXE9QW5reMwwhGLM1JfZUlCLUG7B3/aXOyeItIKyrm7ciGJCXHAyliHLyIiInvAZA3VWw4MmKb56sjyvWy6e3TObrSJiOy2zmAbgap1rB/OojYwQF7rKxxs/Ru3DdxAn5VAuyOLI5f6yM/xUuRaSHq2C7cjPtahi4iISAxNVOH0q5G7Ra8CVwFFQO2njaZpthmGYTcMwz3eNtM0QxMUq4jMAlHLItDRT3NNNbaN/8LR1UjmcCvptj6ygUc6j6E3tQCXu5iaxCSSs0vwlBj4cnLIttuZG+sTEBERkSllIgqnA03TrDcMIxH4KXAn8NgE9DtuHk9aLA+Pz+eM6fFl1yln08/mORsaGKDONGmrWsdAcxXxXY0807uAd/tyKHe0cKHzH7TbXXQ659DjK8ZVXMmNCxfhzEiP4RnMPrrOph/lbHpRvqYf5Wx62e3CyTTN+pF/Bw3DuBt4ArgNKP50G8MwvIBlmmbIMIy68bTtSkzBYA/RqLU7pzVuPp+TQKA7JseW8VHOppferk56/XWsaxrA7EqlvaWJcyO/xmGLkgUMWg6CcT7mFWWwtHQuRb6lpLhPJDMpaYt+BoZgQHnfY3SdTT/K2fSifE0/ylns2O22cd1o2a3CyTCMVMBhmmbnyFC9U4F3gDeBZMMwDhh5XukC4Hcju423TURmEcuyaO8epK6lG9tHTxPX0YBzsAW3rZtkoGeggg9sB1OU5aYubhUJWcV4iivwFBThjYvDiPUJiIiIyIyyu3ecsoE/GoYRB8QBHwEXmqYZNQzjy8B9hmEkMTKtOMB420Rk5oqEI/jrauioW8+Qv5aE7gb8g0k81LUKgGsy3sBut9OdlEtn5gq85XNZlVXCUT7fSA/LYhe8iIiIzAo2y4rNkLZJUgJUa6ie7ArlbM8a6O+jtWo97S0tvDdcTG1rD8f1/oFyRwsAYctO0O6hPa2Mrso1FGWnke9OIiXlP0PtlLPpRzmbfpSz6UX5mn6Us9jZbKheKZtu1IzJZE1HLiJCd98Qda099Kx/g5SWt0kbaMFtdeC2WTgtOz/vO5OC7HQ68/enzmnHVViOr6QUV3xCrEMXERER2YIKJxHZbdFolGBTI8GadQy21uDoaiBjyM9NHcfSayVxRNJHHJhcR2dCNl0Zi0jKLsVTUsFPc/Ow2+1oqJ2IiIhMdSqcRGSXDA8P4a+uoqN+A+ZQDutCdrJCb3Fy4qvkA1HLRsiWSXtyEV+YX0BOYQGFWfvjTEmkINbBi4iIiIyTCicR2a6+gWEaAr0019fjqnqelL5mPFaIdFuUdOCf/QcQcS8ho2wRdfEe0gvKySotpzQ5Odahi4iIiEwoFU4iQjQapbOtjUCVSX9LNfaOBtIHW3ilv5y/DSzEaevnO5nraY/Pos5ZQUJWCa6ics4pLCHOERfr8EVEREQmnQonkVkmEokQqK+lvXY9rV0R3uzLo7G1g+8m/pJcWxSAkJVOd2I2c4or2KtiLwqznGSkHk2u3R7j6EVERERiQ4WTyAw2ODhMU6if2tZuUs2nyejaiDvaRqotTCrQP5xDd8qJLJyTTU38iWRm55JdVkGxMz3WoYuIiIhMKSqcRGaI7o4O/BtNepuqoKOetP4WIpE
IP+o8HoBz0puwEuw0ZizB4S0ms6icRSVlLEtIHOlhXuyCFxEREZniVDiJTDPRaJRQcxNtNesZaK3l5chS6gI9HBn+O6uSNgDQZaXSkZDFsKeQiw5ZQGFOOr6MQ7DZbDGOXkRERGR6UuEkMoUNDw/REuynLtBHf/U7FAT+gSvsJ8U2NDL1Nzxny6eiII8k52H4nYfhK6sg3+0lP9bBi4iIiMwgKpxEpoi+3j78VSbdDRuJhuo2Tf0dDfJQ9xFUhbNZnNRGceowrc752D1FOAvmkF1awZUpKbEOXURERGTGU+EkEgOdgQD+6nX0N1fxyVA2b4ecpHTX8o30Z/EAfVYiIUcW9RkrOGbZQnJKy8h2JxOnWe1EREREYkKFk8gkikQjBAKd1IWGaWwKUln7O1zDrTht/eSMbFMVXUFB1oEUz19Gc3wu3uIKfNk5ZKtIEhEREZkyVDiJTJDhcISWjZ/QWbeRSFsNSb3NeCIBPhgq5te9+xNnh3mZAwRTygi6i0jLKyOrrIITMjI366UyZvGLiIiIyPapcBIZh56uTvwb19HTuJGO7gH+2jeP5rY+rkx/nOK4TgYtB8E4H40Zi/Flz+N78/Yhz5tKvOPQWIcuIiIiIuOgwklkB6LRKO2BNuo6bdT5e3BXP0dx34e4bN34AB/QHHXj8SxlSbmXweQz6M/x4SkoxGuPi3X4IiIiIjJBVDiJjIhEo7TUN9BZ9SFD/loSexpxhf0kMcTd7acTJY7PuyEzKY8OVyEpuaX4Sg0qvd7NBtjNieEZiIiIiMhkUeEks9JAbx+t1evpatiAvbORZ/r2Yl0bHBj/PiekvMmwZSdo9+JPm4vdU8SVRy2mMNdDYoKG2omIiIjMRiqcZMbrCgapD/ZTG4rQX/8xy9qfxW114rZZuIF+K4Gs1HIK966gLDOPnozV+IpLcDsSYh26iIiIiEwRKpxkxohaFoG2Tto/eZPB1moc3Y1kDrWSbuvjH72r+L/BCirSo8xN9tCdsZiknBI8JZXMW1RJVrA31uGLiIiIyBSmwkmmpaHBIfw1G+ms30C4rZaNAy5eCBVgH+7jR65HiVg2QjYXoZQSQq5CDildypfmzCE1KR44Zou+7HpfkoiIiIjshAonmfJ6u7pobmqlqjuJ+pYu9m9+mGzLT4YtSgYwaDkIxS9mv4XLKcp20pFURHZxGZlJybEOXURERERmCBVOMmVYlkV79yD+T95hsMnE3tFA+mALbls3fWEvv+06hvTUBOY5cxhMLSMhuwR3cQXegiIOidt86u+8mJ2DiIiIiMxMKpwkJiLhCP76Gtpr1zPsryXSG+LBrgPo6R/m3LS/s1dCPSErna6kXDoz9yEpt5xb5+9NRloicECswxcRERGRWUaFk0y6gf4+Wqs2UDWYSb2/D2/jy+wbfo00W5g0IGzZCdo97F2eSUGOi5z0MuJyvRSnpcU6dBERERERQIWTTLDuviEaq2sZqnodW3s9qQMteKwO3DaLezs+T3e8lwM8HhrSlhHvLSKzqBxfcRmuhATKYx28iIiIiMh2qHCScYlGowSbGgnVrqe/pYb4rgb+2r+Q97oyWRDfwNecf6PTSqUzIZvajIUkZpfyrcoleLwubDZbrMMXEREREdklKpxkp4aHh/BXV9HYEWFDVzxdzfWc2PcoybYh8oCoZSNky6Tc52Du3uUUe+eD+zgKXG4KYh28iIiIiMgEUOEkW+gfDFPf2sXABy9ihepJ7WvGYwVJt0V5o38+L4dXUOJNpcW5ALu3iIyCOWSVllOanEJprIMXEREREZkkKpxmKcuy6AgECFSb9DdXY+9soHkgid+2LwEsrst8EbsN2uOzqHPuS0JWCfuWzmVNQRF2uw1YGetTEBERERHZY1Q4zQKRSIRAfS2BpmY+Gcymzt/DYaFHmRPXTO7INu1WOn0pFXxhURlFWWkkZy4lw51Jjt0e09hFRERERKYCFU4zzNBwhMa2XtrNt4hreo+k3iY80TZSbWF
s0QRu7zqVfG8a7Z7F1KQtxZk/B19pBUXp6RTFOngRERERkSlKhdM01t3Rgb/KpLepCtrrSe1v5SftqxmwHByX/BYHJpmE4nw0ZizB4S0ms6icu8sqiXfEAStiHb6IiIiIyLShwmkaiEajhJqbCdauY8Ogj43BKJ7A63w+7hWyRrbpslLoSMjm2OVZ5BTkUejZG487DZ89Lqaxi4iIiIjMBCqcpphwJEpLsI+munoSN75EQk8j7nCAFNsgecCzPQfjd87Dl11OdXwiqblz8JVVkO/2kg8siPUJiIiIiIjMQFOycDIMoxJ4CPAAQeBM0zTXxzaqidfX04O/ej3dDVVEQ7Wk9LXwcm85/xgox2vv4jsZbxC0e2h1zsPuKcJZMIfzyypISk6JdegiIiIiIrPKlCycgHuBu0zTfMQwjDOA+4BDYxzTbukMBPBXr6O1a5j3ery0tIb4Jr/AY7PwAH1WIu0OHxVlOSyYM5/CrDTSXcfidsTHOnQRERERkVlvyhVOhmFkAcuAI0ZW/Qa40zAMn2magdhFtmsi0Sj1f/s9w83rSO1rxmnrIwcIDuVTE3csRdkuqhxHkubLw1tSgS87h2y7nbmxDlxERERERLYy5QonoBBoNE0zAmCaZsQwjKaR9dOmcHrlvWac5ts47YO0ppQQ9BaTllfGorIKVmZkjmy1KKYxioiIiIjI2EzFwmm3eTxpMT2+z+fkuIPKqa/8AaX5LuIdeonsVOfzOWMdguwi5Wz6Uc6mH+VselG+ph/lbHqZioVTPZBvGEbcyN2mOCBvZP2YBIM9RKPWpAW4Iz6fk0CgGwBXSgId7b0xiUPGbvOcyfSgnE0/ytn0o5xNL8rX9KOcxY7dbhvXjZYpdyvENE0/8A5w2siq04C3p9PzTSIiIiIiMrNMxTtOABcADxmG8d9AO3BmjOMREREREZFZbEoWTqZpfgLsG+s4REREREREYIoWTrshDjaNW4ylWB9fdp1yNv0oZ9OPcjb9KGfTi/I1/ShnsbHZ5x63K/vZLCs2kyhMkgOAV2IdhIiIiIiITHkHAq+OdeOZVjglAvsAzUAkxrGIiIiIiMjUEwfkAq8Dg2PdaaYVTiIiIiIiIhNuyk1HLiIiIiIiMtWocBIREREREdkJFU4iIiIiIiI7ocJJRERERERkJ1Q4iYiIiIiI7IQKJxERERERkZ1Q4SQiIiIiIrITjlgHMJMYhlEJPAR4gCBwpmma62Mb1exjGEYNMDDyBXClaZrPGYaxErgPSAZqgDNM0/SP7DOuNhkfwzBuAU4CSoBFpml+MLJ+u9fQZLTJ2O0gZzVs43obadM1FyOGYXiAh4E5bHq54wbgfNM0A5ORF+Vs9+0kZxbwPhAd2fzLpmm+P7LfGuDHbPqd7k3gK6Zp9u1Om4ydYRiPA6Vsyk0PcIlpmu/o59nMpDtOE+te4C7TNCuBu9j0Q0RiY61pmktGvp4zDMMGPAJcNJKfl4EbAcbbJrvlceAgoPYz63d0DU1Gm4zd9nIGn7neYPzXla65CWMBN5umaZimuRewEbhxMvKinE2YbeZss/b9NrvOPi2a0oD7gTWmaZYD3cC3d6dNdtlZpmkuNk1zKXAL8POR9fp5NgOpcJoghmFkAcuA34ys+g2wzDAMX+yiks0sBwZM03x1ZPle4JTdbJNxMk3zVdM06zdft6NraDLaJuvcZqpt5WwndM3FkGmaIdM0X9ps1f8BxUxOXpSzCbCDnO3I0cAbm911uBf44m62yS4wTbNzs8UMIKqfZzOXCqeJUwg0mqYZARj5t2lkvex5vzIM4z3DMO42DCMTKGKzv5SbptkG2A3DcO9Gm0ysHV1Dk9EmE+ez1xvompsyDMOwA18HnmBy8qKcTbDP5OxTLxmG8Y5hGD8yDCNxZN0Wnz1Qx3/+/zbeNtlFhmE8YBhGHXA9cBb6eTZjqXCSmehA0zQXA/sANuDOGMcjMpPpepv67mDTsxfKzfTx2ZwVmaa5nE3DZecD341
VYLI10zTPM02zCLiKTc+NyQylwmni1AP5hmHEAYz8mzeyXvagT4cTmaY5CNwN7M+mv6aNDnkwDMMLWKZphnajTSbWjq6hyWiTCbCd6w10zU0JI5N6VABfNE0zyuTkRTmbQNvI2ebXWRfwANu5zth0J6l+N9tknEzTfBg4BGhAP89mJBVOE2Rk9qB3gNNGVp0GvG2aZiB2Uc0+hmGkGoaRMfK9DTiVTXl5E0g2DOOAkU0vAH438v1422QC7egamoy2yT+jmW8H1xvomos5wzCuB/YGThgpbGFy8qKcTZBt5cwwDJdhGMkj3zuAtfznOnsW2McwjIqR5c0/+/G2yRgZhpFmGEbhZstrgBCgn2czlM2yrFjHMGMYhjGXTdNEuoB2Nk0TacY2qtnFMIwy4I9A3MjXR8Clpmk2G4axH5tmoEniP9Plto7sN642GR/DMG4HTgRygDYgaJrmgh1dQ5PRJmO3rZwBa9jO9fb/t3c3IVbVYRzHv2FOC0ccYkjISojsCYteFkFW0KawJBhCspdRMQhSsii12hi9gBE2YS9QRpTVwsWAkBAULcykxoWQBEY8IeVQkdWY5ns107Q4Z2KanDm3O3NxZvp+Vpf7/5/nPNzLcPnN/5z/KY/xb+40iYhLgT3AV8CJ8u1vMvO2RnwvfmejN9x3Bqyn+Gz7galAF/BQZh4tj2sr50wBdgPLMvPYaMZUm4iYCWwFpgF9FKFpTWZ+5u/Z5GRwkiRJkqQKXqonSZIkSRUMTpIkSZJUweAkSZIkSRUMTpIkSZJUweAkSZIkSRUMTpKkcSkiNkbE4yOM90fERWN8zvaI+HAsa0qSJge3I5ckNVxE3Ak8DFwGHKN4Ps3bwKuZWdcPUUT0A3Myc+8pxrYD1wC9wElgB3D/wDOmxkJELAPuzczrq+ZKkiY+V5wkSQ0VEauBF4HnKB6gOxNYDlwHNA1zzJQxOPXKzGwGLgZagA1jUFOS9D915uluQJI0eUXEDOBpiifcbxk0tBtoHzTvLeAEMBu4AWiLiMXAd5m5tpzzCLAK6AfW1tpDZv4SEVuAFYN6ehm4BTgOvA48k5l/Dl1FKle1VgCrgVZgM7ASuATYCEyNiKNAb2a2RMQCoAM4HzgMbMjMjlp7lSSNX644SZIaaR5wFrC1hrl3A+uA6cAngwci4mZgDXATMAe4sdYGIqIVWEgR1qAITTOACylC2lLgnhFK3ApcDVwBLALmZ+aXFKtmOzOzOTNbyrlvAPdl5nSKyxK31dqnJGl8c8VJktRIrUBPZvYOvBERXcBcikA1PzN3lENbM/PT8vXJiBhcZxGwKTP3lDWeBO6qOPdLEdFBcU/VdmBVeQngHcBVmXkEOBIRzwNLKELPqTybmYeAQxHxEXAl8MEwc/8A5kbE55l5EDhY0aMkaYJwxUmS1EgHgNaI+PsfdZl5bblCc4B//g59O0Kdc4eMd9dw7gczsyUzZ2Vme2b+TBHkmoYc3w3MGqHO/kGvjwPNI8xdCCwAuiPi44iYV0OfkqQJwOAkSWqkncBvQFsNc0faXe8HivuGBlxQZz89FKtCs4fU+r6OWv/qNzN3ZWYbcA7wLtBZT5OSpPHHS/UkSQ2TmYci4inglYg4g+ISt+PA5cC0/1CqE9gUEe8A+4An6uynLyI6gXURsRQ4m2LDiXo2cPgROC8imjLz94hoAm4H3svMXyPiMNBXT5+SpPHHFSdJUkNl5nqKcPIo8BNF4HgNeAzoqrHG+8ALFJst7GV0my48QHHf09cUm1BsBt6so8424Atgf0T0lO8tAfaVoWk5sHgUfUqSxhEfgCtJkiRJFVxxkiRJkqQKBidJkiRJqmBwkiRJkqQKBidJkiRJqmBwkiRJkqQKBidJkiRJqmBwkiRJkqQKBidJkiRJqmBwkiRJkqQKfwEuKolBEyPWdwAAAABJRU5ErkJggg==\n",
+      "text/plain": [
+       "<Figure size 1008x432 with 2 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
      "output_type": "display_data"
     }
    ],
    "source": [
     "fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n",
-    "df_vldvst.set_index(\"nx\")[\"Vector Loads / Loop Iteration\"].plot(ax=ax1, legend=True);\n",
-    "df_vldvst.set_index(\"nx\")[\"Vector Stores / Loop Iteration\"].plot(ax=ax2, legend=True);"
+    "for ax, pmu_counter in zip([ax1, ax2], [\"PM_VECTOR_LD_CMPL (min)\", \"PM_VECTOR_ST_CMPL (min)\"]):\n",
+    "    df_vldvst.set_index(\"Grid Points\")[pmu_counter].plot(ax=ax, legend=True);\n",
+    "    ax.plot(\n",
+    "        df_vldvst[\"Grid Points\"], \n",
+    "        linear_function(df[\"Grid Points\"], *fit_parameters[pmu_counter]), \n",
+    "        linestyle=\"--\", \n",
+    "        label=\"Fit: {:.2f} * x + {:.2f}\".format(*fit_parameters[pmu_counter])\n",
+    "    )\n",
+    "    ax.legend();"
    ]
   },
   {
@@ -3038,46 +3213,66 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 83,
+   "execution_count": 37,
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "image/png": "\n",
+      "image/png": "\n",
       "text/plain": [
        "<Figure size 1008x432 with 1 Axes>"
       ]
      },
-     "metadata": {},
+     "metadata": {
+      "needs_background": "light"
+     },
      "output_type": "display_data"
     }
    ],
    "source": [
     "df_byte = pd.DataFrame()\n",
-    "df_byte[\"Loads / Loop Iteration\"] = (df_vldvst.set_index(\"nx\")[\"Vector Loads / Loop Iteration\"] + df_ldst.set_index(\"nx\")[\"Loads / Loop Iteration\"])*8\n",
-    "df_byte[\"Stores / Loop Iteration\"] = (df_vldvst.set_index(\"nx\")[\"Vector Stores / Loop Iteration\"] + df_ldst.set_index(\"nx\")[\"Stores / Loop Iteration\"])*8\n",
+    "df_byte[\"Loads\"]  = (df_vldvst.set_index(\"Grid Points\")[\"PM_VECTOR_LD_CMPL (min)\"] + df_ldst.set_index(\"Grid Points\")[\"PM_LD_CMPL (min)\"])*8\n",
+    "df_byte[\"Stores\"] = (df_vldvst.set_index(\"Grid Points\")[\"PM_VECTOR_ST_CMPL (min)\"] + df_ldst.set_index(\"Grid Points\")[\"PM_ST_CMPL (min)\"])*8\n",
     "ax = df_byte.plot()\n",
-    "ax.set_ylabel(\"Bytes / Loop Iteration\");"
+    "ax.set_ylabel(\"Bytes\");"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's quantify the difference by, again, fitting a linear function to the data."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 38,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Mean byte loaded: 37.52662546714877\tMean byte stored: 8.428951320998907\n"
+      "Counter  Loads is proportional to the grid points (nx*ny) by a factor of 37.5010 (± 0.000592)\n",
+      "Counter Stores is proportional to the grid points (nx*ny) by a factor of  8.4379 (± 0.000247)\n"
      ]
     }
    ],
    "source": [
-    "import numpy as np\n",
-    "mean_byte_ld = np.polyfit(df_byte[df_byte.index > 200].index, df_byte[df_byte.index > 200][\"Loads / Loop Iteration\"], 0)[0]\n",
-    "mean_byte_st = np.polyfit(df_byte[df_byte.index > 200].index, df_byte[df_byte.index > 200][\"Stores / Loop Iteration\"], 0)[0]\n",
-    "print(\"Mean byte loaded: {}\\tMean byte stored: {}\".format(mean_byte_ld, mean_byte_st))"
+    "_fit, _cov = common.print_and_return_fit(\n",
+    "    [\"Loads\", \"Stores\"], \n",
+    "    df_byte, \n",
+    "    linear_function\n",
+    ")\n",
+    "fit_parameters = {**fit_parameters, **_fit}\n",
+    "fit_covariance = {**fit_covariance, **_cov}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Analogously to the proportionality factors, this is how many bytes are loaded/stored per grid point."
    ]
   },
   {
@@ -3089,34 +3284,36 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 50,
    "metadata": {},
    "outputs": [],
    "source": [
     "df_bandwidth = pd.DataFrame()\n",
-    "df_bandwidth[\"Bandwidth / Byte/Cycle\"] = (df_byte[\"Loads / Loop Iteration\"] + df_byte[\"Stores / Loop Iteration\"]) / df.set_index(\"nx\")[\"Cycles / Loop Iteration\"]"
+    "df_bandwidth[\"Bandwidth / Byte/Cycle\"] = (df_byte[\"Loads\"] + df_byte[\"Stores\"]) / df.set_index(\"Grid Points\")[\"PM_RUN_CYC (min)\"]"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Let's display it as a function of `nx`. And also compare it to the available L1 cache bandwidth in a second (sub-)plot. Non-interactive users, call `make graph_task2c`."
+    "Let's display it as a function of grid points. And also compare it to the available L1 cache bandwidth in a second (sub-)plot. Non-interactive users, call `make graph_task2c`."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 51,
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "image/png": "\n",
+      "image/png": "\n",
       "text/plain": [
        "<Figure size 1008x432 with 2 Axes>"
       ]
      },
-     "metadata": {},
+     "metadata": {
+      "needs_background": "light"
+     },
      "output_type": "display_data"
     }
    ],
@@ -3146,7 +3343,7 @@
     "If you still have time, feel free to work on the following extended task.\n",
     "\n",
     "\n",
-    "**TASK**: Please measure counters for _vectorized_ floating point operations and _scalar_ floating point operations. The two counters can also not be measured during the same run. So please see the TODOs in [`poisson2d.sflops.c`](/edit/Tasks/poisson2d.sflops.c) and [`poisson2d.vflops.c`](/edit/Tasks/poisson2d.vflops.c). By now you should be able to find out the names of the counters by yourself (*Hint: they include the words scalar and vector…*).\n",
+    "**TASK**: Please measure counters for _vectorized_ floating point operations and _scalar_ floating point operations. The two counters also cannot be measured during the same run. So please see the TODOs in [`poisson2d.sflops.c`](/edit/Tasks/poisson2d.sflops.c) and [`poisson2d.vflops.c`](/edit/Tasks/poisson2d.vflops.c). By now you should be able to find out the names of the counters by yourself (*Hint: they include the words »scalar« and »vector«…*).\n",
     "\n",
     "As usual, compile, test, and bench-run your program.\n",
     "\n",
@@ -3155,15 +3352,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "bsub -W 60 -nnodes 1 -Is -P GEN110 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.sflop.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.sflop.bin.csv\n",
-      "Job <4299> is submitted to default queue <batch>.\n",
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.sflop.bin /gpfs/wolf/trn003/scratch/aherten//poisson2d.sflop.bin.csv\n",
+      "Job <24645> is submitted to default queue <batch>.\n",
       "<<Waiting for dispatch ...>>\n",
       "<<Starting on login1>>\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
@@ -3177,7 +3374,7 @@
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,20,0.0013,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,24,0.0014,0,0,0\n",
+      "200,32,24,0.0013,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,28,0.0014,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
@@ -3193,21 +3390,21 @@
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,52,0.0018,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,56,0.0019,0,0,0\n",
+      "200,32,56,0.0022,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,60,0.0020,0,0,0\n",
+      "200,32,60,0.0019,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,64,0.0021,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,68,0.0022,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,72,0.0022,0,0,0\n",
+      "200,32,72,0.0021,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,76,0.0022,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,80,0.0023,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,84,0.0024,0,0,0\n",
+      "200,32,84,0.0025,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,88,0.0024,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
@@ -3215,39 +3412,39 @@
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,96,0.0025,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,100,0.0028,0,0,0\n",
+      "200,32,100,0.0026,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,104,0.0027,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,108,0.0027,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,112,0.0029,0,0,0\n",
+      "200,32,112,0.0028,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,116,0.0028,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,120,0.0029,0,0,0\n",
+      "200,32,120,0.0031,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,124,0.0030,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,128,0.0031,0,0,0\n",
+      "200,32,128,0.0030,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,132,0.0031,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,136,0.0032,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,140,0.0033,0,0,0\n",
+      "200,32,140,0.0032,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,144,0.0034,0,0,0\n",
+      "200,32,144,0.0033,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,148,0.0034,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,152,0.0034,0,0,0\n",
+      "200,32,152,0.0035,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,156,0.0035,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,160,0.0036,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,164,0.0037,0,0,0\n",
+      "200,32,164,0.0036,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,168,0.0037,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
@@ -3257,13 +3454,13 @@
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,180,0.0039,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,184,0.0039,0,0,0\n",
+      "200,32,184,0.0040,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,188,0.0040,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,192,0.0041,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,196,0.0041,0,0,0\n",
+      "200,32,196,0.0042,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,200,0.0042,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
@@ -3275,9 +3472,9 @@
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,216,0.0045,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,220,0.0046,0,0,0\n",
+      "200,32,220,0.0045,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,224,0.0047,0,0,0\n",
+      "200,32,224,0.0046,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,228,0.0047,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
@@ -3289,97 +3486,91 @@
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,244,0.0049,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,248,0.0050,0,0,0\n",
+      "200,32,248,0.0051,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,252,0.0050,0,0,0\n",
+      "200,32,252,0.0051,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,256,0.0051,0,0,0\n",
+      "200,32,256,0.0053,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,260,0.0052,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,264,0.0053,0,0,0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,264,0.0053,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,268,0.0054,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,272,0.0055,0,0,0\n",
+      "200,32,272,0.0054,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,276,0.0055,0,0,0\n",
+      "200,32,276,0.0054,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,280,0.0055,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,284,0.0056,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,288,0.0057,0,0,0\n",
+      "200,32,288,0.0056,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,292,0.0057,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,296,0.0058,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,300,0.0059,0,0,0\n",
+      "200,32,300,0.0058,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,304,0.0059,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,308,0.0059,0,0,0\n",
+      "200,32,308,0.0060,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,312,0.0060,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,316,0.0061,0,0,0\n",
+      "200,32,316,0.0062,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,320,0.0061,0,0,0\n",
+      "200,32,320,0.0062,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,324,0.0062,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,328,0.0063,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,332,0.0065,0,0,0\n",
+      "200,32,332,0.0064,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,336,0.0064,0,0,0\n",
+      "200,32,336,0.0065,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,340,0.0065,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,344,0.0065,0,0,0\n",
+      "200,32,344,0.0066,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,348,0.0066,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,352,0.0067,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,356,0.0067,0,0,0\n",
+      "200,32,356,0.0068,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,360,0.0068,0,0,0\n",
+      "200,32,360,0.0069,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,364,0.0069,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,368,0.0070,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,372,0.0070,0,0,0\n",
+      "200,32,372,0.0072,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,376,0.0071,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,380,0.0072,0,0,0\n",
+      "200,32,380,0.0071,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,384,0.0072,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,388,0.0072,0,0,0\n",
+      "200,32,388,0.0073,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,392,0.0075,0,0,0\n",
+      "200,32,392,0.0074,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,396,0.0074,0,0,0\n",
+      "200,32,396,0.0076,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,400,0.0075,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,404,0.0075,0,0,0\n",
+      "200,32,404,0.0076,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,408,0.0076,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,412,0.0077,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,416,0.0077,0,0,0\n",
+      "200,32,416,0.0078,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,420,0.0078,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
@@ -3389,27 +3580,27 @@
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,432,0.0080,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,436,0.0080,0,0,0\n",
+      "200,32,436,0.0081,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,440,0.0081,0,0,0\n",
+      "200,32,440,0.0082,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,444,0.0083,0,0,0\n",
+      "200,32,444,0.0082,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,448,0.0084,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,452,0.0084,0,0,0\n",
+      "200,32,452,0.0083,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,456,0.0084,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,460,0.0085,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,464,0.0086,0,0,0\n",
+      "200,32,464,0.0085,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,468,0.0086,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,472,0.0088,0,0,0\n",
+      "200,32,472,0.0087,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,476,0.0087,0,0,0\n",
+      "200,32,476,0.0089,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,480,0.0088,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
@@ -3419,7 +3610,7 @@
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,492,0.0090,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,496,0.0090,0,0,0\n",
+      "200,32,496,0.0091,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,500,0.0092,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
@@ -3427,278 +3618,266 @@
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,508,0.0093,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,512,0.0092,0,0,0\n",
+      "200,32,512,0.0094,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,516,0.0093,0,0,0\n",
+      "200,32,516,0.0094,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,520,0.0094,0,0,0\n",
+      "200,32,520,0.0095,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,524,0.0094,0,0,0\n",
+      "200,32,524,0.0096,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,528,0.0094,0,0,0\n",
+      "200,32,528,0.0096,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,532,0.0095,0,0,0\n",
+      "200,32,532,0.0098,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,536,0.0096,0,0,0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,536,0.0097,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
       "200,32,540,0.0098,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,544,0.0097,0,0,0\n",
+      "200,32,544,0.0099,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,548,0.0098,0,0,0\n",
+      "200,32,548,0.0100,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,552,0.0099,0,0,0\n",
+      "200,32,552,0.0101,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,556,0.0099,0,0,0\n",
+      "200,32,556,0.0101,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,560,0.0100,0,0,0\n",
+      "200,32,560,0.0102,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,564,0.0102,0,0,0\n",
+      "200,32,564,0.0103,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,568,0.0102,0,0,0\n",
+      "200,32,568,0.0104,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,572,0.0103,0,0,0\n",
+      "200,32,572,0.0105,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,576,0.0103,0,0,0\n",
+      "200,32,576,0.0105,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,580,0.0105,0,0,0\n",
+      "200,32,580,0.0106,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,584,0.0104,0,0,0\n",
+      "200,32,584,0.0107,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,588,0.0106,0,0,0\n",
+      "200,32,588,0.0107,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,592,0.0107,0,0,0\n",
+      "200,32,592,0.0108,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,596,0.0106,0,0,0\n",
+      "200,32,596,0.0109,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,600,0.0107,0,0,0\n",
+      "200,32,600,0.0110,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,604,0.0109,0,0,0\n",
+      "200,32,604,0.0111,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,608,0.0109,0,0,0\n",
+      "200,32,608,0.0111,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,612,0.0109,0,0,0\n",
+      "200,32,612,0.0112,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,616,0.0110,0,0,0\n",
+      "200,32,616,0.0112,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,620,0.0117,0,0,0\n",
+      "200,32,620,0.0113,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,624,0.0112,0,0,0\n",
+      "200,32,624,0.0114,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,628,0.0111,0,0,0\n",
+      "200,32,628,0.0115,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,632,0.0112,0,0,0\n",
+      "200,32,632,0.0115,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,636,0.0113,0,0,0\n",
+      "200,32,636,0.0115,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,640,0.0115,0,0,0\n",
+      "200,32,640,0.0116,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,644,0.0114,0,0,0\n",
+      "200,32,644,0.0118,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,648,0.0115,0,0,0\n",
+      "200,32,648,0.0117,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,652,0.0116,0,0,0\n",
+      "200,32,652,0.0119,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,656,0.0117,0,0,0\n",
+      "200,32,656,0.0119,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,660,0.0117,0,0,0\n",
+      "200,32,660,0.0121,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,664,0.0118,0,0,0\n",
+      "200,32,664,0.0120,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,668,0.0119,0,0,0\n",
+      "200,32,668,0.0122,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,672,0.0119,0,0,0\n",
+      "200,32,672,0.0121,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,676,0.0119,0,0,0\n",
+      "200,32,676,0.0124,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,680,0.0120,0,0,0\n",
+      "200,32,680,0.0123,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,684,0.0121,0,0,0\n",
+      "200,32,684,0.0125,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,688,0.0122,0,0,0\n",
+      "200,32,688,0.0124,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,692,0.0122,0,0,0\n",
+      "200,32,692,0.0125,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,696,0.0123,0,0,0\n",
+      "200,32,696,0.0126,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,700,0.0124,0,0,0\n",
+      "200,32,700,0.0127,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,704,0.0124,0,0,0\n",
+      "200,32,704,0.0126,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,708,0.0125,0,0,0\n",
+      "200,32,708,0.0127,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,712,0.0125,0,0,0\n",
+      "200,32,712,0.0129,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,716,0.0126,0,0,0\n",
+      "200,32,716,0.0128,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,720,0.0126,0,0,0\n",
+      "200,32,720,0.0129,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,724,0.0127,0,0,0\n",
+      "200,32,724,0.0132,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,728,0.0128,0,0,0\n",
+      "200,32,728,0.0131,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,732,0.0128,0,0,0\n",
+      "200,32,732,0.0131,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,736,0.0129,0,0,0\n",
+      "200,32,736,0.0133,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,740,0.0130,0,0,0\n",
+      "200,32,740,0.0133,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,744,0.0130,0,0,0\n",
+      "200,32,744,0.0133,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,748,0.0131,0,0,0\n",
+      "200,32,748,0.0134,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,752,0.0131,0,0,0\n",
+      "200,32,752,0.0136,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,756,0.0132,0,0,0\n",
+      "200,32,756,0.0136,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,760,0.0133,0,0,0\n",
+      "200,32,760,0.0136,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,764,0.0134,0,0,0\n",
+      "200,32,764,0.0136,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,768,0.0134,0,0,0\n",
+      "200,32,768,0.0138,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,772,0.0136,0,0,0\n",
+      "200,32,772,0.0138,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,776,0.0136,0,0,0\n",
+      "200,32,776,0.0139,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,780,0.0136,0,0,0\n",
+      "200,32,780,0.0139,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,784,0.0137,0,0,0\n",
+      "200,32,784,0.0140,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,788,0.0138,0,0,0\n",
+      "200,32,788,0.0140,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,792,0.0139,0,0,0\n",
+      "200,32,792,0.0141,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,796,0.0139,0,0,0\n",
+      "200,32,796,0.0142,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,800,0.0140,0,0,0\n",
+      "200,32,800,0.0143,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,804,0.0141,0,0,0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,804,0.0143,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,808,0.0142,0,0,0\n",
+      "200,32,808,0.0144,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,812,0.0142,0,0,0\n",
+      "200,32,812,0.0144,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,816,0.0143,0,0,0\n",
+      "200,32,816,0.0145,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,820,0.0143,0,0,0\n",
+      "200,32,820,0.0146,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,824,0.0144,0,0,0\n",
+      "200,32,824,0.0148,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,828,0.0145,0,0,0\n",
+      "200,32,828,0.0147,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,832,0.0145,0,0,0\n",
+      "200,32,832,0.0148,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,836,0.0146,0,0,0\n",
+      "200,32,836,0.0149,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,840,0.0147,0,0,0\n",
+      "200,32,840,0.0150,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,844,0.0147,0,0,0\n",
+      "200,32,844,0.0150,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,848,0.0148,0,0,0\n",
+      "200,32,848,0.0150,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,852,0.0149,0,0,0\n",
+      "200,32,852,0.0151,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,856,0.0149,0,0,0\n",
+      "200,32,856,0.0152,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,860,0.0150,0,0,0\n",
+      "200,32,860,0.0152,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,864,0.0150,0,0,0\n",
+      "200,32,864,0.0153,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,868,0.0152,0,0,0\n",
+      "200,32,868,0.0154,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,872,0.0151,0,0,0\n",
+      "200,32,872,0.0156,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,876,0.0153,0,0,0\n",
+      "200,32,876,0.0156,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,880,0.0153,0,0,0\n",
+      "200,32,880,0.0156,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,884,0.0153,0,0,0\n",
+      "200,32,884,0.0157,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,888,0.0155,0,0,0\n",
+      "200,32,888,0.0157,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,892,0.0156,0,0,0\n",
+      "200,32,892,0.0158,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,896,0.0156,0,0,0\n",
+      "200,32,896,0.0159,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,900,0.0158,0,0,0\n",
+      "200,32,900,0.0159,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,904,0.0158,0,0,0\n",
+      "200,32,904,0.0161,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,908,0.0159,0,0,0\n",
+      "200,32,908,0.0162,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,912,0.0159,0,0,0\n",
+      "200,32,912,0.0164,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,916,0.0162,0,0,0\n",
+      "200,32,916,0.0163,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,920,0.0162,0,0,0\n",
+      "200,32,920,0.0164,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,924,0.0162,0,0,0\n",
+      "200,32,924,0.0165,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,928,0.0162,0,0,0\n",
+      "200,32,928,0.0166,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,932,0.0163,0,0,0\n",
+      "200,32,932,0.0166,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,936,0.0164,0,0,0\n",
+      "200,32,936,0.0167,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,940,0.0165,0,0,0\n",
+      "200,32,940,0.0167,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,944,0.0165,0,0,0\n",
+      "200,32,944,0.0168,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,948,0.0166,0,0,0\n",
+      "200,32,948,0.0169,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,952,0.0167,0,0,0\n",
+      "200,32,952,0.0172,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,956,0.0168,0,0,0\n",
+      "200,32,956,0.0171,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,960,0.0168,0,0,0\n",
+      "200,32,960,0.0172,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,964,0.0172,0,0,0\n",
+      "200,32,964,0.0175,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,968,0.0173,0,0,0\n",
+      "200,32,968,0.0175,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,972,0.0173,0,0,0\n",
+      "200,32,972,0.0176,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,976,0.0173,0,0,0\n",
+      "200,32,976,0.0177,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,980,0.0175,0,0,0\n",
+      "200,32,980,0.0178,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,984,0.0176,0,0,0\n",
+      "200,32,984,0.0178,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,988,0.0175,0,0,0\n",
+      "200,32,988,0.0179,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,992,0.0176,0,0,0\n",
+      "200,32,992,0.0179,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,996,0.0178,0,0,0\n",
+      "200,32,996,0.0182,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,1000,0.0177,0,0,0\n",
+      "200,32,1000,0.0181,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,1004,0.0178,0,0,0\n",
+      "200,32,1004,0.0182,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,1008,0.0178,0,0,0\n",
+      "200,32,1008,0.0182,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,1012,0.0181,0,0,0\n",
+      "200,32,1012,0.0184,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,1016,0.0180,0,0,0\n",
+      "200,32,1016,0.0184,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,1020,0.0182,0,0,0\n",
+      "200,32,1020,0.0186,0,0,0\n",
       "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,1024,0.0179,0,0,0\n",
-      "mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.sflop.bin.csv .\n",
-      "bsub -W 60 -nnodes 1 -Is -P GEN110 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vflop.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.vflop.bin.csv\n",
-      "Job <4300> is submitted to default queue <batch>.\n",
+      "200,32,1024,0.0182,0,0,0\n",
+      "mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.sflop.bin.csv .\n",
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vflop.bin /gpfs/wolf/trn003/scratch/aherten//poisson2d.vflop.bin.csv\n",
+      "Job <24646> is submitted to default queue <batch>.\n",
       "<<Waiting for dispatch ...>>\n",
       "<<Starting on login1>>\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
@@ -3712,17 +3891,11 @@
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,20,0.0013,438000,2190,2190\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,24,0.0014,534000,2670,2670\n",
+      "200,32,24,0.0013,534000,2670,2670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,28,0.0014,630000,3150,3150\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,32,0.0015,726000,3630,3630\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,32,0.0015,726000,3630,3630\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,36,0.0016,822000,4110,4110\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
@@ -3730,29 +3903,29 @@
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,44,0.0017,1014000,5070,5070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,48,0.0018,1110000,5550,5550\n",
+      "200,32,48,0.0017,1110000,5550,5550\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,52,0.0018,1206000,6030,6030\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,56,0.0020,1302000,6510,6510\n",
+      "200,32,56,0.0019,1302000,6510,6510\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,60,0.0020,1398000,6990,6990\n",
+      "200,32,60,0.0019,1398000,6990,6990\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,64,0.0021,1494000,7470,7470\n",
+      "200,32,64,0.0020,1494000,7470,7470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,68,0.0022,1590000,7950,7950\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,72,0.0022,1686000,8430,8430\n",
+      "200,32,72,0.0021,1686000,8430,8430\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,76,0.0022,1782000,8910,8910\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,80,0.0023,1878000,9390,9390\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,84,0.0024,1974000,9870,9870\n",
+      "200,32,84,0.0025,1974000,9870,9870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,88,0.0024,2070000,10350,10350\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,92,0.0025,2166000,10830,10830\n",
+      "200,32,92,0.0026,2166000,10830,10830\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,96,0.0025,2262000,11310,11310\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
@@ -3760,13 +3933,13 @@
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,104,0.0027,2454000,12270,12270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,108,0.0028,2550000,12750,12750\n",
+      "200,32,108,0.0027,2550000,12750,12750\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,112,0.0028,2646000,13230,13230\n",
+      "200,32,112,0.0029,2646000,13230,13230\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,116,0.0029,2742000,13710,13710\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,120,0.0032,2838000,14190,14190\n",
+      "200,32,120,0.0029,2838000,14190,14190\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,124,0.0030,2934000,14670,14670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
@@ -3776,15 +3949,15 @@
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,136,0.0032,3222000,16110,16110\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,140,0.0033,3318000,16590,16590\n",
+      "200,32,140,0.0032,3318000,16590,16590\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,144,0.0033,3414000,17070,17070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,148,0.0034,3510000,17550,17550\n",
+      "200,32,148,0.0036,3510000,17550,17550\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,152,0.0034,3606000,18030,18030\n",
+      "200,32,152,0.0035,3606000,18030,18030\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,156,0.0036,3702000,18510,18510\n",
+      "200,32,156,0.0035,3702000,18510,18510\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,160,0.0036,3798000,18990,18990\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
@@ -3794,13 +3967,13 @@
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,172,0.0038,4086000,20430,20430\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,176,0.0039,4182000,20910,20910\n",
+      "200,32,176,0.0038,4182000,20910,20910\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,180,0.0039,4278000,21390,21390\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,184,0.0040,4374000,21870,21870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,188,0.0040,4470000,22350,22350\n",
+      "200,32,188,0.0041,4470000,22350,22350\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,192,0.0041,4566000,22830,22830\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
@@ -3810,25 +3983,25 @@
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,204,0.0043,4854000,24270,24270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,208,0.0043,4950000,24750,24750\n",
+      "200,32,208,0.0044,4950000,24750,24750\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,212,0.0044,5046000,25230,25230\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,216,0.0045,5142000,25710,25710\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,220,0.0047,5238000,26190,26190\n",
+      "200,32,220,0.0046,5238000,26190,26190\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,224,0.0046,5334000,26670,26670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,228,0.0047,5430000,27150,27150\n",
+      "200,32,228,0.0048,5430000,27150,27150\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,232,0.0047,5526000,27630,27630\n",
+      "200,32,232,0.0049,5526000,27630,27630\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,236,0.0048,5622000,28110,28110\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,240,0.0049,5718000,28590,28590\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,244,0.0050,5814000,29070,29070\n",
+      "200,32,244,0.0049,5814000,29070,29070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,248,0.0050,5910000,29550,29550\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
@@ -3838,25 +4011,19 @@
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,260,0.0052,6198000,30990,30990\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,264,0.0052,6294000,31470,31470\n",
+      "200,32,264,0.0053,6294000,31470,31470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,268,0.0053,6390000,31950,31950\n",
+      "200,32,268,0.0054,6390000,31950,31950\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,272,0.0054,6486000,32430,32430\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,276,0.0058,6582000,32910,32910\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,276,0.0054,6582000,32910,32910\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,280,0.0055,6678000,33390,33390\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,284,0.0056,6774000,33870,33870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,288,0.0056,6870000,34350,34350\n",
+      "200,32,288,0.0057,6870000,34350,34350\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,292,0.0057,6966000,34830,34830\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
@@ -3864,23 +4031,23 @@
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,300,0.0059,7158000,35790,35790\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,304,0.0060,7254000,36270,36270\n",
+      "200,32,304,0.0059,7254000,36270,36270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,308,0.0060,7350000,36750,36750\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,312,0.0061,7446000,37230,37230\n",
+      "200,32,312,0.0062,7446000,37230,37230\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,316,0.0061,7542000,37710,37710\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,320,0.0062,7638000,38190,38190\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,324,0.0063,7734000,38670,38670\n",
+      "200,32,324,0.0062,7734000,38670,38670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,328,0.0064,7830000,39150,39150\n",
+      "200,32,328,0.0063,7830000,39150,39150\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,332,0.0064,7926000,39630,39630\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,336,0.0064,8022000,40110,40110\n",
+      "200,32,336,0.0065,8022000,40110,40110\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,340,0.0065,8118000,40590,40590\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
@@ -3888,21 +4055,21 @@
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,348,0.0066,8310000,41550,41550\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,352,0.0068,8406000,42030,42030\n",
+      "200,32,352,0.0067,8406000,42030,42030\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,356,0.0069,8502000,42510,42510\n",
+      "200,32,356,0.0068,8502000,42510,42510\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,360,0.0068,8598000,42990,42990\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,364,0.0069,8694000,43470,43470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,368,0.0069,8790000,43950,43950\n",
+      "200,32,368,0.0070,8790000,43950,43950\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,372,0.0070,8886000,44430,44430\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,376,0.0071,8982000,44910,44910\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,380,0.0071,9078000,45390,45390\n",
+      "200,32,380,0.0072,9078000,45390,45390\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,384,0.0072,9174000,45870,45870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
@@ -3920,23 +4087,23 @@
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,412,0.0077,9846000,49230,49230\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,416,0.0077,9942000,49710,49710\n",
+      "200,32,416,0.0079,9942000,49710,49710\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,420,0.0078,10038000,50190,50190\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,424,0.0079,10134000,50670,50670\n",
+      "200,32,424,0.0080,10134000,50670,50670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,428,0.0079,10230000,51150,51150\n",
+      "200,32,428,0.0080,10230000,51150,51150\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,432,0.0080,10326000,51630,51630\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,436,0.0080,10422000,52110,52110\n",
+      "200,32,436,0.0083,10422000,52110,52110\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,440,0.0081,10518000,52590,52590\n",
+      "200,32,440,0.0082,10518000,52590,52590\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,444,0.0082,10614000,53070,53070\n",
+      "200,32,444,0.0083,10614000,53070,53070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,448,0.0082,10710000,53550,53550\n",
+      "200,32,448,0.0083,10710000,53550,53550\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,452,0.0083,10806000,54030,54030\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
@@ -3948,302 +4115,284 @@
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,468,0.0086,11190000,55950,55950\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,472,0.0088,11286000,56430,56430\n",
+      "200,32,472,0.0087,11286000,56430,56430\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,476,0.0089,11382000,56910,56910\n",
+      "200,32,476,0.0087,11382000,56910,56910\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,480,0.0088,11478000,57390,57390\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,484,0.0088,11574000,57870,57870\n",
+      "200,32,484,0.0089,11574000,57870,57870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,488,0.0089,11670000,58350,58350\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,492,0.0090,11766000,58830,58830\n",
+      "200,32,492,0.0091,11766000,58830,58830\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,496,0.0090,11862000,59310,59310\n",
+      "200,32,496,0.0091,11862000,59310,59310\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,500,0.0091,11958000,59790,59790\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
       "200,32,504,0.0092,12054000,60270,60270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,508,0.0094,12150000,60750,60750\n",
+      "200,32,508,0.0093,12150000,60750,60750\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,512,0.0092,12246000,61230,61230\n",
+      "200,32,512,0.0094,12246000,61230,61230\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,516,0.0093,12342000,61710,61710\n",
+      "200,32,516,0.0096,12342000,61710,61710\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,520,0.0093,12438000,62190,62190\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,520,0.0096,12438000,62190,62190\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,524,0.0094,12534000,62670,62670\n",
+      "200,32,524,0.0095,12534000,62670,62670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,528,0.0094,12630000,63150,63150\n",
+      "200,32,528,0.0098,12630000,63150,63150\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,532,0.0095,12726000,63630,63630\n",
+      "200,32,532,0.0097,12726000,63630,63630\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,536,0.0096,12822000,64110,64110\n",
+      "200,32,536,0.0097,12822000,64110,64110\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,540,0.0100,12918000,64590,64590\n",
+      "200,32,540,0.0098,12918000,64590,64590\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,544,0.0097,13014000,65070,65070\n",
+      "200,32,544,0.0100,13014000,65070,65070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,548,0.0098,13110000,65550,65550\n",
+      "200,32,548,0.0102,13110000,65550,65550\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,552,0.0099,13206000,66030,66030\n",
+      "200,32,552,0.0102,13206000,66030,66030\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,556,0.0100,13302000,66510,66510\n",
+      "200,32,556,0.0101,13302000,66510,66510\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,560,0.0101,13398000,66990,66990\n",
+      "200,32,560,0.0103,13398000,66990,66990\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,564,0.0102,13494000,67470,67470\n",
+      "200,32,564,0.0103,13494000,67470,67470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,568,0.0103,13590000,67950,67950\n",
+      "200,32,568,0.0104,13590000,67950,67950\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,572,0.0103,13686000,68430,68430\n",
+      "200,32,572,0.0105,13686000,68430,68430\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,576,0.0103,13782000,68910,68910\n",
+      "200,32,576,0.0105,13782000,68910,68910\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,580,0.0105,13878000,69390,69390\n",
+      "200,32,580,0.0107,13878000,69390,69390\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,584,0.0105,13974000,69870,69870\n",
+      "200,32,584,0.0108,13974000,69870,69870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,588,0.0106,14070000,70350,70350\n",
+      "200,32,588,0.0107,14070000,70350,70350\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,592,0.0106,14166000,70830,70830\n",
+      "200,32,592,0.0108,14166000,70830,70830\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,596,0.0106,14262000,71310,71310\n",
+      "200,32,596,0.0109,14262000,71310,71310\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,600,0.0108,14358000,71790,71790\n",
+      "200,32,600,0.0110,14358000,71790,71790\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,604,0.0109,14454000,72270,72270\n",
+      "200,32,604,0.0110,14454000,72270,72270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,608,0.0109,14550000,72750,72750\n",
+      "200,32,608,0.0111,14550000,72750,72750\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,612,0.0109,14646000,73230,73230\n",
+      "200,32,612,0.0114,14646000,73230,73230\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,616,0.0111,14742000,73710,73710\n",
+      "200,32,616,0.0112,14742000,73710,73710\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,620,0.0111,14838000,74190,74190\n",
+      "200,32,620,0.0113,14838000,74190,74190\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,624,0.0112,14934000,74670,74670\n",
+      "200,32,624,0.0114,14934000,74670,74670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,628,0.0112,15030000,75150,75150\n",
+      "200,32,628,0.0116,15030000,75150,75150\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,632,0.0112,15126000,75630,75630\n",
+      "200,32,632,0.0115,15126000,75630,75630\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,636,0.0114,15222000,76110,76110\n",
+      "200,32,636,0.0117,15222000,76110,76110\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,640,0.0114,15318000,76590,76590\n",
+      "200,32,640,0.0116,15318000,76590,76590\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,644,0.0114,15414000,77070,77070\n",
+      "200,32,644,0.0118,15414000,77070,77070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,648,0.0115,15510000,77550,77550\n",
+      "200,32,648,0.0117,15510000,77550,77550\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,652,0.0117,15606000,78030,78030\n",
+      "200,32,652,0.0119,15606000,78030,78030\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,656,0.0117,15702000,78510,78510\n",
+      "200,32,656,0.0119,15702000,78510,78510\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,660,0.0117,15798000,78990,78990\n",
+      "200,32,660,0.0120,15798000,78990,78990\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,664,0.0118,15894000,79470,79470\n",
+      "200,32,664,0.0120,15894000,79470,79470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,668,0.0120,15990000,79950,79950\n",
+      "200,32,668,0.0121,15990000,79950,79950\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,672,0.0120,16086000,80430,80430\n",
+      "200,32,672,0.0121,16086000,80430,80430\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,676,0.0121,16182000,80910,80910\n",
+      "200,32,676,0.0123,16182000,80910,80910\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,680,0.0120,16278000,81390,81390\n",
+      "200,32,680,0.0122,16278000,81390,81390\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,684,0.0121,16374000,81870,81870\n",
+      "200,32,684,0.0125,16374000,81870,81870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,688,0.0122,16470000,82350,82350\n",
+      "200,32,688,0.0124,16470000,82350,82350\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,692,0.0122,16566000,82830,82830\n",
+      "200,32,692,0.0126,16566000,82830,82830\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,696,0.0124,16662000,83310,83310\n",
+      "200,32,696,0.0125,16662000,83310,83310\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,700,0.0124,16758000,83790,83790\n",
+      "200,32,700,0.0127,16758000,83790,83790\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,704,0.0124,16854000,84270,84270\n",
+      "200,32,704,0.0128,16854000,84270,84270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,708,0.0125,16950000,84750,84750\n",
+      "200,32,708,0.0128,16950000,84750,84750\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,712,0.0125,17046000,85230,85230\n",
+      "200,32,712,0.0128,17046000,85230,85230\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,716,0.0126,17142000,85710,85710\n",
+      "200,32,716,0.0128,17142000,85710,85710\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,720,0.0126,17238000,86190,86190\n",
+      "200,32,720,0.0129,17238000,86190,86190\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,724,0.0127,17334000,86670,86670\n",
+      "200,32,724,0.0130,17334000,86670,86670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,728,0.0128,17430000,87150,87150\n",
+      "200,32,728,0.0130,17430000,87150,87150\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,732,0.0130,17526000,87630,87630\n",
+      "200,32,732,0.0132,17526000,87630,87630\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,736,0.0129,17622000,88110,88110\n",
+      "200,32,736,0.0132,17622000,88110,88110\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,740,0.0129,17718000,88590,88590\n",
+      "200,32,740,0.0133,17718000,88590,88590\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,744,0.0130,17814000,89070,89070\n",
+      "200,32,744,0.0133,17814000,89070,89070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,748,0.0131,17910000,89550,89550\n",
+      "200,32,748,0.0134,17910000,89550,89550\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,752,0.0132,18006000,90030,90030\n",
+      "200,32,752,0.0134,18006000,90030,90030\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,756,0.0132,18102000,90510,90510\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,756,0.0136,18102000,90510,90510\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,760,0.0133,18198000,90990,90990\n",
+      "200,32,760,0.0136,18198000,90990,90990\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,764,0.0134,18294000,91470,91470\n",
+      "200,32,764,0.0136,18294000,91470,91470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,768,0.0135,18390000,91950,91950\n",
+      "200,32,768,0.0137,18390000,91950,91950\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,772,0.0136,18486000,92430,92430\n",
+      "200,32,772,0.0139,18486000,92430,92430\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,776,0.0136,18582000,92910,92910\n",
+      "200,32,776,0.0139,18582000,92910,92910\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,780,0.0137,18678000,93390,93390\n",
+      "200,32,780,0.0139,18678000,93390,93390\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,784,0.0137,18774000,93870,93870\n",
+      "200,32,784,0.0140,18774000,93870,93870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,788,0.0138,18870000,94350,94350\n",
+      "200,32,788,0.0140,18870000,94350,94350\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,792,0.0138,18966000,94830,94830\n",
+      "200,32,792,0.0142,18966000,94830,94830\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,796,0.0140,19062000,95310,95310\n",
+      "200,32,796,0.0142,19062000,95310,95310\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,800,0.0140,19158000,95790,95790\n",
+      "200,32,800,0.0144,19158000,95790,95790\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,804,0.0140,19254000,96270,96270\n",
+      "200,32,804,0.0143,19254000,96270,96270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,808,0.0141,19350000,96750,96750\n",
+      "200,32,808,0.0144,19350000,96750,96750\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,812,0.0142,19446000,97230,97230\n",
+      "200,32,812,0.0145,19446000,97230,97230\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,816,0.0143,19542000,97710,97710\n",
+      "200,32,816,0.0145,19542000,97710,97710\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,820,0.0143,19638000,98190,98190\n",
+      "200,32,820,0.0146,19638000,98190,98190\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,824,0.0144,19734000,98670,98670\n",
+      "200,32,824,0.0147,19734000,98670,98670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,828,0.0146,19830000,99150,99150\n",
+      "200,32,828,0.0147,19830000,99150,99150\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,832,0.0146,19926000,99630,99630\n",
+      "200,32,832,0.0148,19926000,99630,99630\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,836,0.0146,20022000,100110,100110\n",
+      "200,32,836,0.0151,20022000,100110,100110\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,840,0.0147,20118000,100590,100590\n",
+      "200,32,840,0.0150,20118000,100590,100590\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,844,0.0147,20214000,101070,101070\n",
+      "200,32,844,0.0150,20214000,101070,101070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,848,0.0148,20310000,101550,101550\n",
+      "200,32,848,0.0151,20310000,101550,101550\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,852,0.0148,20406000,102030,102030\n",
+      "200,32,852,0.0152,20406000,102030,102030\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,856,0.0150,20502000,102510,102510\n",
+      "200,32,856,0.0152,20502000,102510,102510\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,860,0.0150,20598000,102990,102990\n",
+      "200,32,860,0.0152,20598000,102990,102990\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,864,0.0151,20694000,103470,103470\n",
+      "200,32,864,0.0153,20694000,103470,103470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,868,0.0151,20790000,103950,103950\n",
+      "200,32,868,0.0154,20790000,103950,103950\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,872,0.0152,20886000,104430,104430\n",
+      "200,32,872,0.0155,20886000,104430,104430\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,876,0.0153,20982000,104910,104910\n",
+      "200,32,876,0.0155,20982000,104910,104910\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,880,0.0154,21078000,105390,105390\n",
+      "200,32,880,0.0157,21078000,105390,105390\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,884,0.0154,21174000,105870,105870\n",
+      "200,32,884,0.0157,21174000,105870,105870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,888,0.0154,21270000,106350,106350\n",
+      "200,32,888,0.0158,21270000,106350,106350\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,892,0.0155,21366000,106830,106830\n",
+      "200,32,892,0.0158,21366000,106830,106830\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,896,0.0157,21462000,107310,107310\n",
+      "200,32,896,0.0159,21462000,107310,107310\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,900,0.0156,21558000,107790,107790\n",
+      "200,32,900,0.0161,21558000,107790,107790\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,904,0.0158,21654000,108270,108270\n",
+      "200,32,904,0.0162,21654000,108270,108270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,908,0.0159,21750000,108750,108750\n",
+      "200,32,908,0.0161,21750000,108750,108750\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,912,0.0159,21846000,109230,109230\n",
+      "200,32,912,0.0163,21846000,109230,109230\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,916,0.0161,21942000,109710,109710\n",
+      "200,32,916,0.0164,21942000,109710,109710\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,920,0.0161,22038000,110190,110190\n",
+      "200,32,920,0.0165,22038000,110190,110190\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,924,0.0162,22134000,110670,110670\n",
+      "200,32,924,0.0164,22134000,110670,110670\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,928,0.0164,22230000,111150,111150\n",
+      "200,32,928,0.0166,22230000,111150,111150\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,932,0.0164,22326000,111630,111630\n",
+      "200,32,932,0.0166,22326000,111630,111630\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,936,0.0164,22422000,112110,112110\n",
+      "200,32,936,0.0167,22422000,112110,112110\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,940,0.0164,22518000,112590,112590\n",
+      "200,32,940,0.0168,22518000,112590,112590\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,944,0.0165,22614000,113070,113070\n",
+      "200,32,944,0.0168,22614000,113070,113070\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,948,0.0167,22710000,113550,113550\n",
+      "200,32,948,0.0169,22710000,113550,113550\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,952,0.0168,22806000,114030,114030\n",
+      "200,32,952,0.0170,22806000,114030,114030\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,956,0.0168,22902000,114510,114510\n",
+      "200,32,956,0.0170,22902000,114510,114510\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,960,0.0168,22998000,114990,114990\n",
+      "200,32,960,0.0171,22998000,114990,114990\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,964,0.0174,23094000,115470,115470\n",
+      "200,32,964,0.0176,23094000,115470,115470\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,968,0.0172,23190000,115950,115950\n",
+      "200,32,968,0.0176,23190000,115950,115950\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,972,0.0173,23286000,116430,116430\n",
+      "200,32,972,0.0177,23286000,116430,116430\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,976,0.0172,23382000,116910,116910\n",
+      "200,32,976,0.0177,23382000,116910,116910\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,980,0.0174,23478000,117390,117390\n",
+      "200,32,980,0.0178,23478000,117390,117390\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,984,0.0174,23574000,117870,117870\n",
+      "200,32,984,0.0178,23574000,117870,117870\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,988,0.0176,23670000,118350,118350\n",
+      "200,32,988,0.0179,23670000,118350,118350\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,992,0.0176,23766000,118830,118830\n",
+      "200,32,992,0.0180,23766000,118830,118830\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,996,0.0179,23862000,119310,119310\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "200,32,996,0.0181,23862000,119310,119310\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,1000,0.0177,23958000,119790,119790\n",
+      "200,32,1000,0.0182,23958000,119790,119790\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,1004,0.0178,24054000,120270,120270\n",
+      "200,32,1004,0.0182,24054000,120270,120270\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,1008,0.0178,24150000,120750,120750\n",
+      "200,32,1008,0.0182,24150000,120750,120750\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,1012,0.0180,24246000,121230,121230\n",
+      "200,32,1012,0.0184,24246000,121230,121230\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,1016,0.0180,24342000,121710,121710\n",
+      "200,32,1016,0.0185,24342000,121710,121710\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,1020,0.0181,24438000,122190,122190\n",
+      "200,32,1020,0.0184,24438000,122190,122190\n",
       "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,1024,0.0178,24534000,122670,122670\n",
-      "mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vflop.bin.csv .\n"
+      "200,32,1024,0.0182,24534000,122670,122670\n",
+      "mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.vflop.bin.csv .\n"
      ]
     }
    ],
@@ -4253,51 +4402,225 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 39,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>nx</th>\n",
+       "      <th>iter</th>\n",
+       "      <th>ny</th>\n",
+       "      <th>Runtime</th>\n",
+       "      <th>PM_SCALAR_FLOP_CMPL (total)</th>\n",
+       "      <th>PM_SCALAR_FLOP_CMPL (min)</th>\n",
+       "      <th>PM_SCALAR_FLOP_CMPL (max)</th>\n",
+       "      <th>PM_VECTOR_FLOP_CMPL (total)</th>\n",
+       "      <th>PM_VECTOR_FLOP_CMPL (min)</th>\n",
+       "      <th>PM_VECTOR_FLOP_CMPL (max)</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>4</td>\n",
+       "      <td>200</td>\n",
+       "      <td>32</td>\n",
+       "      <td>0.0010</td>\n",
+       "      <td>96000</td>\n",
+       "      <td>480</td>\n",
+       "      <td>480</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>8</td>\n",
+       "      <td>200</td>\n",
+       "      <td>32</td>\n",
+       "      <td>0.0011</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>150000</td>\n",
+       "      <td>750</td>\n",
+       "      <td>750</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>12</td>\n",
+       "      <td>200</td>\n",
+       "      <td>32</td>\n",
+       "      <td>0.0012</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>246000</td>\n",
+       "      <td>1230</td>\n",
+       "      <td>1230</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>16</td>\n",
+       "      <td>200</td>\n",
+       "      <td>32</td>\n",
+       "      <td>0.0012</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>342000</td>\n",
+       "      <td>1710</td>\n",
+       "      <td>1710</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>20</td>\n",
+       "      <td>200</td>\n",
+       "      <td>32</td>\n",
+       "      <td>0.0013</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>438000</td>\n",
+       "      <td>2190</td>\n",
+       "      <td>2190</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   nx  iter  ny  Runtime  PM_SCALAR_FLOP_CMPL (total)  \\\n",
+       "0   4   200  32   0.0010                        96000   \n",
+       "1   8   200  32   0.0011                            0   \n",
+       "2  12   200  32   0.0012                            0   \n",
+       "3  16   200  32   0.0012                            0   \n",
+       "4  20   200  32   0.0013                            0   \n",
+       "\n",
+       "   PM_SCALAR_FLOP_CMPL (min)   PM_SCALAR_FLOP_CMPL (max)  \\\n",
+       "0                        480                         480   \n",
+       "1                          0                           0   \n",
+       "2                          0                           0   \n",
+       "3                          0                           0   \n",
+       "4                          0                           0   \n",
+       "\n",
+       "   PM_VECTOR_FLOP_CMPL (total)  PM_VECTOR_FLOP_CMPL (min)  \\\n",
+       "0                            0                          0   \n",
+       "1                       150000                        750   \n",
+       "2                       246000                       1230   \n",
+       "3                       342000                       1710   \n",
+       "4                       438000                       2190   \n",
+       "\n",
+       "    PM_VECTOR_FLOP_CMPL (max)  \n",
+       "0                           0  \n",
+       "1                         750  \n",
+       "2                        1230  \n",
+       "3                        1710  \n",
+       "4                        2190  "
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "df_sflop = pd.read_csv(\"poisson2d.sflop.bin.csv\", skiprows=range(2, 50000, 2))\n",
     "df_vflop = pd.read_csv(\"poisson2d.vflop.bin.csv\", skiprows=range(2, 50000, 2))\n",
-    "df_flop = pd.concat([df_sflop.set_index(\"nx\"), df_vflop.set_index(\"nx\")[['PM_VECTOR_FLOP_CMPL (total)', 'PM_VECTOR_FLOP_CMPL (min)', ' PM_VECTOR_FLOP_CMPL (max)']]], axis=1).reset_index()"
+    "df_flop = pd.concat([df_sflop.set_index(\"nx\"), df_vflop.set_index(\"nx\")[['PM_VECTOR_FLOP_CMPL (total)', 'PM_VECTOR_FLOP_CMPL (min)', ' PM_VECTOR_FLOP_CMPL (max)']]], axis=1).reset_index()\n",
+    "df_flop.head()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The name of the vector counter is a bit misleading; not floating point operations are measured but floating point instructions. To get *real* floating point operations, each value needs to be multiplied by the vector width (2). We can plot the values afterwards (non-interactive: `make graph_task4`)."
+    "Again, the name of the vector counter is a bit misleading: it measures floating point instructions, not floating point operations. To get *real* floating point operations, each value needs to be multiplied by the vector width (2). We can plot the values afterwards (non-interactive: `make graph_task4`)."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 49,
+   "execution_count": 40,
    "metadata": {},
    "outputs": [],
    "source": [
-    "common.normalize(df_flop, \"PM_SCALAR_FLOP_CMPL (min)\", \"Scalar FlOps / Loop Iteration\")\n",
-    "common.normalize(df_flop, \"PM_VECTOR_FLOP_CMPL (min)\", \"Vector Instructions / Loop Iteration\")\n",
-    "df_flop[\"Vector FlOps / Loop Iteration\"] = df_flop[\"Vector Instructions / Loop Iteration\"] * 2"
+    "df_flop[\"Grid Points\"] = df_flop[\"nx\"] * df_flop[\"ny\"]\n",
+    "df_flop[\"Vector FlOps (min)\"] = df_flop[\"PM_VECTOR_FLOP_CMPL (min)\"] * 2\n",
+    "df_flop[\"Scalar FlOps (min)\"] = df_flop[\"PM_SCALAR_FLOP_CMPL (min)\"]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 41,
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "image/png": "\n",
+      "image/png": "\n",
       "text/plain": [
        "<Figure size 1008x432 with 1 Axes>"
       ]
      },
-     "metadata": {},
+     "metadata": {
+      "needs_background": "light"
+     },
      "output_type": "display_data"
     }
    ],
    "source": [
-    "df_flop.set_index(\"nx\")[[\"Scalar FlOps / Loop Iteration\", \"Vector FlOps / Loop Iteration\"]].plot();"
+    "df_flop.set_index(\"Grid Points\")[[\"Scalar FlOps (min)\", \"Vector FlOps (min)\"]].plot();"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Counter Scalar FlOps (min) is proportional to the grid points (nx*ny) by a factor of -0.0003 (± 0.0002)\n",
+      "Counter Vector FlOps (min) is proportional to the grid points (nx*ny) by a factor of  7.5004 (± 0.0002)\n"
+     ]
+    }
+   ],
+   "source": [
+    "_fit, _cov = common.print_and_return_fit(\n",
+    "    [\"Scalar FlOps (min)\", \"Vector FlOps (min)\"], \n",
+    "    df_flop.set_index(\"Grid Points\"), \n",
+    "    linear_function\n",
+    ")\n",
+    "fit_parameters = {**fit_parameters, **_fit}\n",
+    "fit_covariance = {**fit_covariance, **_cov}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "exercise": "solution"
+   },
+   "source": [
+    "Interesting! We seem to be using the vector registers of our system very well. Basically all operations are vector operations!"
    ]
   },
   {
@@ -4317,29 +4640,31 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 66,
+   "execution_count": 56,
    "metadata": {},
    "outputs": [],
    "source": [
-    "I_flop_scalar = df_flop.set_index(\"nx\")[\"Scalar FlOps / Loop Iteration\"]\n",
-    "I_flop_vector = df_flop.set_index(\"nx\")[\"Vector FlOps / Loop Iteration\"]\n",
-    "I_mem_load    = df_byte[\"Loads / Loop Iteration\"]\n",
-    "I_mem_store   = df_byte[\"Stores / Loop Iteration\"]"
+    "I_flop_scalar = df_flop.set_index(\"Grid Points\")[\"Scalar FlOps (min)\"]\n",
+    "I_flop_vector = df_flop.set_index(\"Grid Points\")[\"Vector FlOps (min)\"]\n",
+    "I_mem_load    = df_byte[\"Loads\"]\n",
+    "I_mem_store   = df_byte[\"Stores\"]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 75,
+   "execution_count": 57,
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "image/png": "\n",
+      "image/png": "\n",
       "text/plain": [
        "<Figure size 1008x432 with 1 Axes>"
       ]
      },
-     "metadata": {},
+     "metadata": {
+      "needs_background": "light"
+     },
      "output_type": "display_data"
     }
    ],
@@ -4366,6 +4691,8 @@
     "\n",
     "If you still still have time, you might venture into your own benchmarking adventure.\n",
     "\n",
+    "Maybe you noticed it already, for instance in Task 2 C: at the far right of the graph, towards very large numbers of grid points, the behaviour of the graph changes. Something is happening there!\n",
+    "\n",
     "\n",
     "**TASK**: Revisit the counters measured above for a larger range of `nx`. Right now, we only studied `nx` until 1000. New effects appear above that value – partly only well above, though ($nx > 15000$).\n",
     "\n",
@@ -4393,9 +4720,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.1"
+   "version": "3.7.0"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
-}
\ No newline at end of file
+ "nbformat_minor": 4
+}
diff --git a/2-Performance_Counters/Handson/.master/Makefile b/2-Performance_Counters/Handson/.master/Makefile
index 6f3849f4d54147c860ede7b0ff427176284a83ff..1db4b2f76ed5e40ed11f543e3d3837e46fa33080 100644
--- a/2-Performance_Counters/Handson/.master/Makefile
+++ b/2-Performance_Counters/Handson/.master/Makefile
@@ -82,32 +82,25 @@ graph_task2c: plot-task2c.pdf
 graph_task4: plot-task4.pdf
 graph_task4-2: plot-task4-2.pdf
 plot-task1.pdf: poisson2d.ins_cyc.bin.csv
-	@test "$$SC19_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task1()"
 	@test -n "$$DISPLAY" || "No X forwarding found. Either reconnect with X forwarding (-X / -Y) or download $@ with scp."
 	display $@
 plot-task2a.pdf: poisson2d.ld_st.bin.csv
-	@test "$$SC19_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task2a()"
 	display $@
 plot-task2b.pdf: poisson2d.vld.bin.csv poisson2d.vst.bin.csv
-	@test "$$SC19_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task2b()"
 	display $@
 plot-task2b-2.pdf: poisson2d.vld.bin.csv poisson2d.vst.bin.csv poisson2d.ld_st.bin.csv
-	@test "$$SC19_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task2b(bytes=True)"
 	display $@
 plot-task2c.pdf: poisson2d.vld.bin.csv poisson2d.vst.bin.csv poisson2d.ld_st.bin.csv poisson2d.ins_cyc.bin.csv
-	@test "$$SC19_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task2c()"
 	display $@
 plot-task4.pdf: poisson2d.sflop.bin.csv poisson2d.vflop.bin.csv
-	@test "$$SC19_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task4()"
 	display $@
 plot-task4-2.pdf: poisson2d.sflop.bin.csv poisson2d.vflop.bin.csv
-	@test "$$SC19_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task4(ai=True)"
 	display $@
 
diff --git a/2-Performance_Counters/Handson/.master/README.md b/2-Performance_Counters/Handson/.master/README.md
index 3ee0057da9448f49ece6e6afededeafee38ff907..8887dd78d9f2db7644f575df47c48c4c41ed655e 100644
--- a/2-Performance_Counters/Handson/.master/README.md
+++ b/2-Performance_Counters/Handson/.master/README.md
@@ -2,7 +2,5 @@
 
 This folder holds the files for the first hands-on exercise about Performance Counters on POWER9.
 
-Make sure to load all modules of this session by typing `module load sc18/handson1` into the shell.
-
 All task description is in an accompanying Jupyter Notebook. Open it interactively on Ascent with port forwarding. If that is impossible to do, use the static convert to HTML or PDF of the Notebook and follow along accordingly.
 
diff --git a/2-Performance_Counters/Handson/.master/common.py b/2-Performance_Counters/Handson/.master/common.py
index 1891a0341f369f7564b4a29b3f4a60e314f4bc9b..9033865e014fce9ece4137cdb11a42884acceae4 100644
--- a/2-Performance_Counters/Handson/.master/common.py
+++ b/2-Performance_Counters/Handson/.master/common.py
@@ -1,2 +1,22 @@
 def normalize(df, old_column, new_column):
 	df[new_column] = df[old_column] / (df["ny"] * df["nx"])
+    
+def print_and_return_fit(list_of_quantities, dataframe, function, format_value=">7.4f", format_uncertainty="f", _print=True):
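+    """Use `curve_fit` to fit `function` to each quantity in `list_of_quantities` with respect to `dataframe.index`. Print the first fit parameter and its uncertainty (unless `_print` is false) and return the tuple `(fit_parameters, fit_covariance)` of dicts keyed by quantity."""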
+    import numpy as np
+    from scipy.optimize import curve_fit 
+    _fit_parameters = {}
+    _fit_covariance = {}
+    _quantity_padding = np.max([len(_str) for _str in list_of_quantities])
+    for quantity in list_of_quantities:
+        _fit_parameters[quantity], _fit_covariance[quantity] = curve_fit(function, dataframe.index, dataframe[quantity])
+        if (_print):
+            print("Counter {:>{_quantity_padding}} is proportional to the grid points (nx*ny) by a factor of {:{format_value}} (± {:{format_uncertainty}})".format(
+                quantity, 
+                _fit_parameters[quantity][0], 
+                np.sqrt(np.diag(_fit_covariance[quantity]))[0],
+                _quantity_padding=_quantity_padding,
+                format_value=format_value,
+                format_uncertainty=format_uncertainty
+        ))
+    return (_fit_parameters, _fit_covariance)
\ No newline at end of file
diff --git a/2-Performance_Counters/Handson/.master/copyNotebook.mk b/2-Performance_Counters/Handson/.master/copyNotebook.mk
index a90882b672855d41aa4e36861cf5fe6d5f248d6f..8432d91d30fcf721a6b0a0d12a4f462c94a897e5 100755
--- a/2-Performance_Counters/Handson/.master/copyNotebook.mk
+++ b/2-Performance_Counters/Handson/.master/copyNotebook.mk
@@ -21,17 +21,17 @@ solutions: $(TGT_SOLUTIONS)
 tasks: $(TGT_BLANK)
 
 $(addprefix ../,$(addsuffix .html,$(basename $(SRC)))): $(SRC)
-	jupyter nbconvert --to html --output $@ --ClearOutputPreprocessor.enabled=True $< 
+	notebook-splitter --remove solution --keep task $< | jupyter nbconvert --to html --output $@ --ClearOutputPreprocessor.enabled=True --stdin 
 $(addprefix ../,$(addsuffix .pdf,$(basename $(SRC)))): $(SRC)
-	jupyter nbconvert --to pdf --output $@ --template better-article.tplx --ClearOutputPreprocessor.enabled=True $< 
+	notebook-splitter --remove solution --keep task $< | jupyter nbconvert --to pdf --output $@ --template better-article.tplx --ClearOutputPreprocessor.enabled=True --stdin 
 	mv $@.pdf $@
 $(addprefix ../,$(SRC)): $(SRC)
-	jupyter nbconvert --to ipynb --output $@ --ClearOutputPreprocessor.enabled=True $< 
+	notebook-splitter --remove solution --keep task $< | jupyter nbconvert --to ipynb --output $@ --ClearOutputPreprocessor.enabled=True --stdin 
 
 $(addprefix ../Solutions/,$(addsuffix .html,$(basename $(SRC)))): $(SRC)
-	jupyter nbconvert --to html --output $@ $< 
+	notebook-splitter --remove task --keep solution $< | jupyter nbconvert --to html --output $@ --stdin
 $(addprefix ../Solutions/,$(addsuffix .pdf,$(basename $(SRC)))): $(SRC)
-	jupyter nbconvert --to pdf --output $@ --template better-article.tplx $<
+	notebook-splitter --remove task --keep solution $< | jupyter nbconvert --to pdf --output $@ --template better-article.tplx --stdin
 	mv $@.pdf $@
 $(addprefix ../Solutions/,$(SRC)): $(SRC)
-	cp $< $@ 
+	notebook-splitter --remove task --keep solution $< -o $@ 
diff --git a/2-Performance_Counters/Handson/Hands-On-Performance-Counters.html b/2-Performance_Counters/Handson/Hands-On-Performance-Counters.html
index 9be777b398d8d1c7dc240c0021aa1957eec3b17b..8db553e9c7efa7a2ed66b3b43569c67ec4a20af7 100644
--- a/2-Performance_Counters/Handson/Hands-On-Performance-Counters.html
+++ b/2-Performance_Counters/Handson/Hands-On-Performance-Counters.html
@@ -2,7 +2,7 @@
 <html>
 <head><meta charset="utf-8" />
 
-<title>Hands-On-Performance-Counters</title>
+<title>Notebook</title>
 
 <script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js"></script>
 <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.3/jquery.min.js"></script>
@@ -13116,7 +13116,7 @@ div#notebook {
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h1 id="Hands-On:-Performance-Counters">Hands-On: Performance Counters<a class="anchor-link" href="#Hands-On:-Performance-Counters">&#182;</a></h1><p>This Notebook is part of the exercises for the SC18 Tutorial »Application Porting and Optimization on GPU-accelerated POWER Architectures«. It is to be run on a POWER9 machine; in the tutorial: on Ascent, the POWER9 training cluster of Oak Ridge National Lab.</p>
+<h1 id="Hands-On:-Performance-Counters">Hands-On: Performance Counters<a class="anchor-link" href="#Hands-On:-Performance-Counters">&#182;</a></h1><p>This Notebook is part of the exercises for the SC19 Tutorial »Application Porting and Optimization on GPU-accelerated POWER Architectures«. It is to be run on a POWER9 machine; in the tutorial: on Ascent, the POWER9 training cluster of Oak Ridge National Lab.</p>
 <p>This Notebook can be run interactively on Ascent. If this capability is unavailable to you, use it as a description for executing the tasks on Ascent via a shell access. During data evaluation, the Notebook mentions the corresponding commands to execute in case you are not able to run the Notebook interactively directly on Ascent.</p>
 <h2 id="Table-of-Contents">Table of Contents<a class="anchor-link" href="#Table-of-Contents">&#182;</a></h2><p><a name="toc"></a></p>
 <ul>
@@ -13149,15 +13149,29 @@ div#notebook {
     <span class="p">}</span>
 <span class="p">}</span>
 </pre></div>
-<p>After <code>PAPI_add_named_event()</code> is used to add named PMU events outside of the relaxation iteration, <code>PAPI_start()</code>
+<p>The code is instrumented using PAPI. The API routine <code>PAPI_add_named_event()</code> is used to add <em>named</em> PMU events outside of the relaxation iteration. After that, calls to <code>PAPI_start()</code>
 and <code>PAPI_stop()</code> can be used to count how often a PMU event is incremented.</p>
-<p>For the first task, we will measure quantities often used to characterize an application, cycles and instructions.</p>
-<p><strong>TASK</strong>: Please measure counters for completed instructions and run cycles. See the TODOs in <a href="/edit/Tasks/poisson2d.ins_cyc.c"><code>poisson2d.ins_cyc.c</code></a>. Either edit with Jupyter capabilities by clicking on the link of the file or use a dedicated editor (<code>vim</code> is available). The names of the counters to be implemented are <code>PM_INST_CMPL</code> and <code>PM_RUN_CYC</code>.</p>
-<p>After changing the source code, compile it with <code>make task1</code> or by executing the following cell (we need to change directories first, though).</p>
+<p>For the first task, we will measure quantities often used to characterize an application: cycles and instructions.</p>
+<p><strong>TASK</strong>: Please measure counters for completed instructions and run cycles. See the TODOs in the file <a href="poisson2d.ins_cyc.c"><code>poisson2d.ins_cyc.c</code></a>. You can either edit the file with Jupyter capabilities (by clicking on the link of the file or selecting it in the file drawer on the left) or use a dedicated editor on the system (<code>vim</code> is available). The names of the counters to be implemented are <code>PM_INST_CMPL</code> and <code>PM_RUN_CYC</code>.</p>
+<p>After changing the source code, compile it with <code>make task1</code> or by executing the following cell (we need to change directories first, though).<br>
+<em>(Using the <code>Makefile</code> we have hidden quite a few intricacies from you in order to focus on the relevant content at hand. Don't worry too much about it right now – we'll un-hide it gradually during the course of the tutorial.)</em></p>
 <p><a href="#toc">Back to top</a></p>
 
 </div>
 </div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="o">!</span>pwd
+</pre></div>
+
+    </div>
+</div>
+</div>
+
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
@@ -13189,7 +13203,7 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<p>Make sure your program is measuring correctly, by invoking it, for instance with these arguments: <code>./poisson2d.ins_cyc.bin 100 64 32</code> – see the next cell. The <code>100</code> specifies the number of iterations to perform, <code>64</code> and <code>32</code> are the size of the grid in y and x direction, respectively.</p>
+<p>Before we launch our measurement campaign, we should make sure that the program is measuring correctly. Let's invoke it, for instance, with these arguments: <code>./poisson2d.ins_cyc.bin 100 64 32</code> – see the next cell. The <code>100</code> specifies the number of iterations to perform, <code>64</code> and <code>32</code> are the size of the grid in y and x direction, respectively.</p>
 
 </div>
 </div>
@@ -13211,7 +13225,8 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<p>For the following runs, we are going to use Ascent's compute backend nodes which are not shared amongst users and also have six GPUs available. We use the available batch scheduler <em>IBM Spectrum LSF</em> for this. For convenience, a call to the batch submission system is stored in the environment variable <code>$SC18_SUBMIT_CMD</code>. You are welcome to adapt it once you get more familiar with the system.</p>
+<p>Alright! That should return a comma-separated list of measurements.</p>
+<p>For the following runs, we are going to use Ascent's compute backend nodes which are not shared amongst users and also have six GPUs available (each!). We use the available batch scheduler <em>IBM Spectrum LSF</em> for this. For convenience, a call to the batch submission system is stored in the environment variable <code>$SC19_SUBMIT_CMD</code>. You are welcome to adapt it once you get more familiar with the system.</p>
 <p>For now, we want to run our first benchmarking run and measure cycles and instructions for different data sizes, as a function of <code>nx</code>. The Makefile holds a target for this, call it with <code>make bench_task1</code>:</p>
 
 </div>
@@ -13233,7 +13248,7 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<p>Once the run is completed, let's have a look at the data!</p>
+<p>Once the run is completed, let's study the data!</p>
 <p>This can be done best in the interactive version of the Jupyter Notebook. In case this version of the description is unavailable to you, call the Makefile target <code>make graph_task1</code> (either with X forwarding, or download the resulting PDF).</p>
 
 </div>
@@ -13244,7 +13259,8 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 <div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="kn">import</span> <span class="nn">seaborn</span> <span class="k">as</span> <span class="nn">sns</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
+<span class="kn">import</span> <span class="nn">seaborn</span> <span class="k">as</span> <span class="nn">sns</span>
 <span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
 <span class="kn">import</span> <span class="nn">matplotlib.pyplot</span> <span class="k">as</span> <span class="nn">plt</span>
 <span class="kn">import</span> <span class="nn">common</span>
@@ -13257,6 +13273,27 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 </div>
 </div>
 
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>Execute the following cell if you want to switch to color-blind-safer colors</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">sns</span><span class="o">.</span><span class="n">set_palette</span><span class="p">(</span><span class="s2">&quot;colorblind&quot;</span><span class="p">)</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
@@ -13265,8 +13302,7 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">plt</span><span class="o">.</span><span class="n">rcParams</span><span class="p">[</span><span class="s1">&#39;figure.figsize&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="mi">14</span><span class="p">,</span> <span class="mi">6</span><span class="p">]</span>
 <span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&quot;poisson2d.ins_cyc.bin.csv&quot;</span><span class="p">,</span> <span class="n">skiprows</span><span class="o">=</span><span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">50000</span><span class="p">,</span> <span class="mi">2</span><span class="p">))</span>  <span class="c1"># Read in the CSV file from the bench run; parse with Pandas</span>
-<span class="n">common</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="s2">&quot;PM_INST_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Instructions / Loop Iteration&quot;</span><span class="p">)</span>  <span class="c1"># Normalize to each grid cell</span>
-<span class="n">common</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="s2">&quot;PM_RUN_CYC (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Cycles / Loop Iteration&quot;</span><span class="p">)</span>
+<span class="n">df</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="s2">&quot;nx&quot;</span><span class="p">]</span> <span class="o">*</span> <span class="n">df</span><span class="p">[</span><span class="s2">&quot;ny&quot;</span><span class="p">]</span>  <span class="c1"># Add a new column of the number of grid points (the product of nx and ny)</span>
 <span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>  <span class="c1"># Display the head of the Pandas dataframe</span>
 </pre></div>
 
@@ -13274,16 +13310,95 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 </div>
 </div>
 
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>Let's have a look at the counters we've just measured and see how they scale with an increasing number of grid points.</p>
+<p><em>In the following, we are always using the minimal value of the counter (indicated by »(min)«) as this should give us an estimate of the best achievable result of the architecture.</em></p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">fig</span><span class="p">,</span> <span class="p">(</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">)</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">sharex</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+<span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_RUN_CYC (min)&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax1</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+<span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_INST_CMPL (min)&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax2</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>Although some slight variations can be seen for run cycles for many grid points, the correlation looks quite linear (as one would naively expect). Let's test that by fitting a linear function!</p>
+<p><em>The details of the fitting have been extracted into a dedicated function, <code>print_and_return_fit()</code>, of the <code>common.py</code> helper file. If you're interested, <a href="common.py">go have a look at it</a>.</em></p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="k">def</span> <span class="nf">linear_function</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">):</span>
+    <span class="k">return</span> <span class="n">a</span><span class="o">*</span><span class="n">x</span><span class="o">+</span><span class="n">b</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">fit_parameters</span><span class="p">,</span> <span class="n">fit_covariance</span> <span class="o">=</span> <span class="n">common</span><span class="o">.</span><span class="n">print_and_return_fit</span><span class="p">(</span>
+    <span class="p">[</span><span class="s2">&quot;PM_RUN_CYC (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;PM_INST_CMPL (min)&quot;</span><span class="p">],</span> 
+    <span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">),</span> 
+    <span class="n">linear_function</span><span class="p">,</span>
+    <span class="n">format_uncertainty</span><span class="o">=</span><span class="s2">&quot;.4f&quot;</span>
+<span class="p">)</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>Let's overlay our fits to the graphs from before.</p>
+
+</div>
+</div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
 <div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># Plot Cycles and Instructions - both per grid cell</span>
-<span class="n">fig</span><span class="p">,</span> <span class="p">(</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">)</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">sharex</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
-<span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Cycles / Loop Iteration&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax1</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
-<span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Instructions / Loop Iteration&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax2</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">fig</span><span class="p">,</span> <span class="p">(</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">)</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">sharex</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+<span class="k">for</span> <span class="n">ax</span><span class="p">,</span> <span class="n">pmu_counter</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">([</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">],</span> <span class="p">[</span><span class="s2">&quot;PM_RUN_CYC (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;PM_INST_CMPL (min)&quot;</span><span class="p">]):</span>
+    <span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="n">pmu_counter</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+    <span class="n">ax</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span>
+        <span class="n">df</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">],</span> 
+        <span class="n">linear_function</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">],</span> <span class="o">*</span><span class="n">fit_parameters</span><span class="p">[</span><span class="n">pmu_counter</span><span class="p">]),</span> 
+        <span class="n">linestyle</span><span class="o">=</span><span class="s2">&quot;--&quot;</span><span class="p">,</span> 
+        <span class="n">label</span><span class="o">=</span><span class="s2">&quot;Fit: </span><span class="si">{:.2f}</span><span class="s2"> * x + </span><span class="si">{:.2f}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="o">*</span><span class="n">fit_parameters</span><span class="p">[</span><span class="n">pmu_counter</span><span class="p">])</span>
+    <span class="p">)</span>
+    <span class="n">ax</span><span class="o">.</span><span class="n">legend</span><span class="p">();</span>
 </pre></div>
 
     </div>
@@ -13294,7 +13409,38 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<p>What is your result? What value do the graphs come asymptotically close too?</p>
+<p>Please execute the next cell to summarize the first task.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="nb">print</span><span class="p">(</span><span class="s2">&quot;The algorithm under investigation runs about </span><span class="si">{:.0f}</span><span class="s2"> cycles and executes about </span><span class="si">{:.0f}</span><span class="s2"> instructions per grid point&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
+    <span class="o">*</span><span class="p">[</span><span class="n">fit_parameters</span><span class="p">[</span><span class="n">pmu_counter</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">pmu_counter</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;PM_RUN_CYC (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;PM_INST_CMPL (min)&quot;</span><span class="p">]]</span>
+<span class="p">))</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p><strong>Bonus:</strong></p>
+<p>The linear fits also calculate a y-intercept (»<code>b</code>«). How do you interpret this value?</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
 <p>We are revisiting the graph in a little while.</p>
 <p><a href="#toc">Back to top</a></p>
 
@@ -13307,7 +13453,8 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 <h2 id="Task-2:-Measuring-Loads-and-Stores">Task 2: Measuring Loads and Stores<a class="anchor-link" href="#Task-2:-Measuring-Loads-and-Stores">&#182;</a></h2><p><a name="task2"></a></p>
 <p>Looking at the source code, how many loads and stores from / to memory do you expect? Have a look at the loop which we instrumented.</p>
 <p>Let's compare your estimate to what the system actually does!</p>
-<p><a name="task2-a"></a><strong>TASK A</strong>: Please measure counters for loads and stores. See the TODOs in <a href="/edit/Tasks/poisson2d.ld_st.c"><code>poisson2d.ld_st.c</code></a>. This time, implement <code>PM_LD_CMPL</code> and <code>PM_ST_CMPL</code>.</p>
+<h3 id="Task-A">Task A<a class="anchor-link" href="#Task-A">&#182;</a></h3><p><a name="task2-a"></a></p>
+<p>Please measure counters for loads and stores. See the TODOs in <a href="poisson2d.ld_st.c"><code>poisson2d.ld_st.c</code></a>. This time, implement <code>PM_LD_CMPL</code> and <code>PM_ST_CMPL</code>.</p>
 <p>Compile with <code>make task2</code>, test your program with a single run with <code>make run_task2</code>, and then finally submit a benchmarking run to the batch system with <code>make bench_task2</code>. The following cell will take care of all this.</p>
 <p><a href="#toc">Back to top</a></p>
 
@@ -13330,7 +13477,7 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<p>Once the run finished, let's plot it again with the following cell (non-interactive: <code>make graph_task2a</code>).</p>
+<p>Once the run has finished, let's plot it again in the course of the following cells (non-interactive: <code>make graph_task2a</code>).</p>
 
 </div>
 </div>
@@ -13341,8 +13488,7 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_ldst</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&quot;poisson2d.ld_st.bin.csv&quot;</span><span class="p">,</span> <span class="n">skiprows</span><span class="o">=</span><span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">50000</span><span class="p">,</span> <span class="mi">2</span><span class="p">))</span>
-<span class="n">common</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="n">df_ldst</span><span class="p">,</span> <span class="s2">&quot;PM_LD_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Loads / Loop Iteration&quot;</span><span class="p">)</span>
-<span class="n">common</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="n">df_ldst</span><span class="p">,</span> <span class="s2">&quot;PM_ST_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Stores / Loop Iteration&quot;</span><span class="p">)</span>
+<span class="n">df_ldst</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df_ldst</span><span class="p">[</span><span class="s2">&quot;nx&quot;</span><span class="p">]</span> <span class="o">*</span> <span class="n">df_ldst</span><span class="p">[</span><span class="s2">&quot;ny&quot;</span><span class="p">]</span> 
 <span class="n">df_ldst</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
 </pre></div>
 
@@ -13357,8 +13503,66 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">fig</span><span class="p">,</span> <span class="p">(</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">)</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">sharex</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
-<span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Loads / Loop Iteration&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax1</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
-<span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Stores / Loop Iteration&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax2</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+<span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_LD_CMPL (min)&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax1</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+<span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_ST_CMPL (min)&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax2</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>This behaviour also looks – at first glance – linear. We can again fit a first-order polynomial (and re-use our previously defined function <code>linear_function</code>)!</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">_fit</span><span class="p">,</span> <span class="n">_cov</span> <span class="o">=</span> <span class="n">common</span><span class="o">.</span><span class="n">print_and_return_fit</span><span class="p">(</span>
+    <span class="p">[</span><span class="s2">&quot;PM_LD_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;PM_ST_CMPL (min)&quot;</span><span class="p">],</span> 
+    <span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">),</span> 
+    <span class="n">linear_function</span><span class="p">,</span>
+    <span class="n">format_value</span><span class="o">=</span><span class="s2">&quot;.4f&quot;</span>
+<span class="p">)</span>
+<span class="n">fit_parameters</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">fit_parameters</span><span class="p">,</span> <span class="o">**</span><span class="n">_fit</span><span class="p">}</span>
+<span class="n">fit_covariance</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">fit_covariance</span><span class="p">,</span> <span class="o">**</span><span class="n">_cov</span><span class="p">}</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>Let's overlay this in one common plot:</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">fig</span><span class="p">,</span> <span class="p">(</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">)</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">sharex</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+<span class="k">for</span> <span class="n">ax</span><span class="p">,</span> <span class="n">pmu_counter</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">([</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">],</span> <span class="p">[</span><span class="s2">&quot;PM_LD_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;PM_ST_CMPL (min)&quot;</span><span class="p">]):</span>
+    <span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="n">pmu_counter</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+    <span class="n">ax</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span>
+        <span class="n">df_ldst</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">],</span> 
+        <span class="n">linear_function</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">],</span> <span class="o">*</span><span class="n">fit_parameters</span><span class="p">[</span><span class="n">pmu_counter</span><span class="p">]),</span> 
+        <span class="n">linestyle</span><span class="o">=</span><span class="s2">&quot;--&quot;</span><span class="p">,</span> 
+        <span class="n">label</span><span class="o">=</span><span class="s2">&quot;Fit: </span><span class="si">{:.2f}</span><span class="s2"> * x + </span><span class="si">{:.2f}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="o">*</span><span class="n">fit_parameters</span><span class="p">[</span><span class="n">pmu_counter</span><span class="p">])</span>
+    <span class="p">)</span>
+    <span class="n">ax</span><span class="o">.</span><span class="n">legend</span><span class="p">();</span>
 </pre></div>
 
     </div>
@@ -13370,8 +13574,9 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
 <p>Did you expect more?</p>
-<p>The reason is simple: Among the load and store instructions counted by <code>PM_LD_CMPL</code> and <code>PM_ST_CMPL</code> are vector instructions which can load and store multiple (two) values at a time. To see how many <em>bytes</em> are loaded and stored, we need to measure counters for vectorized loads and stores as well.</p>
-<p><a name="task2-b"></a><strong>TASK B</strong>: Please measure counters for <em>vectorized</em> loads and <em>vectorized</em> stores. See the TODOs in <a href="/edit/Tasks/poisson2d.vld.c"><code>poisson2d.vld.c</code></a> and <a href="/edit/Tasks/poisson2d.vst.c"><code>poisson2d.vst.c</code></a> (<em>Note: These vector counters can not be measured together and need separate files and runs</em>). Can you find out the name of the counters yourself, using <code>papi_native_avail | grep VECTOR_</code>?</p>
+<p>The reason is simple: Among the load and store instructions counted by <code>PM_LD_CMPL</code> and <code>PM_ST_CMPL</code> are vector instructions which can load and store multiple (in this case: two) values at a time. To see how many <em>bytes</em> are loaded and stored, we need to measure counters for vectorized loads and stores as well.</p>
+<h3 id="TASK-B">TASK B<a class="anchor-link" href="#TASK-B">&#182;</a></h3><p><a name="task2-b"></a></p>
+<p>Please measure counters for <em>vectorized</em> loads and <em>vectorized</em> stores. See the TODOs in <a href="poisson2d.vld.c"><code>poisson2d.vld.c</code></a> and <a href="poisson2d.vst.c"><code>poisson2d.vst.c</code></a> (<em>Note: These vector counters cannot be measured together and need separate files and runs</em>). Can you find out the names of the counters yourself, using <code>papi_native_avail | grep VECTOR_</code>?</p>
 <p>Compile, test, and bench-run your program again.</p>
 <p><a href="#toc">Back to top</a></p>
 
@@ -13416,7 +13621,7 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
 <p>Let's plot it again, as soon as the run finishes! Non-interactively, call <code>graph_task2b</code>.</p>
-<p><em>We need to read in two CSV files now, which we combine to one common dataframe <code>df_vldvst</code>.</em></p>
+<p><em>Because we couldn't measure the two vector counters at the same time, we have two CSV files to read in now. We combine them into one common dataframe <code>df_vldvst</code> in the following.</em></p>
 
 </div>
 </div>
@@ -13441,8 +13646,7 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 <div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">common</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="n">df_vldvst</span><span class="p">,</span> <span class="s2">&quot;PM_VECTOR_LD_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Vector Loads / Loop Iteration&quot;</span><span class="p">)</span>
-<span class="n">common</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="n">df_vldvst</span><span class="p">,</span> <span class="s2">&quot;PM_VECTOR_ST_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Vector Stores / Loop Iteration&quot;</span><span class="p">)</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_vldvst</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df_vldvst</span><span class="p">[</span><span class="s2">&quot;nx&quot;</span><span class="p">]</span> <span class="o">*</span> <span class="n">df_vldvst</span><span class="p">[</span><span class="s2">&quot;ny&quot;</span><span class="p">]</span> 
 <span class="n">df_vldvst</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
 </pre></div>
 
@@ -13457,8 +13661,58 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">fig</span><span class="p">,</span> <span class="p">(</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">)</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">sharex</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
-<span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Vector Loads / Loop Iteration&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax1</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
-<span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Vector Stores / Loop Iteration&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax2</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+<span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_VECTOR_LD_CMPL (min)&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax1</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+<span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_VECTOR_ST_CMPL (min)&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax2</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>Here, too, there seems to be a linear correlation. Let's do our fitting and plot directly.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">_fit</span><span class="p">,</span> <span class="n">_cov</span> <span class="o">=</span> <span class="n">common</span><span class="o">.</span><span class="n">print_and_return_fit</span><span class="p">(</span>
+    <span class="p">[</span><span class="s2">&quot;PM_VECTOR_LD_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;PM_VECTOR_ST_CMPL (min)&quot;</span><span class="p">],</span> 
+    <span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">),</span> 
+    <span class="n">linear_function</span><span class="p">,</span>
+    <span class="n">format_value</span><span class="o">=</span><span class="s2">&quot;.4f&quot;</span><span class="p">,</span>
+<span class="p">)</span>
+<span class="n">fit_parameters</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">fit_parameters</span><span class="p">,</span> <span class="o">**</span><span class="n">_fit</span><span class="p">}</span>
+<span class="n">fit_covariance</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">fit_covariance</span><span class="p">,</span> <span class="o">**</span><span class="n">_cov</span><span class="p">}</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">fig</span><span class="p">,</span> <span class="p">(</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">)</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">sharex</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+<span class="k">for</span> <span class="n">ax</span><span class="p">,</span> <span class="n">pmu_counter</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">([</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">],</span> <span class="p">[</span><span class="s2">&quot;PM_VECTOR_LD_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;PM_VECTOR_ST_CMPL (min)&quot;</span><span class="p">]):</span>
+    <span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="n">pmu_counter</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+    <span class="n">ax</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span>
+        <span class="n">df_vldvst</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">],</span> 
+        <span class="n">linear_function</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">],</span> <span class="o">*</span><span class="n">fit_parameters</span><span class="p">[</span><span class="n">pmu_counter</span><span class="p">]),</span> 
+        <span class="n">linestyle</span><span class="o">=</span><span class="s2">&quot;--&quot;</span><span class="p">,</span> 
+        <span class="n">label</span><span class="o">=</span><span class="s2">&quot;Fit: </span><span class="si">{:.2f}</span><span class="s2"> * x + </span><span class="si">{:.2f}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="o">*</span><span class="n">fit_parameters</span><span class="p">[</span><span class="n">pmu_counter</span><span class="p">])</span>
+    <span class="p">)</span>
+    <span class="n">ax</span><span class="o">.</span><span class="n">legend</span><span class="p">();</span>
 </pre></div>
 
     </div>
@@ -13492,32 +13746,51 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_byte</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">()</span>
-<span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Loads / Loop Iteration&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Vector Loads / Loop Iteration&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Loads / Loop Iteration&quot;</span><span class="p">])</span><span class="o">*</span><span class="mi">8</span>
-<span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Stores / Loop Iteration&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Vector Stores / Loop Iteration&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Stores / Loop Iteration&quot;</span><span class="p">])</span><span class="o">*</span><span class="mi">8</span>
+<span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Loads&quot;</span><span class="p">]</span>  <span class="o">=</span> <span class="p">(</span><span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_VECTOR_LD_CMPL (min)&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_LD_CMPL (min)&quot;</span><span class="p">])</span><span class="o">*</span><span class="mi">8</span>
+<span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Stores&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_VECTOR_ST_CMPL (min)&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_ST_CMPL (min)&quot;</span><span class="p">])</span><span class="o">*</span><span class="mi">8</span>
 <span class="n">ax</span> <span class="o">=</span> <span class="n">df_byte</span><span class="o">.</span><span class="n">plot</span><span class="p">()</span>
-<span class="n">ax</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s2">&quot;Bytes / Loop Iteration&quot;</span><span class="p">);</span>
+<span class="n">ax</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s2">&quot;Bytes&quot;</span><span class="p">);</span>
 </pre></div>
 
     </div>
 </div>
 </div>
 
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>Let's quantify the difference by, again, fitting a linear function to the data.</p>
+
+</div>
+</div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
 <div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
-<span class="n">mean_byte_ld</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">polyfit</span><span class="p">(</span><span class="n">df_byte</span><span class="p">[</span><span class="n">df_byte</span><span class="o">.</span><span class="n">index</span> <span class="o">&gt;</span> <span class="mi">200</span><span class="p">]</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">df_byte</span><span class="p">[</span><span class="n">df_byte</span><span class="o">.</span><span class="n">index</span> <span class="o">&gt;</span> <span class="mi">200</span><span class="p">][</span><span class="s2">&quot;Loads / Loop Iteration&quot;</span><span class="p">],</span> <span class="mi">0</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
-<span class="n">mean_byte_st</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">polyfit</span><span class="p">(</span><span class="n">df_byte</span><span class="p">[</span><span class="n">df_byte</span><span class="o">.</span><span class="n">index</span> <span class="o">&gt;</span> <span class="mi">200</span><span class="p">]</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">df_byte</span><span class="p">[</span><span class="n">df_byte</span><span class="o">.</span><span class="n">index</span> <span class="o">&gt;</span> <span class="mi">200</span><span class="p">][</span><span class="s2">&quot;Stores / Loop Iteration&quot;</span><span class="p">],</span> <span class="mi">0</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
-<span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Mean byte loaded: </span><span class="si">{}</span><span class="se">\t</span><span class="s2">Mean byte stored: </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">mean_byte_ld</span><span class="p">,</span> <span class="n">mean_byte_st</span><span class="p">))</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">_fit</span><span class="p">,</span> <span class="n">_cov</span> <span class="o">=</span> <span class="n">common</span><span class="o">.</span><span class="n">print_and_return_fit</span><span class="p">(</span>
+    <span class="p">[</span><span class="s2">&quot;Loads&quot;</span><span class="p">,</span> <span class="s2">&quot;Stores&quot;</span><span class="p">],</span> 
+    <span class="n">df_byte</span><span class="p">,</span> 
+    <span class="n">linear_function</span>
+<span class="p">)</span>
+<span class="n">fit_parameters</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">fit_parameters</span><span class="p">,</span> <span class="o">**</span><span class="n">_fit</span><span class="p">}</span>
+<span class="n">fit_covariance</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">fit_covariance</span><span class="p">,</span> <span class="o">**</span><span class="n">_cov</span><span class="p">}</span>
 </pre></div>
 
     </div>
 </div>
 </div>
 
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>Analogously to the proportionality factors, this is how many bytes are loaded/stored per grid point.</p>
+
+</div>
+</div>
 </div>
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
@@ -13533,7 +13806,7 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_bandwidth</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">()</span>
-<span class="n">df_bandwidth</span><span class="p">[</span><span class="s2">&quot;Bandwidth / Byte/Cycle&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Loads / Loop Iteration&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Stores / Loop Iteration&quot;</span><span class="p">])</span> <span class="o">/</span> <span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Cycles / Loop Iteration&quot;</span><span class="p">]</span>
+<span class="n">df_bandwidth</span><span class="p">[</span><span class="s2">&quot;Bandwidth / Byte/Cycle&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Loads&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Stores&quot;</span><span class="p">])</span> <span class="o">/</span> <span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_RUN_CYC (min)&quot;</span><span class="p">]</span>
 </pre></div>
 
     </div>
@@ -13544,7 +13817,7 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<p>Let's display it as a function of <code>nx</code>. And also compare it to the available L1 cache bandwidth in a second (sub-)plot. Non-interactive users, call <code>make graph_task2c</code>.</p>
+<p>Let's display it as a function of the number of grid points and also compare it to the available L1 cache bandwidth in a second (sub-)plot. Non-interactive users, call <code>make graph_task2c</code>.</p>
 
 </div>
 </div>
@@ -13580,7 +13853,7 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 <div class="text_cell_render border-box-sizing rendered_html">
 <h2 id="Task-E1:-Measuring-FlOps">Task E1: Measuring FlOps<a class="anchor-link" href="#Task-E1:-Measuring-FlOps">&#182;</a></h2><p><a name="taske1"></a></p>
 <p>If you still have time, feel free to work on the following extended task.</p>
-<p><strong>TASK</strong>: Please measure counters for <em>vectorized</em> floating point operations and <em>scalar</em> floating point operations. The two counters can also not be measured during the same run. So please see the TODOs in <a href="/edit/Tasks/poisson2d.sflops.c"><code>poisson2d.sflops.c</code></a> and <a href="/edit/Tasks/poisson2d.vflops.c"><code>poisson2d.vflops.c</code></a>. By now you should be able to find out the names of the counters by yourself (<em>Hint: they include the words scalar and vector…</em>).</p>
+<p><strong>TASK</strong>: Please measure counters for <em>vectorized</em> floating point operations and <em>scalar</em> floating point operations. These two counters also cannot be measured during the same run, so please see the TODOs in <a href="/edit/Tasks/poisson2d.sflops.c"><code>poisson2d.sflops.c</code></a> and <a href="/edit/Tasks/poisson2d.vflops.c"><code>poisson2d.vflops.c</code></a>. By now you should be able to find out the names of the counters by yourself (<em>Hint: they include the words »scalar« and »vector«…</em>).</p>
 <p>As usual, compile, test, and bench-run your program.</p>
 <p><a href="#toc">Back to top</a></p>
 
@@ -13608,6 +13881,7 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_sflop</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&quot;poisson2d.sflop.bin.csv&quot;</span><span class="p">,</span> <span class="n">skiprows</span><span class="o">=</span><span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">50000</span><span class="p">,</span> <span class="mi">2</span><span class="p">))</span>
 <span class="n">df_vflop</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&quot;poisson2d.vflop.bin.csv&quot;</span><span class="p">,</span> <span class="n">skiprows</span><span class="o">=</span><span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">50000</span><span class="p">,</span> <span class="mi">2</span><span class="p">))</span>
 <span class="n">df_flop</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">concat</span><span class="p">([</span><span class="n">df_sflop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">),</span> <span class="n">df_vflop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[[</span><span class="s1">&#39;PM_VECTOR_FLOP_CMPL (total)&#39;</span><span class="p">,</span> <span class="s1">&#39;PM_VECTOR_FLOP_CMPL (min)&#39;</span><span class="p">,</span> <span class="s1">&#39; PM_VECTOR_FLOP_CMPL (max)&#39;</span><span class="p">]]],</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span>
+<span class="n">df_flop</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
 </pre></div>
 
     </div>
@@ -13618,7 +13892,7 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<p>The name of the vector counter is a bit misleading; not floating point operations are measured but floating point instructions. To get <em>real</em> floating point operations, each value needs to be multiplied by the vector width (2). We can plot the values afterwards (non-interactive: <code>make graph_task4</code>).</p>
+<p>Again, the name of the vector counter is a bit misleading: it counts floating point instructions, not floating point operations. To get <em>real</em> floating point operations, each value needs to be multiplied by the vector width (2). We can plot the values afterwards (non-interactive: <code>make graph_task4</code>).</p>
 
 </div>
 </div>
@@ -13628,9 +13902,22 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 <div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">common</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="n">df_flop</span><span class="p">,</span> <span class="s2">&quot;PM_SCALAR_FLOP_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Scalar FlOps / Loop Iteration&quot;</span><span class="p">)</span>
-<span class="n">common</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="n">df_flop</span><span class="p">,</span> <span class="s2">&quot;PM_VECTOR_FLOP_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Vector Instructions / Loop Iteration&quot;</span><span class="p">)</span>
-<span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;Vector FlOps / Loop Iteration&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;Vector Instructions / Loop Iteration&quot;</span><span class="p">]</span> <span class="o">*</span> <span class="mi">2</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;nx&quot;</span><span class="p">]</span> <span class="o">*</span> <span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;ny&quot;</span><span class="p">]</span>
+<span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;Vector FlOps (min)&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;PM_VECTOR_FLOP_CMPL (min)&quot;</span><span class="p">]</span> <span class="o">*</span> <span class="mi">2</span>
+<span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;Scalar FlOps (min)&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;PM_SCALAR_FLOP_CMPL (min)&quot;</span><span class="p">]</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_flop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[[</span><span class="s2">&quot;Scalar FlOps (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Vector FlOps (min)&quot;</span><span class="p">]]</span><span class="o">.</span><span class="n">plot</span><span class="p">();</span>
 </pre></div>
 
     </div>
@@ -13643,7 +13930,13 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 <div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_flop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[[</span><span class="s2">&quot;Scalar FlOps / Loop Iteration&quot;</span><span class="p">,</span> <span class="s2">&quot;Vector FlOps / Loop Iteration&quot;</span><span class="p">]]</span><span class="o">.</span><span class="n">plot</span><span class="p">();</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">_fit</span><span class="p">,</span> <span class="n">_cov</span> <span class="o">=</span> <span class="n">common</span><span class="o">.</span><span class="n">print_and_return_fit</span><span class="p">(</span>
+    <span class="p">[</span><span class="s2">&quot;Scalar FlOps (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Vector FlOps (min)&quot;</span><span class="p">],</span> 
+    <span class="n">df_flop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">),</span> 
+    <span class="n">linear_function</span>
+<span class="p">)</span>
+<span class="n">fit_parameters</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">fit_parameters</span><span class="p">,</span> <span class="o">**</span><span class="n">_fit</span><span class="p">}</span>
+<span class="n">fit_covariance</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">fit_covariance</span><span class="p">,</span> <span class="o">**</span><span class="n">_cov</span><span class="p">}</span>
 </pre></div>
 
     </div>
@@ -13668,10 +13961,10 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 <div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">I_flop_scalar</span> <span class="o">=</span> <span class="n">df_flop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Scalar FlOps / Loop Iteration&quot;</span><span class="p">]</span>
-<span class="n">I_flop_vector</span> <span class="o">=</span> <span class="n">df_flop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Vector FlOps / Loop Iteration&quot;</span><span class="p">]</span>
-<span class="n">I_mem_load</span>    <span class="o">=</span> <span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Loads / Loop Iteration&quot;</span><span class="p">]</span>
-<span class="n">I_mem_store</span>   <span class="o">=</span> <span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Stores / Loop Iteration&quot;</span><span class="p">]</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">I_flop_scalar</span> <span class="o">=</span> <span class="n">df_flop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;Scalar FlOps (min)&quot;</span><span class="p">]</span>
+<span class="n">I_flop_vector</span> <span class="o">=</span> <span class="n">df_flop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;Vector FlOps (min)&quot;</span><span class="p">]</span>
+<span class="n">I_mem_load</span>    <span class="o">=</span> <span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Loads&quot;</span><span class="p">]</span>
+<span class="n">I_mem_store</span>   <span class="o">=</span> <span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Stores&quot;</span><span class="p">]</span>
 </pre></div>
 
     </div>
@@ -13708,6 +14001,7 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 <div class="text_cell_render border-box-sizing rendered_html">
 <h2 id="Task-E2:-Measuring-a-Larger-Range">Task E2: Measuring a Larger Range<a class="anchor-link" href="#Task-E2:-Measuring-a-Larger-Range">&#182;</a></h2><p><a name="taske2"></a></p>
 <p>If you still still have time, you might venture into your own benchmarking adventure.</p>
+<p>Maybe you noticed already, for instance in Task 2 C: At the very right, towards very large numbers of grid points, the behaviour of the graph changed. Something is happening there!</p>
 <p><strong>TASK</strong>: Revisit the counters measured above for a larger range of <code>nx</code>. Right now, we only studied <code>nx</code> until 1000. New effects appear above that value – partly only well above, though ($nx &gt; 15000$).</p>
 <p>You're on your own here. Edit the <code>bench.sh</code> script to change the range and the stepping increments.</p>
 <p><strong>Good luck!</strong></p>
diff --git a/2-Performance_Counters/Handson/Hands-On-Performance-Counters.ipynb b/2-Performance_Counters/Handson/Hands-On-Performance-Counters.ipynb
index 7942959e4d442db3198e4a5178f0cbe4da613c5f..c704269f97e25628079d4c31f8ef107f457e69d9 100644
--- a/2-Performance_Counters/Handson/Hands-On-Performance-Counters.ipynb
+++ b/2-Performance_Counters/Handson/Hands-On-Performance-Counters.ipynb
@@ -6,7 +6,7 @@
    "source": [
     "# Hands-On: Performance Counters\n",
     "\n",
-    "This Notebook is part of the exercises for the SC18 Tutorial »Application Porting and Optimization on GPU-accelerated POWER Architectures«. It is to be run on a POWER9 machine; in the tutorial: on Ascent, the POWER9 training cluster of Oak Ridge National Lab.\n",
+    "This Notebook is part of the exercises for the SC19 Tutorial »Application Porting and Optimization on GPU-accelerated POWER Architectures«. It is to be run on a POWER9 machine; in the tutorial: on Ascent, the POWER9 training cluster of Oak Ridge National Lab.\n",
     "\n",
     "This Notebook can be run interactively on Ascent. If this capability is unavailable to you, use it as a description for executing the tasks on Ascent via a shell access. During data evaluation, the Notebook mentions the corresponding commands to execute in case you are not able to run the Notebook interactively directly on Ascent.\n",
     "\n",
@@ -43,18 +43,28 @@
     "}\n",
     "```\n",
     "\n",
-    "After `PAPI_add_named_event()` is used to add named PMU events outside of the relaxation iteration, `PAPI_start()`\n",
+    "The code is instrumented using PAPI. The API routine `PAPI_add_named_event()` is used to add *named* PMU events outside of the relaxation iteration. After that, calls to `PAPI_start()`\n",
     "and `PAPI_stop()` can be used to count how often a PMU event is incremented.\n",
     "\n",
-    "For the first task, we will measure quantities often used to characterize an application, cycles and instructions.\n",
+    "For the first task, we will measure quantities often used to characterize an application: cycles and instructions.\n",
     "\n",
-    "**TASK**: Please measure counters for completed instructions and run cycles. See the TODOs in [`poisson2d.ins_cyc.c`](/edit/Tasks/poisson2d.ins_cyc.c). Either edit with Jupyter capabilities by clicking on the link of the file or use a dedicated editor (`vim` is available). The names of the counters to be implemented are `PM_INST_CMPL` and `PM_RUN_CYC`.\n",
+    "**TASK**: Please measure counters for completed instructions and run cycles. See the TODOs in file [`poisson2d.ins_cyc.c`](poisson2d.ins_cyc.c). You can either edit the files with Jupyter capabilities by clicking on the link of the file or selecting it in the file drawer on the left; or use a dedicated editor on the system (`vim` is available). The names of the counters to be implemented are `PM_INST_CMPL` and `PM_RUN_CYC`.\n",
     "\n",
-    "After changing the source code, compile it with `make task1` or by executing the following cell (we need to change directories first, though).\n",
+    "After changing the source code, compile it with `make task1` or by executing the following cell (we need to change directories first, though).  \n",
+    "*(Using the `Makefile` we have hidden quite a few intricacies from you in order to focus on the relevant content at hand. Don't worry too much about it right now – we'll un-hide it gradually during the course of the tutorial.)*\n",
     "\n",
     "[Back to top](#toc)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pwd"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -78,7 +88,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Make sure your program is measuring correctly, by invoking it, for instance with these arguments: `./poisson2d.ins_cyc.bin 100 64 32` – see the next cell. The `100` specifies the number of iterations to perform, `64` and `32` are the size of the grid in y and x direction, respectively."
+    "Before we launch our measurement campaign we should make sure that the program is measuring correctly. Let's invoke it, for instance, with these arguments: `./poisson2d.ins_cyc.bin 100 64 32` – see the next cell. The `100` specifies the number of iterations to perform, `64` and `32` are the size of the grid in y and x direction, respectively."
    ]
   },
   {
@@ -95,7 +105,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "For the following runs, we are going to use Ascent's compute backend nodes which are not shared amongst users and also have six GPUs available. We use the available batch scheduler *IBM Spectrum LSF* for this. For convenience, a call to the batch submission system is stored in the environment variable `$SC18_SUBMIT_CMD`. You are welcome to adapt it once you get more familiar with the system.\n",
+    "Alright! That should return a comma-separated list of measurements.\n",
+    "\n",
+    "For the following runs, we are going to use Ascent's compute backend nodes which are not shared amongst users and also have six GPUs available (each!). We use the available batch scheduler *IBM Spectrum LSF* for this. For convenience, a call to the batch submission system is stored in the environment variable `$SC19_SUBMIT_CMD`. You are welcome to adapt it once you get more familiar with the system.\n",
     "\n",
     "For now, we want to run our first benchmarking run and measure cycles and instructions for different data sizes, as a function of `nx`. The Makefile holds a target for this, call it with `make bench_task1`:"
    ]
@@ -113,7 +125,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Once the run is completed, let's have a look at the data!\n",
+    "Once the run is completed, let's study the data!\n",
     "\n",
     "This can be done best in the interactive version of the Jupyter Notebook. In case this version of the description is unavailable to you, call the Makefile target `make graph_task1` (either with X forwarding, or download the resulting PDF)."
    ]
@@ -124,6 +136,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import numpy as np\n",
     "import seaborn as sns\n",
     "import pandas as pd\n",
     "import matplotlib.pyplot as plt\n",
@@ -133,6 +146,22 @@
     "plt.rcParams['figure.figsize'] = [14, 6]"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Execute the following cell if you want to switch to color-blind-safer colors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sns.set_palette(\"colorblind\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -141,29 +170,119 @@
    "source": [
     "plt.rcParams['figure.figsize'] = [14, 6]\n",
     "df = pd.read_csv(\"poisson2d.ins_cyc.bin.csv\", skiprows=range(2, 50000, 2))  # Read in the CSV file from the bench run; parse with Pandas\n",
-    "common.normalize(df, \"PM_INST_CMPL (min)\", \"Instructions / Loop Iteration\")  # Normalize to each grid cell\n",
-    "common.normalize(df, \"PM_RUN_CYC (min)\", \"Cycles / Loop Iteration\")\n",
+    "df[\"Grid Points\"] = df[\"nx\"] * df[\"ny\"]  # Add a new column of the number of grid points (the product of nx and ny)\n",
     "df.head()  # Display the head of the Pandas dataframe"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's have a look at the counters we've just measured and see how they scale with an increasing number of grid points.\n",
+    "\n",
+    "*In the following, we are always using the minimal value of the counter (indicated by »(min)«) as this should give us an estimate of the best achievable result of the architecture.*"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Plot Cycles and Instructions - both per grid cell\n",
     "fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n",
-    "df.set_index(\"nx\")[\"Cycles / Loop Iteration\"].plot(ax=ax1, legend=True);\n",
-    "df.set_index(\"nx\")[\"Instructions / Loop Iteration\"].plot(ax=ax2, legend=True);"
+    "df.set_index(\"Grid Points\")[\"PM_RUN_CYC (min)\"].plot(ax=ax1, legend=True);\n",
+    "df.set_index(\"Grid Points\")[\"PM_INST_CMPL (min)\"].plot(ax=ax2, legend=True);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Although some slight variations can be seen for run cycles for many grid points, the correlation looks quite linear (as one would naively expect). Let's test that by fitting a linear function!\n",
+    "\n",
+    "*The details of the fitting have been extracted into a dedicated function, `print_and_return_fit()`, of the `common.py` helper file. If you're interested, [go have a look at it](common.py).* "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def linear_function(x, a, b):\n",
+    "    return a*x+b"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fit_parameters, fit_covariance = common.print_and_return_fit(\n",
+    "    [\"PM_RUN_CYC (min)\", \"PM_INST_CMPL (min)\"], \n",
+    "    df.set_index(\"Grid Points\"), \n",
+    "    linear_function,\n",
+    "    format_uncertainty=\".4f\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's overlay our fits to the graphs from before."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n",
+    "for ax, pmu_counter in zip([ax1, ax2], [\"PM_RUN_CYC (min)\", \"PM_INST_CMPL (min)\"]):\n",
+    "    df.set_index(\"Grid Points\")[pmu_counter].plot(ax=ax, legend=True);\n",
+    "    ax.plot(\n",
+    "        df[\"Grid Points\"], \n",
+    "        linear_function(df[\"Grid Points\"], *fit_parameters[pmu_counter]), \n",
+    "        linestyle=\"--\", \n",
+    "        label=\"Fit: {:.2f} * x + {:.2f}\".format(*fit_parameters[pmu_counter])\n",
+    "    )\n",
+    "    ax.legend();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Please execute the next cell to summarize the first task."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"The algorithm under investigation runs about {:.0f} cycles and executes about {:.0f} instructions per grid point\".format(\n",
+    "    *[fit_parameters[pmu_counter][0] for pmu_counter in [\"PM_RUN_CYC (min)\", \"PM_INST_CMPL (min)\"]]\n",
+    "))"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "What is your result? What value do the graphs come asymptotically close too?\n",
+    "**Bonus:**\n",
     "\n",
+    "The linear fits also calculate a y-intercept (»`b`«). How do you interpret this value?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "We are revisiting the graph in a little while.\n",
     "\n",
     "[Back to top](#toc)"
@@ -180,7 +299,10 @@
     "\n",
     "Let's compare your estimate to what the system actually does!\n",
     "\n",
-    "<a name=\"task2-a\"></a>**TASK A**: Please measure counters for loads and stores. See the TODOs in [`poisson2d.ld_st.c`](/edit/Tasks/poisson2d.ld_st.c). This time, implement `PM_LD_CMPL` and `PM_ST_CMPL`.\n",
+    "### Task A\n",
+    "<a name=\"task2-a\"></a>\n",
+    "\n",
+    "Please measure counters for loads and stores. See the TODOs in [`poisson2d.ld_st.c`](/edit/Tasks/poisson2d.ld_st.c). This time, implement `PM_LD_CMPL` and `PM_ST_CMPL`.\n",
     "\n",
     "Compile with `make task2`, test your program with a single run with `make run_task2`, and then finally submit a benchmarking run to the batch system with `make bench_task2`. The following cell will take care of all this.\n",
     "\n",
@@ -200,7 +322,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Once the run finished, let's plot it again with the following cell (non-interactive: `make graph_task2a`)."
+    "Once the run has finished, let's plot it again in the course of the following cells (non-interactive: `make graph_task2a`)."
    ]
   },
   {
@@ -210,8 +332,7 @@
    "outputs": [],
    "source": [
     "df_ldst = pd.read_csv(\"poisson2d.ld_st.bin.csv\", skiprows=range(2, 50000, 2))\n",
-    "common.normalize(df_ldst, \"PM_LD_CMPL (min)\", \"Loads / Loop Iteration\")\n",
-    "common.normalize(df_ldst, \"PM_ST_CMPL (min)\", \"Stores / Loop Iteration\")\n",
+    "df_ldst[\"Grid Points\"] = df_ldst[\"nx\"] * df_ldst[\"ny\"] \n",
     "df_ldst.head()"
    ]
   },
@@ -222,8 +343,56 @@
    "outputs": [],
    "source": [
     "fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n",
-    "df_ldst.set_index(\"nx\")[\"Loads / Loop Iteration\"].plot(ax=ax1, legend=True);\n",
-    "df_ldst.set_index(\"nx\")[\"Stores / Loop Iteration\"].plot(ax=ax2, legend=True);"
+    "df_ldst.set_index(\"Grid Points\")[\"PM_LD_CMPL (min)\"].plot(ax=ax1, legend=True);\n",
+    "df_ldst.set_index(\"Grid Points\")[\"PM_ST_CMPL (min)\"].plot(ax=ax2, legend=True);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This behaviour also looks – at first glance – linear. We can again fit a first-order polynomial (and re-use our previously defined function `linear_function`)!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_fit, _cov = common.print_and_return_fit(\n",
+    "    [\"PM_LD_CMPL (min)\", \"PM_ST_CMPL (min)\"], \n",
+    "    df_ldst.set_index(\"Grid Points\"), \n",
+    "    linear_function,\n",
+    "    format_value=\".4f\"\n",
+    ")\n",
+    "fit_parameters = {**fit_parameters, **_fit}\n",
+    "fit_covariance = {**fit_covariance, **_cov}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's overlay this in one common plot:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n",
+    "for ax, pmu_counter in zip([ax1, ax2], [\"PM_LD_CMPL (min)\", \"PM_ST_CMPL (min)\"]):\n",
+    "    df_ldst.set_index(\"Grid Points\")[pmu_counter].plot(ax=ax, legend=True);\n",
+    "    ax.plot(\n",
+    "        df_ldst[\"Grid Points\"], \n",
+    "        linear_function(df_ldst[\"Grid Points\"], *fit_parameters[pmu_counter]), \n",
+    "        linestyle=\"--\", \n",
+    "        label=\"Fit: {:.2f} * x + {:.2f}\".format(*fit_parameters[pmu_counter])\n",
+    "    )\n",
+    "    ax.legend();"
    ]
   },
   {
@@ -232,9 +401,12 @@
    "source": [
     "Did you expect more?\n",
     "\n",
-    "The reason is simple: Among the load and store instructions counted by `PM_LD_CMPL` and `PM_ST_CMPL` are vector instructions which can load and store multiple (two) values at a time. To see how many *bytes* are loaded and stored, we need to measure counters for vectorized loads and stores as well.\n",
+    "The reason is simple: Among the load and store instructions counted by `PM_LD_CMPL` and `PM_ST_CMPL` are vector instructions which can load and store multiple (in this case: two) values at a time. To see how many *bytes* are loaded and stored, we need to measure counters for vectorized loads and stores as well.\n",
     "\n",
-    "<a name=\"task2-b\"></a>**TASK B**: Please measure counters for _vectorized_ loads and _vectorized_ stores. See the TODOs in [`poisson2d.vld.c`](/edit/Tasks/poisson2d.vld.c) and [`poisson2d.vst.c`](/edit/Tasks/poisson2d.vst.c) (*Note: These vector counters can not be measured together and need separate files and runs*). Can you find out the name of the counters yourself, using `papi_native_avail | grep VECTOR_`?\n",
+    "### Task B\n",
+    "<a name=\"task2-b\"></a>\n",
+    "\n",
+    "Please measure counters for _vectorized_ loads and _vectorized_ stores. See the TODOs in [`poisson2d.vld.c`](poisson2d.vld.c) and [`poisson2d.vst.c`](poisson2d.vst.c) (*Note: These vector counters can not be measured together and need separate files and runs*). Can you find out the name of the counters yourself, using `papi_native_avail | grep VECTOR_`?\n",
     "\n",
     "Compile, test, and bench-run your program again.\n",
     "\n",
@@ -272,7 +444,7 @@
    "source": [
     "Let's plot it again, as soon as the run finishes! Non-interactively, call `graph_task2b`.\n",
     "\n",
-    "*We need to read in two CSV files now, which we combine to one common dataframe `df_vldvst`.*"
+    "*Because we couldn't measure the two vector counters at the same time, we have two CSV files to read in now. We combine them into one common dataframe `df_vldvst` in the following.*"
    ]
   },
   {
@@ -292,8 +464,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "common.normalize(df_vldvst, \"PM_VECTOR_LD_CMPL (min)\", \"Vector Loads / Loop Iteration\")\n",
-    "common.normalize(df_vldvst, \"PM_VECTOR_ST_CMPL (min)\", \"Vector Stores / Loop Iteration\")\n",
+    "df_vldvst[\"Grid Points\"] = df_vldvst[\"nx\"] * df_vldvst[\"ny\"] \n",
     "df_vldvst.head()"
    ]
   },
@@ -304,8 +475,49 @@
    "outputs": [],
    "source": [
     "fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n",
-    "df_vldvst.set_index(\"nx\")[\"Vector Loads / Loop Iteration\"].plot(ax=ax1, legend=True);\n",
-    "df_vldvst.set_index(\"nx\")[\"Vector Stores / Loop Iteration\"].plot(ax=ax2, legend=True);"
+    "df_vldvst.set_index(\"Grid Points\")[\"PM_VECTOR_LD_CMPL (min)\"].plot(ax=ax1, legend=True);\n",
+    "df_vldvst.set_index(\"Grid Points\")[\"PM_VECTOR_ST_CMPL (min)\"].plot(ax=ax2, legend=True);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here, too, there seems to be a linear correlation. Let's do our fitting and plot directly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_fit, _cov = common.print_and_return_fit(\n",
+    "    [\"PM_VECTOR_LD_CMPL (min)\", \"PM_VECTOR_ST_CMPL (min)\"], \n",
+    "    df_vldvst.set_index(\"Grid Points\"), \n",
+    "    linear_function,\n",
+    "    format_value=\".4f\",\n",
+    ")\n",
+    "fit_parameters = {**fit_parameters, **_fit}\n",
+    "fit_covariance = {**fit_covariance, **_cov}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n",
+    "for ax, pmu_counter in zip([ax1, ax2], [\"PM_VECTOR_LD_CMPL (min)\", \"PM_VECTOR_ST_CMPL (min)\"]):\n",
+    "    df_vldvst.set_index(\"Grid Points\")[pmu_counter].plot(ax=ax, legend=True);\n",
+    "    ax.plot(\n",
+    "        df_vldvst[\"Grid Points\"], \n",
+    "        linear_function(df_vldvst[\"Grid Points\"], *fit_parameters[pmu_counter]), \n",
+    "        linestyle=\"--\", \n",
+    "        label=\"Fit: {:.2f} * x + {:.2f}\".format(*fit_parameters[pmu_counter])\n",
+    "    )\n",
+    "    ax.legend();"
    ]
   },
   {
@@ -339,10 +551,17 @@
    "outputs": [],
    "source": [
     "df_byte = pd.DataFrame()\n",
-    "df_byte[\"Loads / Loop Iteration\"] = (df_vldvst.set_index(\"nx\")[\"Vector Loads / Loop Iteration\"] + df_ldst.set_index(\"nx\")[\"Loads / Loop Iteration\"])*8\n",
-    "df_byte[\"Stores / Loop Iteration\"] = (df_vldvst.set_index(\"nx\")[\"Vector Stores / Loop Iteration\"] + df_ldst.set_index(\"nx\")[\"Stores / Loop Iteration\"])*8\n",
+    "df_byte[\"Loads\"]  = (df_vldvst.set_index(\"Grid Points\")[\"PM_VECTOR_LD_CMPL (min)\"] + df_ldst.set_index(\"Grid Points\")[\"PM_LD_CMPL (min)\"])*8\n",
+    "df_byte[\"Stores\"] = (df_vldvst.set_index(\"Grid Points\")[\"PM_VECTOR_ST_CMPL (min)\"] + df_ldst.set_index(\"Grid Points\")[\"PM_ST_CMPL (min)\"])*8\n",
     "ax = df_byte.plot()\n",
-    "ax.set_ylabel(\"Bytes / Loop Iteration\");"
+    "ax.set_ylabel(\"Bytes\");"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's quantify the difference by, again, fitting a linear function to the data."
    ]
   },
   {
@@ -351,10 +570,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import numpy as np\n",
-    "mean_byte_ld = np.polyfit(df_byte[df_byte.index > 200].index, df_byte[df_byte.index > 200][\"Loads / Loop Iteration\"], 0)[0]\n",
-    "mean_byte_st = np.polyfit(df_byte[df_byte.index > 200].index, df_byte[df_byte.index > 200][\"Stores / Loop Iteration\"], 0)[0]\n",
-    "print(\"Mean byte loaded: {}\\tMean byte stored: {}\".format(mean_byte_ld, mean_byte_st))"
+    "_fit, _cov = common.print_and_return_fit(\n",
+    "    [\"Loads\", \"Stores\"], \n",
+    "    df_byte, \n",
+    "    linear_function\n",
+    ")\n",
+    "fit_parameters = {**fit_parameters, **_fit}\n",
+    "fit_covariance = {**fit_covariance, **_cov}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Analogously to before, the proportionality factors (`a`) give the number of bytes loaded/stored per grid point."
    ]
   },
   {
@@ -371,14 +600,14 @@
    "outputs": [],
    "source": [
     "df_bandwidth = pd.DataFrame()\n",
-    "df_bandwidth[\"Bandwidth / Byte/Cycle\"] = (df_byte[\"Loads / Loop Iteration\"] + df_byte[\"Stores / Loop Iteration\"]) / df.set_index(\"nx\")[\"Cycles / Loop Iteration\"]"
+    "df_bandwidth[\"Bandwidth / Byte/Cycle\"] = (df_byte[\"Loads\"] + df_byte[\"Stores\"]) / df.set_index(\"Grid Points\")[\"PM_RUN_CYC (min)\"]"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Let's display it as a function of `nx`. And also compare it to the available L1 cache bandwidth in a second (sub-)plot. Non-interactive users, call `make graph_task2c`."
+    "Let's display it as a function of grid points. And also compare it to the available L1 cache bandwidth in a second (sub-)plot. Non-interactive users, call `make graph_task2c`."
    ]
   },
   {
@@ -412,7 +641,7 @@
     "If you still have time, feel free to work on the following extended task.\n",
     "\n",
     "\n",
-    "**TASK**: Please measure counters for _vectorized_ floating point operations and _scalar_ floating point operations. The two counters can also not be measured during the same run. So please see the TODOs in [`poisson2d.sflops.c`](/edit/Tasks/poisson2d.sflops.c) and [`poisson2d.vflops.c`](/edit/Tasks/poisson2d.vflops.c). By now you should be able to find out the names of the counters by yourself (*Hint: they include the words scalar and vector…*).\n",
+    "**TASK**: Please measure counters for _vectorized_ floating point operations and _scalar_ floating point operations. The two counters can also not be measured during the same run. So please see the TODOs in [`poisson2d.sflops.c`](/edit/Tasks/poisson2d.sflops.c) and [`poisson2d.vflops.c`](/edit/Tasks/poisson2d.vflops.c). By now you should be able to find out the names of the counters by yourself (*Hint: they include the words »scalar« and »vector«…*).\n",
     "\n",
     "As usual, compile, test, and bench-run your program.\n",
     "\n",
@@ -436,14 +665,26 @@
    "source": [
     "df_sflop = pd.read_csv(\"poisson2d.sflop.bin.csv\", skiprows=range(2, 50000, 2))\n",
     "df_vflop = pd.read_csv(\"poisson2d.vflop.bin.csv\", skiprows=range(2, 50000, 2))\n",
-    "df_flop = pd.concat([df_sflop.set_index(\"nx\"), df_vflop.set_index(\"nx\")[['PM_VECTOR_FLOP_CMPL (total)', 'PM_VECTOR_FLOP_CMPL (min)', ' PM_VECTOR_FLOP_CMPL (max)']]], axis=1).reset_index()"
+    "df_flop = pd.concat([df_sflop.set_index(\"nx\"), df_vflop.set_index(\"nx\")[['PM_VECTOR_FLOP_CMPL (total)', 'PM_VECTOR_FLOP_CMPL (min)', ' PM_VECTOR_FLOP_CMPL (max)']]], axis=1).reset_index()\n",
+    "df_flop.head()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The name of the vector counter is a bit misleading; not floating point operations are measured but floating point instructions. To get *real* floating point operations, each value needs to be multiplied by the vector width (2). We can plot the values afterwards (non-interactive: `make graph_task4`)."
+    "Again, the name of the vector counter is a bit misleading; not floating point operations are measured but floating point instructions. To get *real* floating point operations, each value needs to be multiplied by the vector width (2). We can plot the values afterwards (non-interactive: `make graph_task4`)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_flop[\"Grid Points\"] = df_flop[\"nx\"] * df_flop[\"ny\"]\n",
+    "df_flop[\"Vector FlOps (min)\"] = df_flop[\"PM_VECTOR_FLOP_CMPL (min)\"] * 2\n",
+    "df_flop[\"Scalar FlOps (min)\"] = df_flop[\"PM_SCALAR_FLOP_CMPL (min)\"]"
    ]
   },
   {
@@ -452,9 +693,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "common.normalize(df_flop, \"PM_SCALAR_FLOP_CMPL (min)\", \"Scalar FlOps / Loop Iteration\")\n",
-    "common.normalize(df_flop, \"PM_VECTOR_FLOP_CMPL (min)\", \"Vector Instructions / Loop Iteration\")\n",
-    "df_flop[\"Vector FlOps / Loop Iteration\"] = df_flop[\"Vector Instructions / Loop Iteration\"] * 2"
+    "df_flop.set_index(\"Grid Points\")[[\"Scalar FlOps (min)\", \"Vector FlOps (min)\"]].plot();"
    ]
   },
   {
@@ -463,7 +702,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_flop.set_index(\"nx\")[[\"Scalar FlOps / Loop Iteration\", \"Vector FlOps / Loop Iteration\"]].plot();"
+    "_fit, _cov = common.print_and_return_fit(\n",
+    "    [\"Scalar FlOps (min)\", \"Vector FlOps (min)\"], \n",
+    "    df_flop.set_index(\"Grid Points\"), \n",
+    "    linear_function\n",
+    ")\n",
+    "fit_parameters = {**fit_parameters, **_fit}\n",
+    "fit_covariance = {**fit_covariance, **_cov}"
    ]
   },
   {
@@ -487,10 +732,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "I_flop_scalar = df_flop.set_index(\"nx\")[\"Scalar FlOps / Loop Iteration\"]\n",
-    "I_flop_vector = df_flop.set_index(\"nx\")[\"Vector FlOps / Loop Iteration\"]\n",
-    "I_mem_load    = df_byte[\"Loads / Loop Iteration\"]\n",
-    "I_mem_store   = df_byte[\"Stores / Loop Iteration\"]"
+    "I_flop_scalar = df_flop.set_index(\"Grid Points\")[\"Scalar FlOps (min)\"]\n",
+    "I_flop_vector = df_flop.set_index(\"Grid Points\")[\"Vector FlOps (min)\"]\n",
+    "I_mem_load    = df_byte[\"Loads\"]\n",
+    "I_mem_store   = df_byte[\"Stores\"]"
    ]
   },
   {
@@ -521,6 +766,8 @@
     "\n",
     "If you still still have time, you might venture into your own benchmarking adventure.\n",
     "\n",
+    "Maybe you noticed already, for instance in Task 2 C: At the very right, towards very large numbers of grid points, the behaviour of the graph changes. Something is happening there!\n",
+    "\n",
     "\n",
     "**TASK**: Revisit the counters measured above for a larger range of `nx`. Right now, we only studied `nx` until 1000. New effects appear above that value – partly only well above, though ($nx > 15000$).\n",
     "\n",
@@ -548,9 +795,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.1"
+   "version": "3.7.0"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
diff --git a/2-Performance_Counters/Handson/Hands-On-Performance-Counters.pdf b/2-Performance_Counters/Handson/Hands-On-Performance-Counters.pdf
index 73a4faac63aa8e05cd229237c0810f168ad6fcd9..570da387d6836ef559a0c47b1a0a53bc19b847b6 100644
Binary files a/2-Performance_Counters/Handson/Hands-On-Performance-Counters.pdf and b/2-Performance_Counters/Handson/Hands-On-Performance-Counters.pdf differ
diff --git a/2-Performance_Counters/Handson/README.md b/2-Performance_Counters/Handson/README.md
index 3ee0057da9448f49ece6e6afededeafee38ff907..8887dd78d9f2db7644f575df47c48c4c41ed655e 100644
--- a/2-Performance_Counters/Handson/README.md
+++ b/2-Performance_Counters/Handson/README.md
@@ -2,7 +2,5 @@
 
 This folder holds the files for the first hands-on exercise about Performance Counters on POWER9.
 
-Make sure to load all modules of this session by typing `module load sc18/handson1` into the shell.
-
 All task description is in an accompanying Jupyter Notebook. Open it interactively on Ascent with port forwarding. If that is impossible to do, use the static convert to HTML or PDF of the Notebook and follow along accordingly.
 
diff --git a/2-Performance_Counters/Handson/Solutions/Hands-On-Performance-Counters.html b/2-Performance_Counters/Handson/Solutions/Hands-On-Performance-Counters.html
index 9880bb07996842b407fb2ee33df39f8c14388edf..70a67890f406b9636d1397ce7ca82b2e66642e19 100644
--- a/2-Performance_Counters/Handson/Solutions/Hands-On-Performance-Counters.html
+++ b/2-Performance_Counters/Handson/Solutions/Hands-On-Performance-Counters.html
@@ -2,7 +2,7 @@
 <html>
 <head><meta charset="utf-8" />
 
-<title>Hands-On-Performance-Counters</title>
+<title>Notebook</title>
 
 <script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js"></script>
 <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.3/jquery.min.js"></script>
@@ -13116,7 +13116,7 @@ div#notebook {
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h1 id="Hands-On:-Performance-Counters">Hands-On: Performance Counters<a class="anchor-link" href="#Hands-On:-Performance-Counters">&#182;</a></h1><p>This Notebook is part of the exercises for the SC18 Tutorial »Application Porting and Optimization on GPU-accelerated POWER Architectures«. It is to be run on a POWER9 machine; in the tutorial: on Ascent, the POWER9 training cluster of Oak Ridge National Lab.</p>
+<h1 id="Hands-On:-Performance-Counters">Hands-On: Performance Counters<a class="anchor-link" href="#Hands-On:-Performance-Counters">&#182;</a></h1><p>This Notebook is part of the exercises for the SC19 Tutorial »Application Porting and Optimization on GPU-accelerated POWER Architectures«. It is to be run on a POWER9 machine; in the tutorial: on Ascent, the POWER9 training cluster of Oak Ridge National Lab.</p>
 <p>This Notebook can be run interactively on Ascent. If this capability is unavailable to you, use it as a description for executing the tasks on Ascent via a shell access. During data evaluation, the Notebook mentions the corresponding commands to execute in case you are not able to run the Notebook interactively directly on Ascent.</p>
 <h2 id="Table-of-Contents">Table of Contents<a class="anchor-link" href="#Table-of-Contents">&#182;</a></h2><p><a name="toc"></a></p>
 <ul>
@@ -13149,15 +13149,47 @@ div#notebook {
     <span class="p">}</span>
 <span class="p">}</span>
 </pre></div>
-<p>After <code>PAPI_add_named_event()</code> is used to add named PMU events outside of the relaxation iteration, <code>PAPI_start()</code>
+<p>The code is instrumented using PAPI. The API routine <code>PAPI_add_named_event()</code> is used to add <em>named</em> PMU events outside of the relaxation iteration. After that, calls to <code>PAPI_start()</code>
 and <code>PAPI_stop()</code> can be used to count how often a PMU event is incremented.</p>
-<p>For the first task, we will measure quantities often used to characterize an application, cycles and instructions.</p>
-<p><strong>TASK</strong>: Please measure counters for completed instructions and run cycles. See the TODOs in <a href="/edit/Tasks/poisson2d.ins_cyc.c"><code>poisson2d.ins_cyc.c</code></a>. Either edit with Jupyter capabilities by clicking on the link of the file or use a dedicated editor (<code>vim</code> is available). The names of the counters to be implemented are <code>PM_INST_CMPL</code> and <code>PM_RUN_CYC</code>.</p>
-<p>After changing the source code, compile it with <code>make task1</code> or by executing the following cell (we need to change directories first, though).</p>
+<p>For the first task, we will measure quantities often used to characterize an application: cycles and instructions.</p>
+<p><strong>TASK</strong>: Please measure counters for completed instructions and run cycles. See the TODOs in file <a href="poisson2d.ins_cyc.c"><code>poisson2d.ins_cyc.c</code></a>. You can either edit the files with Jupyter capabilities by clicking on the link of the file or selecting it in the file drawer on the left; or use a dedicated editor on the system (<code>vim</code> is available). The names of the counters to be implemented are <code>PM_INST_CMPL</code> and <code>PM_RUN_CYC</code>.</p>
+<p>After changing the source code, compile it with <code>make task1</code> or by executing the following cell (we need to change directories first, though).<br>
+<em>(Using the <code>Makefile</code> we have hidden quite a few intricacies from you in order to focus on the relevant content at hand. Don't worry too much about it right now – we'll un-hide it gradually during the course of the tutorial.)</em></p>
 <p><a href="#toc">Back to top</a></p>
 
 </div>
 </div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[1]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="o">!</span>pwd
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>/autofs/nccsopen-svm1_home/aherten/OpenPOWER-SC19/Prototyping/2-Performance_Counters/Handson/Solutions
+</pre>
+</div>
+</div>
+
+</div>
+</div>
+
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
@@ -13193,7 +13225,7 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[20]:</div>
+<div class="prompt input_prompt">In&nbsp;[2]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="o">!</span>make task1
@@ -13213,7 +13245,7 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>gcc -DUSE_DOUBLE -Ofast -std=c99 -lm -lpapi  poisson2d.ins_cyc.c -o poisson2d.ins_cyc.bin
+<pre>gcc -DUSE_DOUBLE -Ofast -std=c99 -lm -lpapi  poisson2d.ins_cyc.c -o poisson2d.ins_cyc.bin
 </pre>
 </div>
 </div>
@@ -13225,14 +13257,14 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<p>Make sure your program is measuring correctly, by invoking it, for instance with these arguments: <code>./poisson2d.ins_cyc.bin 100 64 32</code> – see the next cell. The <code>100</code> specifies the number of iterations to perform, <code>64</code> and <code>32</code> are the size of the grid in y and x direction, respectively.</p>
+<p>Before we launch our measurement campaign we should make sure that the program is measuring correctly. Let's invoke it, for instance, with these arguments: <code>./poisson2d.ins_cyc.bin 100 64 32</code> – see the next cell. The <code>100</code> specifies the number of iterations to perform, <code>64</code> and <code>32</code> are the size of the grid in y and x direction, respectively.</p>
 
 </div>
 </div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[25]:</div>
+<div class="prompt input_prompt">In&nbsp;[1]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="o">!</span>./poisson2d.ins_cyc.bin <span class="m">100</span> <span class="m">64</span> <span class="m">32</span>
@@ -13253,8 +13285,8 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-100,64,32,0.0011,3324000,33229,34329,1902422,18803,27821
+<pre>iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
+100,64,32,0.0011,3324225,33235,33960,1859440,18357,25033
 </pre>
 </div>
 </div>
@@ -13266,7 +13298,8 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<p>For the following runs, we are going to use Ascent's compute backend nodes which are not shared amongst users and also have six GPUs available. We use the available batch scheduler <em>IBM Spectrum LSF</em> for this. For convenience, a call to the batch submission system is stored in the environment variable <code>$SC18_SUBMIT_CMD</code>. You are welcome to adapt it once you get more familiar with the system.</p>
+<p>Alright! That should return a comma-separated list of measurements.</p>
+<p>For the following runs, we are going to use Ascent's compute backend nodes which are not shared amongst users and also have six GPUs available (each!). We use the available batch scheduler <em>IBM Spectrum LSF</em> for this. For convenience, a call to the batch submission system is stored in the environment variable <code>$SC19_SUBMIT_CMD</code>. You are welcome to adapt it once you get more familiar with the system.</p>
 <p>For now, we want to run our first benchmarking run and measure cycles and instructions for different data sizes, as a function of <code>nx</code>. The Makefile holds a target for this, call it with <code>make bench_task1</code>:</p>
 
 </div>
@@ -13274,7 +13307,7 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[80]:</div>
+<div class="prompt input_prompt">In&nbsp;[2]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="o">!</span>make bench_task1
@@ -13294,524 +13327,523 @@ and <code>PAPI_stop()</code> can be used to count how often a PMU event is incre
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>gcc -DUSE_DOUBLE -Ofast -std=c99 -lm -lpapi  poisson2d.ins_cyc.c -o poisson2d.ins_cyc.bin
-bsub -W 60 -nnodes 1 -Is -P GEN110 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.ins_cyc.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv
-Job &lt;4318&gt; is submitted to default queue &lt;batch&gt;.
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.ins_cyc.bin /gpfs/wolf/trn003/scratch/aherten//poisson2d.ins_cyc.bin.csv
+Job &lt;24059&gt; is submitted to default queue &lt;batch&gt;.
 &lt;&lt;Waiting for dispatch ...&gt;&gt;
 &lt;&lt;Starting on login1&gt;&gt;
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,4,0.0012,548153,2735,3888,266504,1243,4753
+200,32,4,0.0012,572978,2861,3639,261330,1235,4684
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,8,0.0014,1082153,5405,6558,668070,3227,6573
+200,32,8,0.0014,1082978,5411,6189,601962,2914,5099
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,12,0.0014,1442153,7205,8358,872094,4181,12974
+200,32,12,0.0014,1442978,7211,7989,811603,3992,5761
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,16,0.0015,1802153,9005,10158,1074585,5230,7975
+200,32,16,0.0014,1802978,9011,9789,1017305,4988,7017
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,20,0.0015,2162153,10805,11958,1281118,6236,14107
+200,32,20,0.0015,2162978,10811,11589,1221559,6002,7999
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,24,0.0016,2522153,12605,13758,1479347,7222,10037
+200,32,24,0.0016,2522978,12611,13389,1435167,7037,9259
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,28,0.0019,2882153,14405,15558,1682827,8251,11219
+200,32,28,0.0016,2882978,14411,15189,1633061,8054,9789
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,32,0.0017,3242153,16205,17358,1871170,9210,12109
+200,32,32,0.0017,3242978,16211,16989,1842895,9092,10889
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,36,0.0018,3602153,18005,19158,2075730,10193,13063
+200,32,36,0.0018,3602978,18011,18789,2042894,10108,12457
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,40,0.0019,3962153,19805,20958,2272736,11258,14491
+200,32,40,0.0019,3962978,19811,20589,2261332,11191,14233
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,44,0.0019,4322153,21605,22758,2491982,12249,17554
+200,32,44,0.0020,4322978,21611,22389,2458267,12112,14375
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,48,0.0020,4682153,23405,24558,2692600,13292,16003
+200,32,48,0.0020,4682978,23411,24189,2658621,13164,15613
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,52,0.0020,5042153,25205,26358,2878730,14277,17055
+200,32,52,0.0020,5042978,25211,25989,2866175,14190,16864
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,56,0.0021,5402153,27005,28158,3084915,15295,18583
+200,32,56,0.0021,5402978,27011,27789,3080357,15237,21565
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,60,0.0022,5762153,28805,29958,3291836,16330,19233
+200,32,60,0.0022,5762978,28811,29589,3283103,16278,18799
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,64,0.0023,6122153,30605,31758,3622134,17946,20887
+200,32,64,0.0022,6122978,30611,31389,3587582,17820,19681
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,68,0.0024,6482153,32405,33558,3930512,19200,22297
+200,32,68,0.0025,6482978,32411,33189,3893368,19284,20847
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,72,0.0027,6842153,34205,35358,4270649,20402,22797
+200,32,72,0.0025,6842978,34211,34989,4289441,21278,22715
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,76,0.0025,7202153,36005,37158,4209408,20894,24035
+200,32,76,0.0024,7202978,36011,36789,4208700,20936,22677
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,80,0.0025,7562153,37805,38958,4410712,21911,24986
+200,32,80,0.0025,7562978,37811,38589,4409613,21897,23855
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,84,0.0026,7922153,39605,40758,4631259,23020,25649
+200,32,84,0.0026,7922978,39611,40389,4611755,22921,24910
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,88,0.0027,8282153,41405,42558,4814218,23914,26743
+200,32,88,0.0026,8282978,41411,42189,4821904,23974,26087
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,92,0.0027,8642153,43205,44358,5039020,24944,37612
+200,32,92,0.0028,8642978,43211,43989,5104722,25036,38488
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,96,0.0030,9002153,45005,46158,5247046,26072,29012
+200,32,96,0.0028,9002978,45011,45789,5238952,26060,27927
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,100,0.0029,9362153,46805,47958,5426721,26963,29831
+200,32,100,0.0028,9362978,46811,47589,5441545,27049,29275
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,104,0.0029,9722153,48605,49758,5619647,27963,31679
+200,32,104,0.0030,9722978,48611,49389,5920763,28136,72679
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,108,0.0030,10082153,50405,51558,5828776,28956,31626
+200,32,108,0.0030,10082978,50411,51189,5853554,29106,31403
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,112,0.0031,10442153,52205,53358,6033005,30029,32674
+200,32,112,0.0030,10442978,52211,52989,6053498,30123,32279
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,116,0.0031,10802153,54005,55158,6244763,30994,35257
+200,32,116,0.0031,10802978,54011,54789,6296056,31338,33377
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,120,0.0032,11162153,55805,56958,6425499,31972,34572
+200,32,120,0.0033,11162978,55811,56589,6468115,32146,33869
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,124,0.0033,11522153,57605,58758,6654149,33094,35931
+200,32,124,0.0032,11522978,57611,58389,6675248,33233,35075
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,128,0.0033,11882153,59405,60558,6851733,34090,36755
+200,32,128,0.0033,11882978,59411,60189,6894325,34338,36207
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,132,0.0034,12242153,61205,62358,7052529,35058,39834
+200,32,132,0.0034,12242978,61211,61989,7093543,35299,37463
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,136,0.0035,12602153,63005,64158,7241645,36039,38957
+200,32,136,0.0034,12602978,63011,63789,7312105,36353,48105
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,140,0.0035,12962153,64805,65958,7438548,37024,39702
+200,32,140,0.0035,12962978,64811,65589,7503757,37375,39247
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,144,0.0036,13322153,66605,67758,7649807,38039,46041
+200,32,144,0.0036,13322978,66611,67389,7692611,38277,40419
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,148,0.0037,13682153,68405,69558,7837686,39006,41671
+200,32,148,0.0037,13682978,68411,69189,7968094,39656,42113
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,152,0.0037,14042153,70205,71358,8039582,40031,42707
+200,32,152,0.0037,14042978,70211,70989,8122466,40468,42706
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,156,0.0038,14402153,72005,73158,8272212,41195,43645
+200,32,156,0.0038,14402978,72011,72789,8328043,41484,45104
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,160,0.0040,14762153,73805,74958,8471858,42200,44594
+200,32,160,0.0040,14762978,73811,74589,8547674,42493,54216
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,164,0.0039,15122153,75605,76758,8657085,43103,45699
+200,32,164,0.0039,15122978,75611,76389,8738805,43542,45427
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,168,0.0039,15482153,77405,78558,8856462,44110,46863
+200,32,168,0.0040,15482978,77411,78189,8948025,44560,46819
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,172,0.0040,15842153,79205,80358,9050337,45084,47600
+200,32,172,0.0040,15842978,79211,79989,9186567,45735,47659
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,176,0.0041,16202153,81005,82158,9267755,46142,55546
+200,32,176,0.0041,16202978,81011,81789,9391949,46573,70131
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,180,0.0042,16562153,82805,83958,9452041,47058,49763
+200,32,180,0.0042,16562978,82811,83589,9549568,47559,54271
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,184,0.0042,16922153,84605,85758,9655929,48043,50875
+200,32,184,0.0042,16922978,84611,85389,9766306,48609,58645
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,188,0.0043,17282153,86405,87558,9906002,49331,52491
+200,32,188,0.0043,17282978,86411,87189,9974165,49613,56721
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,192,0.0043,17642153,88205,89358,10089481,50268,52937
+200,32,192,0.0044,17642978,88211,88989,10187263,50734,52953
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,196,0.0044,18002153,90005,91158,10292606,51256,54507
+200,32,196,0.0044,18002978,90011,90789,10386920,51763,53773
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,200,0.0045,18362153,91805,92958,10466174,52144,54851
+200,32,200,0.0045,18362978,91811,92589,10593326,52744,54962
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,204,0.0045,18722153,93605,94758,10710242,53145,77999
+200,32,204,0.0045,18722978,93611,94389,10791966,53796,55775
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,208,0.0046,19082153,95405,96558,10872705,54177,57081
+200,32,208,0.0046,19082978,95411,96189,10993938,54691,56692
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,212,0.0047,19442153,97205,98358,11284063,56244,58937
+200,32,212,0.0047,19442978,97211,97989,11183564,55716,57663
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,216,0.0047,19802153,99005,100158,11267668,56162,58869
+200,32,216,0.0047,19802978,99011,99789,11413409,56842,65317
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,220,0.0048,20162153,100805,101958,11510801,57350,60362
+200,32,220,0.0049,20162978,100811,101589,11747337,57952,85917
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,224,0.0051,20522153,102605,103758,11730908,58406,61013
+200,32,224,0.0049,20522978,102611,103389,11967444,58993,147575
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,228,0.0050,20882153,104405,105558,11891323,59260,62051
+200,32,228,0.0050,20882978,104411,105189,12176974,59986,107137
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,232,0.0050,21242153,106205,107358,12083458,60220,63113
+200,32,232,0.0051,21242978,106211,106989,12243039,61011,62843
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,236,0.0050,21602153,108005,109158,12290078,61234,68599
+200,32,236,0.0051,21602978,108011,108789,12454738,61985,74677
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,240,0.0051,21962153,109805,110958,12547828,62267,88616
+200,32,240,0.0051,21962978,109811,110589,12632612,62912,64911
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,244,0.0052,22322153,111605,112758,12674066,63146,66333
+200,32,244,0.0052,22322978,111611,112389,12844679,63954,74316
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,248,0.0052,22682153,113405,114558,12882346,64155,67081
+200,32,248,0.0053,22682978,113411,114189,13049050,65048,67067
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,252,0.0053,23042153,115205,116358,13140221,65490,68231
+200,32,252,0.0054,23042978,115211,115989,13274577,66113,68093
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,256,0.0054,23402153,117005,118158,13331460,66431,69187
+200,32,256,0.0054,23402978,117011,117789,13479975,67191,69232
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,260,0.0054,23762153,118805,119958,13531478,67456,70141
+200,32,260,0.0055,23762978,118811,119589,13702476,68321,70257
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,264,0.0055,24122153,120605,121758,13710546,68246,81094
+200,32,264,0.0055,24122978,120611,121389,13885554,69178,71473
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,268,0.0055,24482153,122405,123558,13890638,69208,72412
+200,32,268,0.0056,24482978,122411,123189,14091173,70236,72538
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,272,0.0056,24842153,124205,125358,14130816,70366,88752
+200,32,272,0.0057,24842978,124211,124989,14277355,71142,73153
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,276,0.0057,25202153,126005,127158,14355067,71208,93990
+200,32,276,0.0057,25202978,126011,126789,14477479,72149,74585
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,280,0.0057,25562153,127805,128958,14513593,72251,85857
+200,32,280,0.0058,25562978,127811,128589,14807542,73365,106386
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,284,0.0059,25922153,129605,130758,14800806,73802,76775
+200,32,284,0.0059,25922978,129611,130389,14919273,74349,83988
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,288,0.0059,26282153,131405,132558,14959572,74579,77267
+200,32,288,0.0060,26282978,131411,132189,15262342,75369,108903
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,292,0.0059,26642153,133205,134358,15130033,75389,78361
+200,32,292,0.0061,26642978,133211,133989,15457489,76550,112579
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,296,0.0060,27002153,135005,136158,15314583,76370,79151
+200,32,296,0.0061,27002978,135011,135789,15587890,77470,113796
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,300,0.0061,27362153,136805,137958,15515700,77373,80055
+200,32,300,0.0063,27362978,136811,137589,15736737,78474,80976
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,304,0.0061,27722153,138605,139758,15739536,78395,81351
+200,32,304,0.0062,27722978,138611,139389,15931699,79424,85309
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,308,0.0062,28082153,140405,141558,15910915,79341,82085
+200,32,308,0.0064,28082978,140411,141189,16127895,80426,82181
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,312,0.0063,28442153,142205,143358,16119259,80297,83271
+200,32,312,0.0063,28442978,142211,142989,16353667,81487,91316
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,316,0.0063,28802153,144005,145158,16376727,81668,84481
+200,32,316,0.0064,28802978,144011,144789,16544730,82526,84583
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,320,0.0064,29162153,145805,146958,16575917,82685,85800
+200,32,320,0.0064,29162978,145811,146589,16778054,83692,85621
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,324,0.0065,29522153,147605,148758,16752101,83529,86861
+200,32,324,0.0065,29522978,147611,148389,16975790,84670,86933
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,328,0.0065,29882153,149405,150558,16931954,84456,87199
+200,32,328,0.0066,29882978,149411,150189,17193806,85651,95908
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,332,0.0066,30242153,151205,152358,17129562,85462,88022
+200,32,332,0.0067,30242978,151211,151989,17391042,86658,92746
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,336,0.0067,30602153,153005,154158,17522378,87337,90235
+200,32,336,0.0067,30602978,153011,153789,17579650,87566,101073
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,340,0.0067,30962153,154805,155958,17525540,87379,89947
+200,32,340,0.0068,30962978,154811,155589,17823659,88601,131503
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,344,0.0068,31322153,156605,157758,17811817,88413,169057
+200,32,344,0.0069,31322978,156611,157389,18045749,89720,131352
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,348,0.0069,31682153,158405,159558,17999372,89772,92601
+200,32,348,0.0069,31682978,158411,159189,18233228,90790,129666
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,352,0.0069,32042153,160205,161358,18204371,90776,101494
+200,32,352,0.0070,32042978,160211,160989,18429938,91908,93827
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,356,0.0070,32402153,162005,163158,18393456,91621,107055
+200,32,356,0.0071,32402978,162011,162789,18723870,92891,169000
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,360,0.0070,32762153,163805,164958,18567077,92476,114024
+200,32,360,0.0071,32762978,163811,164589,18839189,93872,104313
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,364,0.0072,33122153,165605,166758,18749614,93562,96291
+200,32,364,0.0072,33122978,165611,166389,19052230,94828,108456
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,368,0.0073,33482153,167405,168558,18957503,94465,97467
+200,32,368,0.0072,33482978,167411,168189,19224348,95828,106832
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,372,0.0072,33842153,169205,170358,19137907,95471,98421
+200,32,372,0.0073,33842978,169211,169989,19409746,96825,98825
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,376,0.0073,34202153,171005,172158,19350029,96457,99505
+200,32,376,0.0074,34202978,171011,171789,19635914,97934,100015
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,380,0.0075,34562153,172805,173958,19657158,97897,122483
+200,32,380,0.0075,34562978,172811,173589,19901265,99194,108856
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,384,0.0075,34922153,174605,175758,20019224,98872,199167
+200,32,384,0.0075,34922978,174611,175389,20087150,100132,113306
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,388,0.0075,35282153,176405,177558,19999785,99747,102911
+200,32,388,0.0076,35282978,176411,177189,20289560,101187,111225
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,392,0.0077,35642153,178205,179358,20188679,100586,121054
+200,32,392,0.0076,35642978,178211,178989,20478069,102158,104431
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,396,0.0076,36002153,180005,181158,20368637,101583,105060
+200,32,396,0.0077,36002978,180011,180789,20703541,103136,118462
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,400,0.0077,36362153,181805,182958,20628698,102607,152896
+200,32,400,0.0078,36362978,181811,182589,20889687,104097,116051
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,404,0.0078,36722153,183605,184758,20759711,103503,111551
+200,32,404,0.0078,36722978,183611,184389,21103371,105019,150497
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,408,0.0078,37082153,185405,186558,21008339,104552,136230
+200,32,408,0.0079,37082978,185411,186189,21343392,106235,146574
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,412,0.0080,37442153,187205,188358,21248565,105961,109252
+200,32,412,0.0080,37442978,187211,187989,21499750,107213,116228
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,416,0.0080,37802153,189005,190158,21446394,106998,110446
+200,32,416,0.0081,37802978,189011,189789,21769516,108354,153304
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,420,0.0081,38162153,190805,191958,21618503,107795,119989
+200,32,420,0.0082,38162978,190811,191589,22016040,109333,166344
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,424,0.0081,38522153,192605,193758,21778142,108604,112064
+200,32,424,0.0082,38522978,192611,193389,22124948,110298,112586
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,428,0.0081,38882153,194405,195558,21989784,109653,120306
+200,32,428,0.0083,38882978,194411,195189,22375892,111391,164691
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,432,0.0082,39242153,196205,197358,22191881,110730,113916
+200,32,432,0.0083,39242978,196211,196989,22605417,112244,161120
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,436,0.0083,39602153,198005,199158,22373426,111587,115657
+200,32,436,0.0084,39602978,198011,198789,22698406,113231,115888
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,440,0.0084,39962153,199805,200958,22596402,112638,130342
+200,32,440,0.0084,39962978,199811,200589,22946025,114347,124840
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,444,0.0084,40322153,201605,202758,22868323,114041,124888
+200,32,444,0.0085,40322978,201611,202389,23138571,115404,122324
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,448,0.0085,40682153,203405,204558,23084361,115132,128588
+200,32,448,0.0086,40682978,203411,204189,23382319,116666,118990
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,452,0.0086,41042153,205205,206358,23255449,115787,156348
+200,32,452,0.0086,41042978,205211,205989,23582320,117634,123005
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,456,0.0088,41402153,207005,208158,23400730,116742,119985
+200,32,456,0.0087,41402978,207011,207789,23777586,118606,121054
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,460,0.0087,41762153,208805,209958,23616057,117782,125672
+200,32,460,0.0088,41762978,208811,209589,24021078,119638,157473
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,464,0.0088,42122153,210605,211758,23845815,118769,150383
+200,32,464,0.0089,42122978,210611,211389,24177273,120536,137152
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,468,0.0089,42482153,212405,213558,23982677,119580,123029
+200,32,468,0.0089,42482978,212411,213189,24354431,121510,124378
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,472,0.0090,42842153,214205,215358,24183894,120688,124270
+200,32,472,0.0090,42842978,214211,214989,24680874,122798,163001
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,476,0.0090,43202153,216005,217158,24479273,122149,125974
+200,32,476,0.0092,43202978,216011,216789,24806941,123695,126112
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,480,0.0091,43562153,217805,218958,24768939,123125,164217
+200,32,480,0.0091,43562978,217811,218589,25036974,124855,131240
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,484,0.0092,43922153,219605,220758,24828983,123895,127390
+200,32,484,0.0092,43922978,219611,220389,25277560,125834,159926
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,488,0.0091,44282153,221405,222558,25011559,124768,128788
+200,32,488,0.0093,44282978,221411,222189,25492002,126931,169890
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,492,0.0092,44642153,223205,224358,25219550,125760,132732
+200,32,492,0.0094,44642978,223211,223989,25799993,127811,292316
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,496,0.0093,45002153,225005,226158,25447017,126853,140428
+200,32,496,0.0094,45002978,225011,225789,25879076,128748,186367
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,500,0.0093,45362153,226805,227958,25586059,127650,131094
+200,32,500,0.0094,45362978,226811,227589,26021482,129705,143377
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,504,0.0094,45722153,228605,229758,25796559,128739,131932
+200,32,504,0.0095,45722978,228611,229389,26309697,130875,185497
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,508,0.0095,46082153,230405,231558,26122261,130275,141242
+200,32,508,0.0096,46082978,230411,231189,26445482,131853,134810
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,512,0.0095,46442153,232205,233358,26303806,130890,135216
+200,32,512,0.0097,46442978,232211,232989,26722882,133313,135480
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,516,0.0096,46802153,234005,235158,26441241,131860,137807
+200,32,516,0.0097,46802978,234011,234789,26902984,134116,143429
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,520,0.0097,47162153,235805,236958,26620814,132726,144193
+200,32,520,0.0098,47162978,235811,236589,27143327,135173,182663
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,524,0.0097,47522153,237605,238758,26895547,133979,180810
+200,32,524,0.0101,47522978,237611,238389,27899728,139067,143412
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,528,0.0098,47882153,239405,240558,27103175,134594,195038
+200,32,528,0.0099,47882978,239411,240189,27539695,137281,153792
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,532,0.0099,48242153,241205,242358,27216804,135653,148537
+200,32,532,0.0100,48242978,241211,241989,27665652,137957,156345
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,536,0.0100,48602153,243005,244158,27609711,137157,225927
+200,32,536,0.0102,48602978,243011,243789,27888664,139123,142069
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,540,0.0101,48962153,244805,245958,27856165,138525,222412
+200,32,540,0.0102,48962978,244811,245589,28116288,140162,167093
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,544,0.0101,49322153,246605,247758,27949313,139206,146089
+200,32,544,0.0102,49322978,246611,247389,28395864,141365,191687
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,548,0.0102,49682153,248405,249558,28071639,140106,144061
+200,32,548,0.0105,49682978,248411,249189,28539300,142352,144923
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,552,0.0102,50042153,250205,251358,28221254,140771,147826
+200,32,552,0.0104,50042978,250211,250989,28772000,143499,153080
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,556,0.0103,50402153,252005,253158,28466442,141994,145849
+200,32,556,0.0104,50402978,252011,252789,28943938,144344,160802
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,560,0.0105,50762153,253805,254958,28785863,142904,194917
+200,32,560,0.0105,50762978,253811,254589,29192011,145318,205574
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,564,0.0105,51122153,255605,256758,28851831,143902,156411
+200,32,564,0.0106,51122978,255611,256389,29371768,146296,173660
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,568,0.0106,51482153,257405,258558,29223120,145608,162476
+200,32,568,0.0107,51482978,257411,258189,29607085,147402,185216
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,572,0.0108,51842153,259205,260358,29438332,146788,151895
+200,32,572,0.0109,51842978,259211,259989,29760468,148529,150992
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,576,0.0108,52202153,261005,262158,29557331,147210,151262
+200,32,576,0.0108,52202978,261011,261789,30001693,149671,152448
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,580,0.0108,52562153,262805,263958,29704990,148198,158557
+200,32,580,0.0109,52562978,262811,263589,30194219,150474,161954
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,584,0.0108,52922153,264605,265758,29996452,149016,250006
+200,32,584,0.0110,52922978,264611,265389,30465237,151575,196784
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,588,0.0109,53282153,266405,267558,30123135,150270,154069
+200,32,588,0.0112,53282978,266411,267189,30866027,152658,345805
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,592,0.0110,53642153,268205,269358,30283611,150978,165439
+200,32,592,0.0112,53642978,268211,268989,30806266,153631,162459
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,596,0.0110,54002153,270005,271158,30512807,152128,156216
+200,32,596,0.0112,54002978,270011,270789,31013348,154624,161083
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,600,0.0111,54362153,271805,272958,30713954,153227,157015
+200,32,600,0.0113,54362978,271811,272589,31227644,155782,158034
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,604,0.0113,54722153,273605,274758,31116246,155098,162946
+200,32,604,0.0115,54722978,273611,274389,31534633,156837,219588
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,608,0.0113,55082153,275405,276558,31292429,155792,166047
+200,32,608,0.0114,55082978,275411,276189,31675474,157869,168332
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,612,0.0113,55442153,277205,278358,31367681,156312,187819
+200,32,612,0.0115,55442978,277211,277989,31953436,158989,218652
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,616,0.0114,55802153,279005,280158,31509163,156923,173955
+200,32,616,0.0116,55802978,279011,279789,32108644,160138,180416
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,620,0.0115,56162153,280805,281958,31751550,158349,162413
+200,32,620,0.0116,56162978,280811,281589,32277424,160849,182393
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,624,0.0116,56522153,282605,283758,32010052,159426,164990
+200,32,624,0.0118,56522978,282611,283389,32423394,161797,164245
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,628,0.0116,56882153,284405,285558,32270071,160471,206182
+200,32,628,0.0117,56882978,284411,285189,32609412,162678,167394
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,632,0.0118,57242153,286205,287358,32379821,161317,166154
+200,32,632,0.0118,57242978,286211,286989,32869379,163975,168634
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,636,0.0118,57602153,288005,289158,32621237,162719,174455
+200,32,636,0.0119,57602978,288011,288789,33151217,165037,223167
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,640,0.0118,57962153,289805,290958,32760054,163283,174727
+200,32,640,0.0119,57962978,289811,290589,33341299,166215,181218
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,644,0.0119,58322153,291605,292758,32895462,163973,168568
+200,32,644,0.0121,58322978,291611,292389,33649260,167751,199967
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,648,0.0119,58682153,293405,294558,33046462,164805,176098
+200,32,648,0.0121,58682978,293411,294189,33719599,168221,178799
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,652,0.0120,59042153,295205,296358,33305627,166069,179927
+200,32,652,0.0122,59042978,295211,295989,34067206,169536,235514
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,656,0.0121,59402153,297005,298158,33611780,166989,248127
+200,32,656,0.0122,59402978,297011,297789,34164102,170144,235618
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,660,0.0121,59762153,298805,299958,33791922,168433,184984
+200,32,660,0.0123,59762978,298811,299589,34456636,171594,235316
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,664,0.0121,60122153,300605,301758,33927065,169140,182483
+200,32,664,0.0124,60122978,300611,301389,34541178,172177,211827
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,668,0.0124,60482153,302405,303558,34476798,171567,188679
+200,32,668,0.0124,60482978,302411,303189,34905159,173832,222673
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,672,0.0123,60842153,304205,305358,34350802,171240,175365
+200,32,672,0.0126,60842978,304211,304989,34988298,174422,188003
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,676,0.0123,61202153,306005,307158,34529315,172118,202239
+200,32,676,0.0126,61202978,306011,306789,35263092,175911,185984
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,680,0.0124,61562153,307805,308958,34716545,172878,244909
+200,32,680,0.0127,61562978,307811,308589,35503073,176323,305860
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,684,0.0126,61922153,309605,310758,35111667,174820,186347
+200,32,684,0.0128,61922978,309611,310389,35672483,178036,180851
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,688,0.0126,62282153,311405,312558,35200811,175517,179013
+200,32,688,0.0128,62282978,311411,312189,35790039,178289,217803
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,692,0.0126,62642153,313205,314358,35391859,176015,252609
+200,32,692,0.0128,62642978,313211,313989,36045752,179866,188983
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,696,0.0127,63002153,315005,316158,35696188,177815,200506
+200,32,696,0.0130,63002978,315011,315789,36175144,180438,195986
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,700,0.0128,63362153,316805,317958,35825556,178736,191521
+200,32,700,0.0131,63362978,316811,317589,36529049,182248,184897
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,704,0.0129,63722153,318605,319758,36008866,179237,218743
+200,32,704,0.0130,63722978,318611,319389,36611747,182765,185703
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,708,0.0129,64082153,320405,321558,36282257,180511,214158
+200,32,708,0.0130,64082978,320411,321189,36811496,183626,191140
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,712,0.0129,64442153,322205,323358,36251857,180793,191833
+200,32,712,0.0131,64442978,322211,322989,37060383,184588,255521
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,716,0.0131,64802153,324005,325158,36828270,182903,229477
+200,32,716,0.0132,64802978,324011,324789,37267356,185684,240236
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,720,0.0130,65162153,325805,326958,36775140,183107,213910
+200,32,720,0.0132,65162978,325811,326589,37393434,186562,204926
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,724,0.0131,65522153,327605,328758,36946255,184028,240244
+200,32,724,0.0133,65522978,327611,328389,37611724,187635,203956
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,728,0.0132,65882153,329405,330558,37189420,185485,206103
+200,32,728,0.0135,65882978,329411,330189,37844476,188685,217329
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,732,0.0133,66242153,331205,332358,37526856,187108,192940
+200,32,732,0.0136,66242978,331211,331989,38097715,189879,238003
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,736,0.0134,66602153,333005,334158,37747623,188004,201070
+200,32,736,0.0136,66602978,333011,333789,38249665,190960,193797
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,740,0.0134,66962153,334805,335958,37844347,188709,198675
+200,32,740,0.0137,66962978,334811,335589,38496135,191882,202980
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,744,0.0134,67322153,336605,337758,37874634,189009,203611
+200,32,744,0.0136,67322978,336611,337389,38643004,192776,211409
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,748,0.0136,67682153,338405,339558,38360815,190893,193995
+200,32,748,0.0138,67682978,338411,339189,38834497,193752,204307
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,752,0.0137,68042153,340205,341358,38702052,192377,222451
+200,32,752,0.0139,68042978,340211,340989,39026422,194674,207102
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,756,0.0136,68402153,342005,343158,38548177,192033,249435
+200,32,756,0.0139,68402978,342011,342789,39292510,195755,242534
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,760,0.0138,68762153,343805,344958,39152996,194437,272148
+200,32,760,0.0140,68762978,343811,344589,39445808,196904,199749
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,764,0.0138,69122153,345605,346758,39070056,194876,204988
+200,32,764,0.0140,69122978,345611,346389,39707448,198140,208159
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,768,0.0138,69482153,347405,348558,39192485,195337,208507
+200,32,768,0.0141,69482978,347411,348189,39961335,199314,213386
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,772,0.0139,69842153,349205,350358,39509976,197063,216644
+200,32,772,0.0142,69842978,349211,349989,40195551,200268,262442
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,776,0.0140,70202153,351005,352158,39643299,197720,238164
+200,32,776,0.0143,70202978,351011,351789,40369481,201262,243178
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,780,0.0141,70562153,352805,353958,40047395,199611,212284
+200,32,780,0.0143,70562978,352811,353589,40454251,201889,204769
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,784,0.0142,70922153,354605,355758,40474213,201350,218018
+200,32,784,0.0143,70922978,354611,355389,40804167,203132,292206
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,788,0.0143,71282153,356405,357558,40369690,200941,270257
+200,32,788,0.0144,71282978,356411,357189,40880258,203888,220805
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,792,0.0143,71642153,358205,359358,40667289,202430,244792
+200,32,792,0.0145,71642978,358211,358989,41141375,205195,222680
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,796,0.0145,72002153,360005,361158,41245212,205315,244622
+200,32,796,0.0145,72002978,360011,360789,41346667,205890,276619
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,800,0.0144,72362153,361805,362958,41042713,204407,249254
+200,32,800,0.0146,72362978,361811,362589,41586665,207290,248916
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,804,0.0145,72722153,363605,364758,41137099,205254,211445
+200,32,804,0.0147,72722978,363611,364389,41696398,208106,211465
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,808,0.0145,73082153,365405,366558,41267168,205869,210553
+200,32,808,0.0148,73082978,365411,366189,41978951,209272,255137
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,812,0.0146,73442153,367205,368358,41538016,207083,242270
+200,32,812,0.0148,73442978,367211,367989,42187366,209918,283393
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,816,0.0147,73802153,369005,370158,41856937,208198,257079
+200,32,816,0.0149,73802978,369011,369789,42482639,211214,322437
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,820,0.0149,74162153,370805,371958,42581251,211598,220361
+200,32,820,0.0149,74162978,370811,371589,42512865,212010,227823
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,824,0.0148,74522153,372605,373758,42106929,210144,214780
+200,32,824,0.0151,74522978,372611,373389,42861251,213412,278868
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,828,0.0151,74882153,374405,375558,42954101,213100,216189
+200,32,828,0.0151,74882978,374411,375189,42979335,214191,262439
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,832,0.0150,75242153,376205,377358,42591682,212393,217281
+200,32,832,0.0152,75242978,376211,376989,43402619,215543,296991
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,836,0.0150,75602153,378005,379158,42833889,213607,225147
+200,32,836,0.0152,75602978,378011,378789,43382253,216450,232179
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,840,0.0151,75962153,379805,380958,42888365,213833,258282
+200,32,840,0.0154,75962978,379811,380589,43665001,217538,261020
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,844,0.0151,76322153,381605,382758,43234463,215605,228741
+200,32,844,0.0154,76322978,381611,382389,43762162,218196,232967
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,848,0.0152,76682153,383405,384558,43340508,216058,240778
+200,32,848,0.0156,76682978,383411,384189,44077885,219619,233562
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,852,0.0154,77042153,385205,386358,43964132,218702,263707
+200,32,852,0.0155,77042978,385211,385989,44269902,220266,357562
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,856,0.0155,77402153,387005,388158,43738562,218168,230126
+200,32,856,0.0156,77402978,387011,387789,44458368,221658,275183
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,860,0.0154,77762153,388805,389958,44071523,219837,238185
+200,32,860,0.0156,77762978,388811,389589,44599845,222530,244104
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,864,0.0155,78122153,390605,391758,44411093,221177,232408
+200,32,864,0.0158,78122978,390611,391389,44856987,223898,229495
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,868,0.0157,78482153,392405,393558,44526424,222013,237960
+200,32,868,0.0157,78482978,392411,393189,45070339,224667,268426
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,872,0.0158,78842153,394205,395358,45188815,224084,346189
+200,32,872,0.0158,78842978,394211,394989,45243346,225686,238504
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,876,0.0156,79202153,396005,397158,44700630,222996,237268
+200,32,876,0.0160,79202978,396011,396789,45425044,226467,285843
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,880,0.0158,79562153,397805,398958,45208957,224813,328325
+200,32,880,0.0160,79562978,397811,398589,45637897,227585,255503
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,884,0.0159,79922153,399605,400758,45474656,226439,239215
+200,32,884,0.0163,79922978,399611,400389,45922301,228540,294854
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,888,0.0160,80282153,401405,402558,45766475,227867,240911
+200,32,888,0.0161,80282978,401411,402189,46210377,229936,317062
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,892,0.0160,80642153,403205,404358,45940503,228819,243891
+200,32,892,0.0161,80642978,403211,403989,46224897,230736,244030
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,896,0.0161,81002153,405005,406158,45973712,229111,241548
+200,32,896,0.0163,81002978,405011,405789,46706945,232252,393574
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,900,0.0162,81362153,406805,407958,46447521,230613,346027
+200,32,900,0.0163,81362978,406811,407589,46846573,233803,243774
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,904,0.0163,81722153,408605,409758,46859527,233117,305572
+200,32,904,0.0165,81722978,408611,409389,47211102,235424,247115
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,908,0.0164,82082153,410405,411558,47123610,234871,284329
+200,32,908,0.0165,82082978,410411,411189,47420647,236067,308146
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,912,0.0166,82442153,412205,413358,47816182,237201,366650
+200,32,912,0.0167,82442978,412211,412989,47664515,237299,252663
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,916,0.0166,82802153,414005,415158,47456504,236767,248921
+200,32,916,0.0166,82802978,414011,414789,47825500,238210,307878
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,920,0.0165,83162153,415805,416958,47592162,237459,265738
+200,32,920,0.0168,83162978,415811,416589,48024315,239591,249230
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,924,0.0167,83522153,417605,418758,48057683,239541,276783
+200,32,924,0.0168,83522978,417611,418389,48204506,240348,286103
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,928,0.0167,83882153,419405,420558,48171706,239841,277682
+200,32,928,0.0168,83882978,419411,420189,48474452,241766,272232
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,932,0.0170,84242153,421205,422358,48721591,242883,245719
+200,32,932,0.0169,84242978,421211,421989,48643328,242408,310910
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,936,0.0169,84602153,423005,424158,48377712,241387,254877
+200,32,936,0.0170,84602978,423011,423789,49041567,243670,350571
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,940,0.0169,84962153,424805,425958,48721762,242855,255300
+200,32,940,0.0171,84962978,424811,425589,49009612,244295,313509
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,944,0.0170,85322153,426605,427758,49035991,243372,370914
+200,32,944,0.0171,85322978,426611,427389,49257311,245620,259650
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,948,0.0171,85682153,428405,429558,49070436,244800,262067
+200,32,948,0.0172,85682978,428411,429189,49415667,246533,254714
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,952,0.0171,86042153,430205,431358,49234273,245636,258683
+200,32,952,0.0172,86042978,430211,430989,49711139,247671,319628
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,956,0.0172,86402153,432005,433158,49586922,247001,316148
+200,32,956,0.0174,86402978,432011,432789,49856592,248552,271876
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,960,0.0172,86762153,433805,434958,49640943,247637,284307
+200,32,960,0.0174,86762978,433811,434589,50136102,249978,265617
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,964,0.0177,87122153,435605,436758,51436885,256453,266477
+200,32,964,0.0176,87122978,435611,436389,50925446,253713,295499
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,968,0.0178,87482153,437405,438558,51146832,254991,267861
+200,32,968,0.0178,87482978,437411,438189,51035835,253858,318894
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,972,0.0177,87842153,439205,440358,51377929,256333,274159
+200,32,972,0.0177,87842978,439211,439989,51188317,255334,306288
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,976,0.0179,88202153,441005,442158,51360933,256336,265049
+200,32,976,0.0178,88202978,441011,441789,51436023,256205,289239
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,980,0.0179,88562153,442805,443958,51845435,258521,293602
+200,32,980,0.0179,88562978,442811,443589,51703656,257814,300077
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,984,0.0180,88922153,444605,445758,52129373,259818,262711
+200,32,984,0.0179,88922978,444611,445389,51801305,257947,349721
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,988,0.0181,89282153,446405,447558,52262963,260903,278224
+200,32,988,0.0181,89282978,446411,447189,52056854,259676,262216
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,992,0.0182,89642153,448205,449358,52407317,261432,272849
+200,32,992,0.0182,89642978,448211,448989,52237864,260535,269494
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,996,0.0184,90002153,450005,451158,53286503,265403,275404
+200,32,996,0.0183,90002978,450011,450789,52526126,262024,274178
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,1000,0.0182,90362153,451805,452958,53051777,264487,273734
+200,32,1000,0.0182,90362978,451811,452589,52578843,262284,265526
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,1004,0.0183,90722153,453605,454758,53153647,264834,340140
+200,32,1004,0.0183,90722978,453611,454389,52896370,263840,273834
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,1008,0.0183,91082153,455405,456558,53025643,264711,274578
+200,32,1008,0.0183,91082978,455411,456189,53074476,264385,308471
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,1012,0.0185,91442153,457205,458358,53709439,267192,353247
+200,32,1012,0.0184,91442978,457211,457989,53382079,266422,284446
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,1016,0.0186,91802153,459005,460158,54036527,268786,339099
+200,32,1016,0.0186,91802978,459011,459789,53434221,266486,275700
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,1020,0.0186,92162153,460805,461958,54154888,269844,327020
+200,32,1020,0.0186,92162978,460811,461589,53712164,268036,277528
 iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)
-200,32,1024,0.0183,92522153,462605,463758,52875104,262839,332332
-mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv .
+200,32,1024,0.0187,92522978,462611,463389,53754294,268076,276795
+mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.ins_cyc.bin.csv .
 </pre>
 </div>
 </div>
@@ -13823,7 +13855,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv .
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<p>Once the run is completed, let's have a look at the data!</p>
+<p>Once the run is completed, let's study the data!</p>
 <p>This can be done best in the interactive version of the Jupyter Notebook. In case this version of the description is unavailable to you, call the Makefile target <code>make graph_task1</code> (either with X forwarding, or download the resulting PDF).</p>
 
 </div>
@@ -13831,10 +13863,11 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv .
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[3]:</div>
+<div class="prompt input_prompt">In&nbsp;[1]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="kn">import</span> <span class="nn">seaborn</span> <span class="k">as</span> <span class="nn">sns</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
+<span class="kn">import</span> <span class="nn">seaborn</span> <span class="k">as</span> <span class="nn">sns</span>
 <span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
 <span class="kn">import</span> <span class="nn">matplotlib.pyplot</span> <span class="k">as</span> <span class="nn">plt</span>
 <span class="kn">import</span> <span class="nn">common</span>
@@ -13847,16 +13880,36 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv .
 </div>
 </div>
 
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>Execute the following cell if you want to switch to color-blind-safer colors</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">sns</span><span class="o">.</span><span class="n">set_palette</span><span class="p">(</span><span class="s2">&quot;colorblind&quot;</span><span class="p">)</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[77]:</div>
+<div class="prompt input_prompt">In&nbsp;[2]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">plt</span><span class="o">.</span><span class="n">rcParams</span><span class="p">[</span><span class="s1">&#39;figure.figsize&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="mi">14</span><span class="p">,</span> <span class="mi">6</span><span class="p">]</span>
 <span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&quot;poisson2d.ins_cyc.bin.csv&quot;</span><span class="p">,</span> <span class="n">skiprows</span><span class="o">=</span><span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">50000</span><span class="p">,</span> <span class="mi">2</span><span class="p">))</span>  <span class="c1"># Read in the CSV file from the bench run; parse with Pandas</span>
-<span class="n">common</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="s2">&quot;PM_INST_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Instructions / Loop Iteration&quot;</span><span class="p">)</span>  <span class="c1"># Normalize to each grid cell</span>
-<span class="n">common</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="s2">&quot;PM_RUN_CYC (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Cycles / Loop Iteration&quot;</span><span class="p">)</span>
+<span class="n">df</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="s2">&quot;nx&quot;</span><span class="p">]</span> <span class="o">*</span> <span class="n">df</span><span class="p">[</span><span class="s2">&quot;ny&quot;</span><span class="p">]</span>  <span class="c1"># Add a new column of the number of grid points (the product of nx and ny)</span>
 <span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>  <span class="c1"># Display the head of the Pandas dataframe</span>
 </pre></div>
 
@@ -13870,7 +13923,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv .
 
 <div class="output_area">
 
-    <div class="prompt output_prompt">Out[77]:</div>
+    <div class="prompt output_prompt">Out[2]:</div>
 
 
 
@@ -13903,8 +13956,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv .
       <th>PM_RUN_CYC (total)</th>
       <th>PM_RUN_CYC (min)</th>
       <th>PM_RUN_CYC (max)</th>
-      <th>Instructions / Loop Iteration</th>
-      <th>Cycles / Loop Iteration</th>
+      <th>Grid Points</th>
     </tr>
   </thead>
   <tbody>
@@ -13914,14 +13966,13 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv .
       <td>32</td>
       <td>4</td>
       <td>0.0012</td>
-      <td>548153</td>
-      <td>2735</td>
-      <td>3888</td>
-      <td>266883</td>
-      <td>1237</td>
-      <td>4793</td>
-      <td>21.367188</td>
-      <td>9.664062</td>
+      <td>572978</td>
+      <td>2861</td>
+      <td>3639</td>
+      <td>261330</td>
+      <td>1235</td>
+      <td>4684</td>
+      <td>128</td>
     </tr>
     <tr>
       <th>1</th>
@@ -13929,14 +13980,13 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv .
       <td>32</td>
       <td>8</td>
       <td>0.0014</td>
-      <td>1082153</td>
-      <td>5405</td>
-      <td>6558</td>
-      <td>668819</td>
-      <td>3214</td>
-      <td>6623</td>
-      <td>21.113281</td>
-      <td>12.554688</td>
+      <td>1082978</td>
+      <td>5411</td>
+      <td>6189</td>
+      <td>601962</td>
+      <td>2914</td>
+      <td>5099</td>
+      <td>256</td>
     </tr>
     <tr>
       <th>2</th>
@@ -13944,44 +13994,41 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv .
       <td>32</td>
       <td>12</td>
       <td>0.0014</td>
-      <td>1442153</td>
-      <td>7205</td>
-      <td>8358</td>
-      <td>872913</td>
-      <td>4187</td>
-      <td>11640</td>
-      <td>18.763021</td>
-      <td>10.903646</td>
+      <td>1442978</td>
+      <td>7211</td>
+      <td>7989</td>
+      <td>811603</td>
+      <td>3992</td>
+      <td>5761</td>
+      <td>384</td>
     </tr>
     <tr>
       <th>3</th>
       <td>200</td>
       <td>32</td>
       <td>16</td>
-      <td>0.0015</td>
-      <td>1802153</td>
-      <td>9005</td>
-      <td>10158</td>
-      <td>1077532</td>
-      <td>5254</td>
-      <td>8147</td>
-      <td>17.587891</td>
-      <td>10.261719</td>
+      <td>0.0014</td>
+      <td>1802978</td>
+      <td>9011</td>
+      <td>9789</td>
+      <td>1017305</td>
+      <td>4988</td>
+      <td>7017</td>
+      <td>512</td>
     </tr>
     <tr>
       <th>4</th>
       <td>200</td>
       <td>32</td>
       <td>20</td>
-      <td>0.0016</td>
-      <td>2162153</td>
-      <td>10805</td>
-      <td>11958</td>
-      <td>1277957</td>
-      <td>6209</td>
-      <td>9015</td>
-      <td>16.882812</td>
-      <td>9.701562</td>
+      <td>0.0015</td>
+      <td>2162978</td>
+      <td>10811</td>
+      <td>11589</td>
+      <td>1221559</td>
+      <td>6002</td>
+      <td>7999</td>
+      <td>640</td>
     </tr>
   </tbody>
 </table>
@@ -13993,16 +14040,24 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv .
 </div>
 </div>
 
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>Let's have a look at the counters we've just measured and see how they scale with an increasing number of grid points.</p>
+<p><em>In the following, we are always using the minimal value of the counter (indicated by »(min)«) as this should give us an estimate of the best achievable result of the architecture.</em></p>
+
+</div>
+</div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[78]:</div>
+<div class="prompt input_prompt">In&nbsp;[3]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># Plot Cycles and Instructions - both per grid cell</span>
-<span class="n">fig</span><span class="p">,</span> <span class="p">(</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">)</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">sharex</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
-<span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Cycles / Loop Iteration&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax1</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
-<span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Instructions / Loop Iteration&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax2</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">fig</span><span class="p">,</span> <span class="p">(</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">)</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">sharex</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+<span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_RUN_CYC (min)&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax1</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+<span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_INST_CMPL (min)&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax2</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
 </pre></div>
 
     </div>
@@ -14021,7 +14076,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv .
 
 
 <div class="output_png output_subarea ">
-<img src="
+<img src="
 "
 >
 </div>
@@ -14035,7 +14090,176 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv .
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<p>What is your result? What value do the graphs come asymptotically close too?</p>
+<p>Although some slight variations can be seen for run cycles for many grid points, the correlation looks quite linear (as one would naively expect). Let's test that by fitting a linear function!</p>
+<p><em>The details of the fitting have been extracted into a dedicated function, <code>print_and_return_fit()</code>, in the <code>common.py</code> helper file. If you're interested, <a href="common.py">go have a look at it</a>.</em></p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[4]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="k">def</span> <span class="nf">linear_function</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">):</span>
+    <span class="k">return</span> <span class="n">a</span><span class="o">*</span><span class="n">x</span><span class="o">+</span><span class="n">b</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[25]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">fit_parameters</span><span class="p">,</span> <span class="n">fit_covariance</span> <span class="o">=</span> <span class="n">common</span><span class="o">.</span><span class="n">print_and_return_fit</span><span class="p">(</span>
+    <span class="p">[</span><span class="s2">&quot;PM_RUN_CYC (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;PM_INST_CMPL (min)&quot;</span><span class="p">],</span> 
+    <span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">),</span> 
+    <span class="n">linear_function</span><span class="p">,</span>
+    <span class="n">format_uncertainty</span><span class="o">=</span><span class="s2">&quot;.4f&quot;</span>
+<span class="p">)</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>Counter   PM_RUN_CYC (min) is proportional to the grid points (nx*ny) by a factor of  8.1021 (± 0.0057)
+Counter PM_INST_CMPL (min) is proportional to the grid points (nx*ny) by a factor of 14.0630 (± 0.0003)
+</pre>
+</div>
+</div>
+
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>Let's overlay our fits to the graphs from before.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[6]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">fig</span><span class="p">,</span> <span class="p">(</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">)</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">sharex</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+<span class="k">for</span> <span class="n">ax</span><span class="p">,</span> <span class="n">pmu_counter</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">([</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">],</span> <span class="p">[</span><span class="s2">&quot;PM_RUN_CYC (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;PM_INST_CMPL (min)&quot;</span><span class="p">]):</span>
+    <span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="n">pmu_counter</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+    <span class="n">ax</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span>
+        <span class="n">df</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">],</span> 
+        <span class="n">linear_function</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">],</span> <span class="o">*</span><span class="n">fit_parameters</span><span class="p">[</span><span class="n">pmu_counter</span><span class="p">]),</span> 
+        <span class="n">linestyle</span><span class="o">=</span><span class="s2">&quot;--&quot;</span><span class="p">,</span> 
+        <span class="n">label</span><span class="o">=</span><span class="s2">&quot;Fit: </span><span class="si">{:.2f}</span><span class="s2"> * x + </span><span class="si">{:.2f}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="o">*</span><span class="n">fit_parameters</span><span class="p">[</span><span class="n">pmu_counter</span><span class="p">])</span>
+    <span class="p">)</span>
+    <span class="n">ax</span><span class="o">.</span><span class="n">legend</span><span class="p">();</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+
+
+<div class="output_png output_subarea ">
+<img src="
+"
+>
+</div>
+
+</div>
+
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>Please execute the next cell to summarize the first task.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[38]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="nb">print</span><span class="p">(</span><span class="s2">&quot;The algorithm under investigation runs about </span><span class="si">{:.0f}</span><span class="s2"> cycles and executes about </span><span class="si">{:.0f}</span><span class="s2"> instructions per grid point&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
+    <span class="o">*</span><span class="p">[</span><span class="n">fit_parameters</span><span class="p">[</span><span class="n">pmu_counter</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">pmu_counter</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;PM_RUN_CYC (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;PM_INST_CMPL (min)&quot;</span><span class="p">]]</span>
+<span class="p">))</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>The algorithm under investigation runs about 8 cycles and executes about 14 instructions per grid point
+</pre>
+</div>
+</div>
+
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p><strong>Bonus:</strong></p>
+<p>The linear fits also yield a y-axis intercept (»<code>b</code>«). How do you interpret this value?</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>The y-axis intercept, that is, <code>b</code> of the linear fit, is the inherent overhead of the program execution. Even if our program did not compute any stencil operation at all for any grid point, it would still complete this many (~1800) instructions and run this many (~680) cycles. Interestingly, it is also the unparallelizable overhead of this (toy) example.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
 <p>We are revisiting the graph in a little while.</p>
 <p><a href="#toc">Back to top</a></p>
 
@@ -14048,7 +14272,8 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv .
 <h2 id="Task-2:-Measuring-Loads-and-Stores">Task 2: Measuring Loads and Stores<a class="anchor-link" href="#Task-2:-Measuring-Loads-and-Stores">&#182;</a></h2><p><a name="task2"></a></p>
 <p>Looking at the source code, how many loads and stores from / to memory do you expect? Have a look at the loop which we instrumented.</p>
 <p>Let's compare your estimate to what the system actually does!</p>
-<p><a name="task2-a"></a><strong>TASK A</strong>: Please measure counters for loads and stores. See the TODOs in <a href="/edit/Tasks/poisson2d.ld_st.c"><code>poisson2d.ld_st.c</code></a>. This time, implement <code>PM_LD_CMPL</code> and <code>PM_ST_CMPL</code>.</p>
+<h3 id="Task-A">Task A<a class="anchor-link" href="#Task-A">&#182;</a></h3><p><a name="task2-a"></a></p>
+<p>Please measure counters for loads and stores. See the TODOs in <a href="/edit/Tasks/poisson2d.ld_st.c"><code>poisson2d.ld_st.c</code></a>. This time, implement <code>PM_LD_CMPL</code> and <code>PM_ST_CMPL</code>.</p>
 <p>Compile with <code>make task2</code>, test your program with a single run with <code>make run_task2</code>, and then finally submit a benchmarking run to the batch system with <code>make bench_task2</code>. The following cell will take care of all this.</p>
 <p><a href="#toc">Back to top</a></p>
 
@@ -14057,7 +14282,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv .
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[11]:</div>
+<div class="prompt input_prompt">In&nbsp;[3]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="o">!</span>make bench_task2
@@ -14077,524 +14302,523 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv .
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>gcc -DUSE_DOUBLE -Ofast -std=c99 -lm -lpapi  poisson2d.ld_st.c -o poisson2d.ld_st.bin
-bsub -W 60 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.ld_st.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv
-Job &lt;4032&gt; is submitted to default queue &lt;batch&gt;.
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.ld_st.bin /gpfs/wolf/trn003/scratch/aherten//poisson2d.ld_st.bin.csv
+Job &lt;24416&gt; is submitted to default queue &lt;batch&gt;.
 &lt;&lt;Waiting for dispatch ...&gt;&gt;
 &lt;&lt;Starting on login1&gt;&gt;
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,4,0.0012,95115,474,789,21343,106,249
+200,32,4,0.0012,119819,598,817,32902,164,266
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,8,0.0014,137115,684,999,33343,166,309
+200,32,8,0.0013,161819,808,1027,56902,284,386
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,12,0.0014,197115,984,1299,45343,226,369
+200,32,12,0.0014,221819,1108,1327,71902,359,461
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,16,0.0015,257115,1284,1599,63343,316,459
+200,32,16,0.0015,281819,1408,1627,86902,434,536
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,20,0.0016,317115,1584,1899,75343,376,519
+200,32,20,0.0015,341819,1708,1927,101902,509,611
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,24,0.0016,377115,1884,2199,93343,466,609
+200,32,24,0.0016,401819,2008,2227,116902,584,686
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,28,0.0017,437115,2184,2499,105343,526,669
+200,32,28,0.0016,461819,2308,2527,131902,659,761
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,32,0.0017,497115,2484,2799,123343,616,759
+200,32,32,0.0018,521819,2608,2827,146902,734,836
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,36,0.0018,557115,2784,3099,135343,676,819
+200,32,36,0.0018,581819,2908,3127,161902,809,911
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,40,0.0020,617115,3084,3399,153343,766,909
+200,32,40,0.0018,641819,3208,3427,176902,884,986
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,44,0.0019,677115,3384,3699,165343,826,969
+200,32,44,0.0019,701819,3508,3727,191902,959,1061
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,48,0.0020,737115,3684,3999,183343,916,1059
+200,32,48,0.0020,761819,3808,4027,206902,1034,1136
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,52,0.0021,797115,3984,4299,195343,976,1119
+200,32,52,0.0020,821819,4108,4327,221902,1109,1211
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,56,0.0021,857115,4284,4599,213343,1066,1209
+200,32,56,0.0021,881819,4408,4627,236902,1184,1286
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,60,0.0023,917115,4584,4899,225343,1126,1269
+200,32,60,0.0022,941819,4708,4927,251902,1259,1361
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,64,0.0023,977115,4884,5199,243343,1216,1359
+200,32,64,0.0023,1001819,5008,5227,266902,1334,1436
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,68,0.0024,1037115,5184,5499,255343,1276,1419
+200,32,68,0.0023,1061819,5308,5527,281902,1409,1511
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,72,0.0025,1097115,5484,5799,273343,1366,1509
+200,32,72,0.0025,1121819,5608,5827,296902,1484,1586
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,76,0.0025,1157115,5784,6099,285343,1426,1569
+200,32,76,0.0028,1181819,5908,6127,311902,1559,1661
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,80,0.0025,1217115,6084,6399,303343,1516,1659
+200,32,80,0.0025,1241819,6208,6427,326902,1634,1736
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,84,0.0026,1277115,6384,6699,315343,1576,1719
+200,32,84,0.0026,1301819,6508,6727,341902,1709,1811
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,88,0.0027,1337115,6684,6999,333343,1666,1809
+200,32,88,0.0026,1361819,6808,7027,356902,1784,1886
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,92,0.0027,1397115,6984,7299,345343,1726,1869
+200,32,92,0.0027,1421819,7108,7327,371902,1859,1961
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,96,0.0028,1457115,7284,7599,363343,1816,1959
+200,32,96,0.0028,1481819,7408,7627,386902,1934,2036
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,100,0.0029,1517115,7584,7899,375343,1876,2019
+200,32,100,0.0029,1541819,7708,7927,401902,2009,2111
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,104,0.0029,1577115,7884,8199,393343,1966,2109
+200,32,104,0.0029,1601819,8008,8227,416902,2084,2186
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,108,0.0030,1637115,8184,8499,405343,2026,2169
+200,32,108,0.0031,1661819,8308,8527,431902,2159,2261
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,112,0.0030,1697115,8484,8799,423343,2116,2259
+200,32,112,0.0030,1721819,8608,8827,446902,2234,2336
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,116,0.0031,1757115,8784,9099,435343,2176,2319
+200,32,116,0.0031,1781819,8908,9127,461902,2309,2411
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,120,0.0033,1817115,9084,9399,453343,2266,2409
+200,32,120,0.0032,1841819,9208,9427,476902,2384,2486
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,124,0.0032,1877115,9384,9699,465343,2326,2469
+200,32,124,0.0033,1901819,9508,9727,491902,2459,2561
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,128,0.0033,1937115,9684,9999,483343,2416,2559
+200,32,128,0.0033,1961819,9808,10027,506902,2534,2636
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,132,0.0034,1997115,9984,10299,495343,2476,2619
+200,32,132,0.0034,2021819,10108,10327,521902,2609,2711
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,136,0.0035,2057115,10284,10599,513343,2566,2709
+200,32,136,0.0035,2081819,10408,10627,536902,2684,2786
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,140,0.0035,2117115,10584,10899,525343,2626,2769
+200,32,140,0.0036,2141819,10708,10927,551902,2759,2861
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,144,0.0036,2177115,10884,11199,543343,2716,2859
+200,32,144,0.0036,2201819,11008,11227,566902,2834,2936
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,148,0.0036,2237115,11184,11499,555343,2776,2919
+200,32,148,0.0036,2261819,11308,11527,581902,2909,3011
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,152,0.0037,2297115,11484,11799,573343,2866,3009
+200,32,152,0.0037,2321819,11608,11827,596902,2984,3086
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,156,0.0038,2357115,11784,12099,585343,2926,3069
+200,32,156,0.0038,2381819,11908,12127,611902,3059,3161
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,160,0.0038,2417115,12084,12399,603343,3016,3159
+200,32,160,0.0040,2441819,12208,12427,626902,3134,3236
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,164,0.0039,2477115,12384,12699,615343,3076,3219
+200,32,164,0.0039,2501819,12508,12727,641902,3209,3311
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,168,0.0039,2537115,12684,12999,633343,3166,3309
+200,32,168,0.0040,2561819,12808,13027,656902,3284,3386
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,172,0.0040,2597115,12984,13299,645343,3226,3369
+200,32,172,0.0040,2621819,13108,13327,671902,3359,3461
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,176,0.0041,2657115,13284,13599,663343,3316,3459
+200,32,176,0.0041,2681819,13408,13627,686902,3434,3536
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,180,0.0041,2717115,13584,13899,675343,3376,3519
+200,32,180,0.0041,2741819,13708,13927,701902,3509,3611
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,184,0.0042,2777115,13884,14199,693343,3466,3609
+200,32,184,0.0042,2801819,14008,14227,716902,3584,3686
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,188,0.0043,2837115,14184,14499,705343,3526,3669
+200,32,188,0.0044,2861819,14308,14527,731902,3659,3761
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,192,0.0043,2897115,14484,14799,723343,3616,3759
+200,32,192,0.0044,2921819,14608,14827,746902,3734,3836
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,196,0.0044,2957115,14784,15099,735343,3676,3819
+200,32,196,0.0045,2981819,14908,15127,761902,3809,3911
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,200,0.0045,3017115,15084,15399,753343,3766,3909
+200,32,200,0.0045,3041819,15208,15427,776902,3884,3986
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,204,0.0045,3077115,15384,15699,765343,3826,3969
+200,32,204,0.0045,3101819,15508,15727,791902,3959,4061
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,208,0.0046,3137115,15684,15999,783343,3916,4059
+200,32,208,0.0046,3161819,15808,16027,806902,4034,4136
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,212,0.0047,3197115,15984,16299,795343,3976,4119
+200,32,212,0.0047,3221819,16108,16327,821902,4109,4211
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,216,0.0047,3257115,16284,16599,813343,4066,4209
+200,32,216,0.0047,3281819,16408,16627,836902,4184,4286
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,220,0.0048,3317115,16584,16899,825343,4126,4269
+200,32,220,0.0048,3341819,16708,16927,851902,4259,4361
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,224,0.0049,3377115,16884,17199,843343,4216,4359
+200,32,224,0.0049,3401819,17008,17227,866902,4334,4436
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,228,0.0049,3437115,17184,17499,855343,4276,4419
+200,32,228,0.0050,3461819,17308,17527,881902,4409,4511
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,232,0.0050,3497115,17484,17799,873343,4366,4509
+200,32,232,0.0050,3521819,17608,17827,896902,4484,4586
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,236,0.0051,3557115,17784,18099,885343,4426,4569
+200,32,236,0.0051,3581819,17908,18127,911902,4559,4661
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,240,0.0052,3617115,18084,18399,903343,4516,4659
+200,32,240,0.0051,3641819,18208,18427,926902,4634,4736
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,244,0.0052,3677115,18384,18699,915343,4576,4719
+200,32,244,0.0052,3701819,18508,18727,941902,4709,4811
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,248,0.0052,3737115,18684,18999,933343,4666,4809
+200,32,248,0.0053,3761819,18808,19027,956902,4784,4886
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,252,0.0054,3797115,18984,19299,945343,4726,4869
+200,32,252,0.0053,3821819,19108,19327,971902,4859,4961
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,256,0.0054,3857115,19284,19599,963343,4816,4959
+200,32,256,0.0054,3881819,19408,19627,986902,4934,5036
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,260,0.0054,3917115,19584,19899,975343,4876,5019
+200,32,260,0.0055,3941819,19708,19927,1001902,5009,5111
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,264,0.0055,3977115,19884,20199,993343,4966,5109
+200,32,264,0.0055,4001819,20008,20227,1016902,5084,5186
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,268,0.0056,4037115,20184,20499,1005343,5026,5169
+200,32,268,0.0056,4061819,20308,20527,1031902,5159,5261
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,272,0.0056,4097115,20484,20799,1023343,5116,5259
+200,32,272,0.0057,4121819,20608,20827,1046902,5234,5336
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,276,0.0057,4157115,20784,21099,1035343,5176,5319
+200,32,276,0.0057,4181819,20908,21127,1061902,5309,5411
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,280,0.0057,4217115,21084,21399,1053343,5266,5409
+200,32,280,0.0058,4241819,21208,21427,1076902,5384,5486
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,284,0.0058,4277115,21384,21699,1065343,5326,5469
+200,32,284,0.0059,4301819,21508,21727,1091902,5459,5561
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,288,0.0059,4337115,21684,21999,1083343,5416,5559
+200,32,288,0.0059,4361819,21808,22027,1106902,5534,5636
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,292,0.0059,4397115,21984,22299,1095343,5476,5619
+200,32,292,0.0060,4421819,22108,22327,1121902,5609,5711
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,296,0.0061,4457115,22284,22599,1113343,5566,5709
+200,32,296,0.0061,4481819,22408,22627,1136902,5684,5786
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,300,0.0061,4517115,22584,22899,1125343,5626,5769
+200,32,300,0.0061,4541819,22708,22927,1151902,5759,5861
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,304,0.0061,4577115,22884,23199,1143343,5716,5859
+200,32,304,0.0062,4601819,23008,23227,1166902,5834,5936
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,308,0.0062,4637115,23184,23499,1155343,5776,5919
+200,32,308,0.0063,4661819,23308,23527,1181902,5909,6011
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,312,0.0063,4697115,23484,23799,1173343,5866,6009
+200,32,312,0.0064,4721819,23608,23827,1196902,5984,6086
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,316,0.0064,4757115,23784,24099,1185343,5926,6069
+200,32,316,0.0066,4781819,23908,24127,1211902,6059,6161
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,320,0.0064,4817115,24084,24399,1203343,6016,6159
+200,32,320,0.0065,4841819,24208,24427,1226902,6134,6236
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,324,0.0065,4877115,24384,24699,1215343,6076,6219
+200,32,324,0.0065,4901819,24508,24727,1241902,6209,6311
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,328,0.0065,4937115,24684,24999,1233343,6166,6309
+200,32,328,0.0069,4961819,24808,25027,1256902,6284,6386
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,332,0.0066,4997115,24984,25299,1245343,6226,6369
+200,32,332,0.0066,5021819,25108,25327,1271902,6359,6461
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,336,0.0066,5057115,25284,25599,1263343,6316,6459
+200,32,336,0.0067,5081819,25408,25627,1286902,6434,6536
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,340,0.0068,5117115,25584,25899,1275343,6376,6519
+200,32,340,0.0068,5141819,25708,25927,1301902,6509,6611
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,344,0.0068,5177115,25884,26199,1293343,6466,6609
+200,32,344,0.0069,5201819,26008,26227,1316902,6584,6686
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,348,0.0069,5237115,26184,26499,1305343,6526,6669
+200,32,348,0.0069,5261819,26308,26527,1331902,6659,6761
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,352,0.0071,5297115,26484,26799,1323343,6616,6759
+200,32,352,0.0070,5321819,26608,26827,1346902,6734,6836
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,356,0.0070,5357115,26784,27099,1335343,6676,6819
+200,32,356,0.0070,5381819,26908,27127,1361902,6809,6911
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,360,0.0070,5417115,27084,27399,1353343,6766,6909
+200,32,360,0.0071,5441819,27208,27427,1376902,6884,6986
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,364,0.0071,5477115,27384,27699,1365343,6826,6969
+200,32,364,0.0072,5501819,27508,27727,1391902,6959,7061
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,368,0.0072,5537115,27684,27999,1383343,6916,7059
+200,32,368,0.0072,5561819,27808,28027,1406902,7034,7136
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,372,0.0073,5597115,27984,28299,1395343,6976,7119
+200,32,372,0.0073,5621819,28108,28327,1421902,7109,7211
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,376,0.0073,5657115,28284,28599,1413343,7066,7209
+200,32,376,0.0074,5681819,28408,28627,1436902,7184,7286
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,380,0.0074,5717115,28584,28899,1425343,7126,7269
+200,32,380,0.0074,5741819,28708,28927,1451902,7259,7361
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,384,0.0074,5777115,28884,29199,1443343,7216,7359
+200,32,384,0.0075,5801819,29008,29227,1466902,7334,7436
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,388,0.0075,5837115,29184,29499,1455343,7276,7419
+200,32,388,0.0076,5861819,29308,29527,1481902,7409,7511
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,392,0.0076,5897115,29484,29799,1473343,7366,7509
+200,32,392,0.0076,5921819,29608,29827,1496902,7484,7586
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,396,0.0076,5957115,29784,30099,1485343,7426,7569
+200,32,396,0.0077,5981819,29908,30127,1511902,7559,7661
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,400,0.0078,6017115,30084,30399,1503343,7516,7659
+200,32,400,0.0078,6041819,30208,30427,1526902,7634,7736
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,404,0.0078,6077115,30384,30699,1515343,7576,7719
+200,32,404,0.0079,6101819,30508,30727,1541902,7709,7811
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,408,0.0078,6137115,30684,30999,1533343,7666,7809
+200,32,408,0.0079,6161819,30808,31027,1556902,7784,7886
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,412,0.0079,6197115,30984,31299,1545343,7726,7869
+200,32,412,0.0080,6221819,31108,31327,1571902,7859,7961
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,416,0.0080,6257115,31284,31599,1563343,7816,7959
+200,32,416,0.0081,6281819,31408,31627,1586902,7934,8036
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,420,0.0080,6317115,31584,31899,1575343,7876,8019
+200,32,420,0.0081,6341819,31708,31927,1601902,8009,8111
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,424,0.0081,6377115,31884,32199,1593343,7966,8109
+200,32,424,0.0082,6401819,32008,32227,1616902,8084,8186
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,428,0.0081,6437115,32184,32499,1605343,8026,8169
+200,32,428,0.0082,6461819,32308,32527,1631902,8159,8261
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,432,0.0082,6497115,32484,32799,1623343,8116,8259
+200,32,432,0.0085,6521819,32608,32827,1646902,8234,8336
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,436,0.0083,6557115,32784,33099,1635343,8176,8319
+200,32,436,0.0084,6581819,32908,33127,1661902,8309,8411
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,440,0.0083,6617115,33084,33399,1653343,8266,8409
+200,32,440,0.0084,6641819,33208,33427,1676902,8384,8486
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,444,0.0084,6677115,33384,33699,1665343,8326,8469
+200,32,444,0.0085,6701819,33508,33727,1691902,8459,8561
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,448,0.0085,6737115,33684,33999,1683343,8416,8559
+200,32,448,0.0087,6761819,33808,34027,1706902,8534,8636
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,452,0.0085,6797115,33984,34299,1695343,8476,8619
+200,32,452,0.0087,6821819,34108,34327,1721902,8609,8711
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,456,0.0086,6857115,34284,34599,1713343,8566,8709
+200,32,456,0.0087,6881819,34408,34627,1736902,8684,8786
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,460,0.0087,6917115,34584,34899,1725343,8626,8769
+200,32,460,0.0088,6941819,34708,34927,1751902,8759,8861
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,464,0.0088,6977115,34884,35199,1743343,8716,8859
+200,32,464,0.0088,7001819,35008,35227,1766902,8834,8936
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,468,0.0088,7037115,35184,35499,1755343,8776,8919
+200,32,468,0.0089,7061819,35308,35527,1781902,8909,9011
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,472,0.0089,7097115,35484,35799,1773343,8866,9009
+200,32,472,0.0090,7121819,35608,35827,1796902,8984,9086
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,476,0.0090,7157115,35784,36099,1785343,8926,9069
+200,32,476,0.0091,7181819,35908,36127,1811902,9059,9161
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,480,0.0090,7217115,36084,36399,1803343,9016,9159
+200,32,480,0.0091,7241819,36208,36427,1826902,9134,9236
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,484,0.0091,7277115,36384,36699,1815343,9076,9219
+200,32,484,0.0092,7301819,36508,36727,1841902,9209,9311
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,488,0.0091,7337115,36684,36999,1833343,9166,9309
+200,32,488,0.0093,7361819,36808,37027,1856902,9284,9386
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,492,0.0092,7397115,36984,37299,1845343,9226,9369
+200,32,492,0.0094,7421819,37108,37327,1871902,9359,9461
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,496,0.0093,7457115,37284,37599,1863343,9316,9459
+200,32,496,0.0095,7481819,37408,37627,1886902,9434,9536
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,500,0.0093,7517115,37584,37899,1875343,9376,9519
+200,32,500,0.0094,7541819,37708,37927,1901902,9509,9611
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,504,0.0094,7577115,37884,38199,1893343,9466,9609
+200,32,504,0.0095,7601819,38008,38227,1916902,9584,9686
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,508,0.0095,7637115,38184,38499,1905343,9526,9669
+200,32,508,0.0096,7661819,38308,38527,1931902,9659,9761
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,512,0.0095,7697115,38484,38799,1923343,9616,9759
+200,32,512,0.0097,7721819,38608,38827,1946902,9734,9836
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,516,0.0096,7757115,38784,39099,1938343,9691,9834
+200,32,516,0.0098,7781819,38908,39127,1961902,9809,9911
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,520,0.0097,7817115,39084,39399,1953343,9766,9909
+200,32,520,0.0098,7841819,39208,39427,1976902,9884,9986
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,524,0.0097,7877115,39384,39699,1968343,9841,9984
+200,32,524,0.0099,7901819,39508,39727,1991902,9959,10061
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,528,0.0098,7937115,39684,39999,1983343,9916,10059
+200,32,528,0.0099,7961819,39808,40027,2006902,10034,10136
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,532,0.0099,7997115,39984,40299,1998343,9991,10134
+200,32,532,0.0100,8021819,40108,40327,2021902,10109,10211
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,536,0.0100,8057115,40284,40599,2013343,10066,10209
+200,32,536,0.0101,8081819,40408,40627,2036902,10184,10286
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,540,0.0101,8117115,40584,40899,2028343,10141,10284
+200,32,540,0.0101,8141819,40708,40927,2051902,10259,10361
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,544,0.0101,8177115,40884,41199,2043343,10216,10359
+200,32,544,0.0103,8201819,41008,41227,2066902,10334,10436
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,548,0.0102,8237115,41184,41499,2058343,10291,10434
+200,32,548,0.0103,8261819,41308,41527,2081902,10409,10511
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,552,0.0103,8297115,41484,41799,2073343,10366,10509
+200,32,552,0.0104,8321819,41608,41827,2096902,10484,10586
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,556,0.0104,8357115,41784,42099,2088343,10441,10584
+200,32,556,0.0106,8381819,41908,42127,2111902,10559,10661
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,560,0.0104,8417115,42084,42399,2103343,10516,10659
+200,32,560,0.0106,8441819,42208,42427,2126902,10634,10736
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,564,0.0105,8477115,42384,42699,2118343,10591,10734
+200,32,564,0.0106,8501819,42508,42727,2141902,10709,10811
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,568,0.0106,8537115,42684,42999,2133343,10666,10809
+200,32,568,0.0107,8561819,42808,43027,2156902,10784,10886
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,572,0.0106,8597115,42984,43299,2148343,10741,10884
+200,32,572,0.0108,8621819,43108,43327,2171902,10859,10961
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,576,0.0107,8657115,43284,43599,2163343,10816,10959
+200,32,576,0.0109,8681819,43408,43627,2186902,10934,11036
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,580,0.0109,8717115,43584,43899,2178343,10891,11034
+200,32,580,0.0110,8741819,43708,43927,2201902,11009,11111
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,584,0.0108,8777115,43884,44199,2193343,10966,11109
+200,32,584,0.0110,8801819,44008,44227,2216902,11084,11186
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,588,0.0110,8837115,44184,44499,2208343,11041,11184
+200,32,588,0.0110,8861819,44308,44527,2231902,11159,11261
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,592,0.0110,8897115,44484,44799,2223343,11116,11259
+200,32,592,0.0111,8921819,44608,44827,2246902,11234,11336
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,596,0.0111,8957115,44784,45099,2238343,11191,11334
+200,32,596,0.0113,8981819,44908,45127,2261902,11309,11411
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,600,0.0111,9017115,45084,45399,2253343,11266,11409
+200,32,600,0.0113,9041819,45208,45427,2276902,11384,11486
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,604,0.0112,9077115,45384,45699,2268343,11341,11484
+200,32,604,0.0114,9101819,45508,45727,2291902,11459,11561
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,608,0.0113,9137115,45684,45999,2283343,11416,11559
+200,32,608,0.0115,9161819,45808,46027,2306902,11534,11636
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,612,0.0113,9197115,45984,46299,2298343,11491,11634
+200,32,612,0.0115,9221819,46108,46327,2321902,11609,11711
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,616,0.0114,9257115,46284,46599,2313343,11566,11709
+200,32,616,0.0115,9281819,46408,46627,2336902,11684,11786
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,620,0.0115,9317115,46584,46899,2328343,11641,11784
+200,32,620,0.0116,9341819,46708,46927,2351902,11759,11861
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,624,0.0115,9377115,46884,47199,2343343,11716,11859
+200,32,624,0.0117,9401819,47008,47227,2366902,11834,11936
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,628,0.0115,9437115,47184,47499,2358343,11791,11934
+200,32,628,0.0117,9461819,47308,47527,2381902,11909,12011
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,632,0.0117,9497115,47484,47799,2373343,11866,12009
+200,32,632,0.0118,9521819,47608,47827,2396902,11984,12086
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,636,0.0118,9557115,47784,48099,2388343,11941,12084
+200,32,636,0.0119,9581819,47908,48127,2411902,12059,12161
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,640,0.0119,9617115,48084,48399,2403343,12016,12159
+200,32,640,0.0119,9641819,48208,48427,2426902,12134,12236
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,644,0.0118,9677115,48384,48699,2418343,12091,12234
+200,32,644,0.0121,9701819,48508,48727,2441902,12209,12311
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,648,0.0119,9737115,48684,48999,2433343,12166,12309
+200,32,648,0.0121,9761819,48808,49027,2456902,12284,12386
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,652,0.0121,9797115,48984,49299,2448343,12241,12384
+200,32,652,0.0121,9821819,49108,49327,2471902,12359,12461
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,656,0.0121,9857115,49284,49599,2463343,12316,12459
+200,32,656,0.0122,9881819,49408,49627,2486902,12434,12536
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,660,0.0122,9917115,49584,49899,2478343,12391,12534
+200,32,660,0.0123,9941819,49708,49927,2501902,12509,12611
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,664,0.0122,9977115,49884,50199,2493343,12466,12609
+200,32,664,0.0123,10001819,50008,50227,2516902,12584,12686
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,668,0.0123,10037115,50184,50499,2508343,12541,12684
+200,32,668,0.0124,10061819,50308,50527,2531902,12659,12761
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,672,0.0123,10097115,50484,50799,2523343,12616,12759
+200,32,672,0.0124,10121819,50608,50827,2546902,12734,12836
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,676,0.0125,10157115,50784,51099,2538343,12691,12834
+200,32,676,0.0126,10181819,50908,51127,2561902,12809,12911
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,680,0.0124,10217115,51084,51399,2553343,12766,12909
+200,32,680,0.0126,10241819,51208,51427,2576902,12884,12986
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,684,0.0125,10277115,51384,51699,2568343,12841,12984
+200,32,684,0.0127,10301819,51508,51727,2591902,12959,13061
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,688,0.0126,10337115,51684,51999,2583343,12916,13059
+200,32,688,0.0128,10361819,51808,52027,2606902,13034,13136
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,692,0.0126,10397115,51984,52299,2598343,12991,13134
+200,32,692,0.0128,10421819,52108,52327,2621902,13109,13211
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,696,0.0127,10457115,52284,52599,2613343,13066,13209
+200,32,696,0.0129,10481819,52408,52627,2636902,13184,13286
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,700,0.0128,10517115,52584,52899,2628343,13141,13284
+200,32,700,0.0131,10541819,52708,52927,2651902,13259,13361
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,704,0.0129,10577115,52884,53199,2643343,13216,13359
+200,32,704,0.0131,10601819,53008,53227,2666902,13334,13436
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,708,0.0129,10637115,53184,53499,2658343,13291,13434
+200,32,708,0.0130,10661819,53308,53527,2681902,13409,13511
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,712,0.0129,10697115,53484,53799,2673343,13366,13509
+200,32,712,0.0131,10721819,53608,53827,2696902,13484,13586
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,716,0.0130,10757115,53784,54099,2688343,13441,13584
+200,32,716,0.0132,10781819,53908,54127,2711902,13559,13661
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,720,0.0130,10817115,54084,54399,2703343,13516,13659
+200,32,720,0.0132,10841819,54208,54427,2726902,13634,13736
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,724,0.0132,10877115,54384,54699,2718343,13591,13734
+200,32,724,0.0134,10901819,54508,54727,2741902,13709,13811
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,728,0.0131,10937115,54684,54999,2733343,13666,13809
+200,32,728,0.0134,10961819,54808,55027,2756902,13784,13886
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,732,0.0133,10997115,54984,55299,2748343,13741,13884
+200,32,732,0.0134,11021819,55108,55327,2771902,13859,13961
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,736,0.0135,11057115,55284,55599,2763343,13816,13959
+200,32,736,0.0135,11081819,55408,55627,2786902,13934,14036
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,740,0.0134,11117115,55584,55899,2778343,13891,14034
+200,32,740,0.0137,11141819,55708,55927,2801902,14009,14111
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,744,0.0134,11177115,55884,56199,2793343,13966,14109
+200,32,744,0.0138,11201819,56008,56227,2816902,14084,14186
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,748,0.0135,11237115,56184,56499,2808343,14041,14184
+200,32,748,0.0137,11261819,56308,56527,2831902,14159,14261
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,752,0.0136,11297115,56484,56799,2823343,14116,14259
+200,32,752,0.0138,11321819,56608,56827,2846902,14234,14336
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,756,0.0136,11357115,56784,57099,2838343,14191,14334
+200,32,756,0.0139,11381819,56908,57127,2861902,14309,14411
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,760,0.0138,11417115,57084,57399,2853343,14266,14409
+200,32,760,0.0140,11441819,57208,57427,2876902,14384,14486
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,764,0.0139,11477115,57384,57699,2868343,14341,14484
+200,32,764,0.0140,11501819,57508,57727,2891902,14459,14561
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,768,0.0138,11537115,57684,57999,2883343,14416,14559
+200,32,768,0.0141,11561819,57808,58027,2906902,14534,14636
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,772,0.0140,11597115,57984,58299,2898343,14491,14634
+200,32,772,0.0141,11621819,58108,58327,2921902,14609,14711
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,776,0.0140,11657115,58284,58599,2913343,14566,14709
+200,32,776,0.0142,11681819,58408,58627,2936902,14684,14786
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,780,0.0142,11717115,58584,58899,2928343,14641,14784
+200,32,780,0.0143,11741819,58708,58927,2951902,14759,14861
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,784,0.0141,11777115,58884,59199,2943343,14716,14859
+200,32,784,0.0144,11801819,59008,59227,2966902,14834,14936
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,788,0.0143,11837115,59184,59499,2958343,14791,14934
+200,32,788,0.0144,11861819,59308,59527,2981902,14909,15011
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,792,0.0143,11897115,59484,59799,2973343,14866,15009
+200,32,792,0.0145,11921819,59608,59827,2996902,14984,15086
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,796,0.0146,11957115,59784,60099,2988343,14941,15084
+200,32,796,0.0145,11981819,59908,60127,3011902,15059,15161
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,800,0.0144,12017115,60084,60399,3003343,15016,15159
+200,32,800,0.0147,12041819,60208,60427,3026902,15134,15236
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,804,0.0145,12077115,60384,60699,3018343,15091,15234
+200,32,804,0.0147,12101819,60508,60727,3041902,15209,15311
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,808,0.0146,12137115,60684,60999,3033343,15166,15309
+200,32,808,0.0148,12161819,60808,61027,3056902,15284,15386
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,812,0.0146,12197115,60984,61299,3048343,15241,15384
+200,32,812,0.0148,12221819,61108,61327,3071902,15359,15461
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,816,0.0146,12257115,61284,61599,3063343,15316,15459
+200,32,816,0.0150,12281819,61408,61627,3086902,15434,15536
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,820,0.0148,12317115,61584,61899,3078343,15391,15534
+200,32,820,0.0149,12341819,61708,61927,3101902,15509,15611
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,824,0.0149,12377115,61884,62199,3093343,15466,15609
+200,32,824,0.0150,12401819,62008,62227,3116902,15584,15686
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,828,0.0149,12437115,62184,62499,3108343,15541,15684
+200,32,828,0.0151,12461819,62308,62527,3131902,15659,15761
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,832,0.0149,12497115,62484,62799,3123343,15616,15759
+200,32,832,0.0152,12521819,62608,62827,3146902,15734,15836
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,836,0.0151,12557115,62784,63099,3138343,15691,15834
+200,32,836,0.0152,12581819,62908,63127,3161902,15809,15911
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,840,0.0150,12617115,63084,63399,3153343,15766,15909
+200,32,840,0.0153,12641819,63208,63427,3176902,15884,15986
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,844,0.0152,12677115,63384,63699,3168343,15841,15984
+200,32,844,0.0153,12701819,63508,63727,3191902,15959,16061
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,848,0.0152,12737115,63684,63999,3183343,15916,16059
+200,32,848,0.0154,12761819,63808,64027,3206902,16034,16136
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,852,0.0153,12797115,63984,64299,3198343,15991,16134
+200,32,852,0.0155,12821819,64108,64327,3221902,16109,16211
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,856,0.0153,12857115,64284,64599,3213343,16066,16209
+200,32,856,0.0156,12881819,64408,64627,3236902,16184,16286
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,860,0.0155,12917115,64584,64899,3228343,16141,16284
+200,32,860,0.0156,12941819,64708,64927,3251902,16259,16361
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,864,0.0156,12977115,64884,65199,3243343,16216,16359
+200,32,864,0.0157,13001819,65008,65227,3266902,16334,16436
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,868,0.0157,13037115,65184,65499,3258343,16291,16434
+200,32,868,0.0158,13061819,65308,65527,3281902,16409,16511
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,872,0.0156,13097115,65484,65799,3273343,16366,16509
+200,32,872,0.0159,13121819,65608,65827,3296902,16484,16586
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,876,0.0157,13157115,65784,66099,3288343,16441,16584
+200,32,876,0.0159,13181819,65908,66127,3311902,16559,16661
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,880,0.0158,13217115,66084,66399,3303343,16516,16659
+200,32,880,0.0160,13241819,66208,66427,3326902,16634,16736
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,884,0.0158,13277115,66384,66699,3318343,16591,16734
+200,32,884,0.0160,13301819,66508,66727,3341902,16709,16811
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,888,0.0159,13337115,66684,66999,3333343,16666,16809
+200,32,888,0.0161,13361819,66808,67027,3356902,16784,16886
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,892,0.0160,13397115,66984,67299,3348343,16741,16884
+200,32,892,0.0162,13421819,67108,67327,3371902,16859,16961
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,896,0.0161,13457115,67284,67599,3363343,16816,16959
+200,32,896,0.0163,13481819,67408,67627,3386902,16934,17036
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,900,0.0162,13517115,67584,67899,3378343,16891,17034
+200,32,900,0.0164,13541819,67708,67927,3401902,17009,17111
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,904,0.0163,13577115,67884,68199,3393343,16966,17109
+200,32,904,0.0165,13601819,68008,68227,3416902,17084,17186
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,908,0.0164,13637115,68184,68499,3408343,17041,17184
+200,32,908,0.0165,13661819,68308,68527,3431902,17159,17261
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,912,0.0165,13697115,68484,68799,3423343,17116,17259
+200,32,912,0.0166,13721819,68608,68827,3446902,17234,17336
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,916,0.0165,13757115,68784,69099,3438343,17191,17334
+200,32,916,0.0166,13781819,68908,69127,3461902,17309,17411
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,920,0.0165,13817115,69084,69399,3453343,17266,17409
+200,32,920,0.0167,13841819,69208,69427,3476902,17384,17486
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,924,0.0168,13877115,69384,69699,3468343,17341,17484
+200,32,924,0.0168,13901819,69508,69727,3491902,17459,17561
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,928,0.0167,13937115,69684,69999,3483343,17416,17559
+200,32,928,0.0169,13961819,69808,70027,3506902,17534,17636
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,932,0.0169,13997115,69984,70299,3498343,17491,17634
+200,32,932,0.0175,14021819,70108,70327,3521902,17609,17711
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,936,0.0168,14057115,70284,70599,3513343,17566,17709
+200,32,936,0.0170,14081819,70408,70627,3536902,17684,17786
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,940,0.0169,14117115,70584,70899,3528343,17641,17784
+200,32,940,0.0171,14141819,70708,70927,3551902,17759,17861
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,944,0.0169,14177115,70884,71199,3543343,17716,17859
+200,32,944,0.0171,14201819,71008,71227,3566902,17834,17936
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,948,0.0170,14237115,71184,71499,3558343,17791,17934
+200,32,948,0.0172,14261819,71308,71527,3581902,17909,18011
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,952,0.0171,14297115,71484,71799,3573343,17866,18009
+200,32,952,0.0172,14321819,71608,71827,3596902,17984,18086
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,956,0.0173,14357115,71784,72099,3588343,17941,18084
+200,32,956,0.0173,14381819,71908,72127,3611902,18059,18161
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,960,0.0172,14417115,72084,72399,3603343,18016,18159
+200,32,960,0.0174,14441819,72208,72427,3626902,18134,18236
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,964,0.0177,14477115,72384,72699,3618343,18091,18234
+200,32,964,0.0176,14501819,72508,72727,3641902,18209,18311
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,968,0.0177,14537115,72684,72999,3633343,18166,18309
+200,32,968,0.0178,14561819,72808,73027,3656902,18284,18386
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,972,0.0177,14597115,72984,73299,3648343,18241,18384
+200,32,972,0.0177,14621819,73108,73327,3671902,18359,18461
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,976,0.0179,14657115,73284,73599,3663343,18316,18459
+200,32,976,0.0178,14681819,73408,73627,3686902,18434,18536
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,980,0.0180,14717115,73584,73899,3678343,18391,18534
+200,32,980,0.0179,14741819,73708,73927,3701902,18509,18611
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,984,0.0180,14777115,73884,74199,3693343,18466,18609
+200,32,984,0.0179,14801819,74008,74227,3716902,18584,18686
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,988,0.0180,14837115,74184,74499,3708343,18541,18684
+200,32,988,0.0180,14861819,74308,74527,3731902,18659,18761
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,992,0.0181,14897115,74484,74799,3723343,18616,18759
+200,32,992,0.0181,14921819,74608,74827,3746902,18734,18836
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,996,0.0184,14957115,74784,75099,3738343,18691,18834
+200,32,996,0.0182,14981819,74908,75127,3761902,18809,18911
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,1000,0.0182,15017115,75084,75399,3753343,18766,18909
+200,32,1000,0.0182,15041819,75208,75427,3776902,18884,18986
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,1004,0.0183,15077115,75384,75699,3768343,18841,18984
+200,32,1004,0.0183,15101819,75508,75727,3791902,18959,19061
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,1008,0.0184,15137115,75684,75999,3783343,18916,19059
+200,32,1008,0.0183,15161819,75808,76027,3806902,19034,19136
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,1012,0.0185,15197115,75984,76299,3798343,18991,19134
+200,32,1012,0.0184,15221819,76108,76327,3821902,19109,19211
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,1016,0.0185,15257115,76284,76599,3813343,19066,19209
+200,32,1016,0.0185,15281819,76408,76627,3836902,19184,19286
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,1020,0.0186,15317115,76584,76899,3828343,19141,19284
+200,32,1020,0.0185,15341819,76708,76927,3851902,19259,19361
 iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)
-200,32,1024,0.0183,15377115,76884,77199,3843343,19216,19359
-mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv .
+200,32,1024,0.0186,15401819,77008,77227,3866902,19334,19436
+mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.ld_st.bin.csv .
 </pre>
 </div>
 </div>
@@ -14606,19 +14830,18 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv .
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<p>Once the run finished, let's plot it again with the following cell (non-interactive: <code>make graph_task2a</code>).</p>
+<p>Once the run has finished, let's plot it again over the course of the following cells (non-interactive: <code>make graph_task2a</code>).</p>
 
 </div>
 </div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[6]:</div>
+<div class="prompt input_prompt">In&nbsp;[8]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_ldst</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&quot;poisson2d.ld_st.bin.csv&quot;</span><span class="p">,</span> <span class="n">skiprows</span><span class="o">=</span><span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">50000</span><span class="p">,</span> <span class="mi">2</span><span class="p">))</span>
-<span class="n">common</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="n">df_ldst</span><span class="p">,</span> <span class="s2">&quot;PM_LD_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Loads / Loop Iteration&quot;</span><span class="p">)</span>
-<span class="n">common</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="n">df_ldst</span><span class="p">,</span> <span class="s2">&quot;PM_ST_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Stores / Loop Iteration&quot;</span><span class="p">)</span>
+<span class="n">df_ldst</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df_ldst</span><span class="p">[</span><span class="s2">&quot;nx&quot;</span><span class="p">]</span> <span class="o">*</span> <span class="n">df_ldst</span><span class="p">[</span><span class="s2">&quot;ny&quot;</span><span class="p">]</span> 
 <span class="n">df_ldst</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
 </pre></div>
 
@@ -14632,7 +14855,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv .
 
 <div class="output_area">
 
-    <div class="prompt output_prompt">Out[6]:</div>
+    <div class="prompt output_prompt">Out[8]:</div>
 
 
 
@@ -14665,8 +14888,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv .
       <th>PM_ST_CMPL (total)</th>
       <th>PM_ST_CMPL (min)</th>
       <th>PM_ST_CMPL (max)</th>
-      <th>Loads / Loop Iteration</th>
-      <th>Stores / Loop Iteration</th>
+      <th>Grid Points</th>
     </tr>
   </thead>
   <tbody>
@@ -14676,29 +14898,27 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv .
       <td>32</td>
       <td>4</td>
       <td>0.0012</td>
-      <td>95115</td>
-      <td>474</td>
-      <td>789</td>
-      <td>21343</td>
-      <td>106</td>
-      <td>249</td>
-      <td>3.703125</td>
-      <td>0.828125</td>
+      <td>119819</td>
+      <td>598</td>
+      <td>817</td>
+      <td>32902</td>
+      <td>164</td>
+      <td>266</td>
+      <td>128</td>
     </tr>
     <tr>
       <th>1</th>
       <td>200</td>
       <td>32</td>
       <td>8</td>
-      <td>0.0014</td>
-      <td>137115</td>
-      <td>684</td>
-      <td>999</td>
-      <td>33343</td>
-      <td>166</td>
-      <td>309</td>
-      <td>2.671875</td>
-      <td>0.648438</td>
+      <td>0.0013</td>
+      <td>161819</td>
+      <td>808</td>
+      <td>1027</td>
+      <td>56902</td>
+      <td>284</td>
+      <td>386</td>
+      <td>256</td>
     </tr>
     <tr>
       <th>2</th>
@@ -14706,14 +14926,13 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv .
       <td>32</td>
       <td>12</td>
       <td>0.0014</td>
-      <td>197115</td>
-      <td>984</td>
-      <td>1299</td>
-      <td>45343</td>
-      <td>226</td>
-      <td>369</td>
-      <td>2.562500</td>
-      <td>0.588542</td>
+      <td>221819</td>
+      <td>1108</td>
+      <td>1327</td>
+      <td>71902</td>
+      <td>359</td>
+      <td>461</td>
+      <td>384</td>
     </tr>
     <tr>
       <th>3</th>
@@ -14721,29 +14940,27 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv .
       <td>32</td>
       <td>16</td>
       <td>0.0015</td>
-      <td>257115</td>
-      <td>1284</td>
-      <td>1599</td>
-      <td>63343</td>
-      <td>316</td>
-      <td>459</td>
-      <td>2.507812</td>
-      <td>0.617188</td>
+      <td>281819</td>
+      <td>1408</td>
+      <td>1627</td>
+      <td>86902</td>
+      <td>434</td>
+      <td>536</td>
+      <td>512</td>
     </tr>
     <tr>
       <th>4</th>
       <td>200</td>
       <td>32</td>
       <td>20</td>
-      <td>0.0016</td>
-      <td>317115</td>
-      <td>1584</td>
-      <td>1899</td>
-      <td>75343</td>
-      <td>376</td>
-      <td>519</td>
-      <td>2.475000</td>
-      <td>0.587500</td>
+      <td>0.0015</td>
+      <td>341819</td>
+      <td>1708</td>
+      <td>1927</td>
+      <td>101902</td>
+      <td>509</td>
+      <td>611</td>
+      <td>640</td>
     </tr>
   </tbody>
 </table>
@@ -14758,12 +14975,111 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv .
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[79]:</div>
+<div class="prompt input_prompt">In&nbsp;[9]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">fig</span><span class="p">,</span> <span class="p">(</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">)</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">sharex</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+<span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_LD_CMPL (min)&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax1</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+<span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_ST_CMPL (min)&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax2</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+
+
+<div class="output_png output_subarea ">
+<img src="
+"
+>
+</div>
+
+</div>
+
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>This behaviour also looks – at first glance – linear. We can again fit a first-order polynomial (and re-use our previously defined function <code>curve_fit</code>)!</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[29]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">_fit</span><span class="p">,</span> <span class="n">_cov</span> <span class="o">=</span> <span class="n">common</span><span class="o">.</span><span class="n">print_and_return_fit</span><span class="p">(</span>
+    <span class="p">[</span><span class="s2">&quot;PM_LD_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;PM_ST_CMPL (min)&quot;</span><span class="p">],</span> 
+    <span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">),</span> 
+    <span class="n">linear_function</span><span class="p">,</span>
+    <span class="n">format_value</span><span class="o">=</span><span class="s2">&quot;.4f&quot;</span>
+<span class="p">)</span>
+<span class="n">fit_parameters</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">fit_parameters</span><span class="p">,</span> <span class="o">**</span><span class="n">_fit</span><span class="p">}</span>
+<span class="n">fit_covariance</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">fit_covariance</span><span class="p">,</span> <span class="o">**</span><span class="n">_cov</span><span class="p">}</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>Counter PM_LD_CMPL (min) is proportional to the grid points (nx*ny) by a factor of 2.3437 (± 0.000037)
+Counter PM_ST_CMPL (min) is proportional to the grid points (nx*ny) by a factor of 0.5860 (± 0.000019)
+</pre>
+</div>
+</div>
+
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>Let's overlay this in one common plot:</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[28]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">fig</span><span class="p">,</span> <span class="p">(</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">)</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">sharex</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
-<span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Loads / Loop Iteration&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax1</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
-<span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Stores / Loop Iteration&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax2</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+<span class="k">for</span> <span class="n">ax</span><span class="p">,</span> <span class="n">pmu_counter</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">([</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">],</span> <span class="p">[</span><span class="s2">&quot;PM_LD_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;PM_ST_CMPL (min)&quot;</span><span class="p">]):</span>
+    <span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="n">pmu_counter</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+    <span class="n">ax</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span>
+        <span class="n">df_ldst</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">],</span> 
+        <span class="n">linear_function</span><span class="p">(</span><span class="n">df_ldst</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">],</span> <span class="o">*</span><span class="n">fit_parameters</span><span class="p">[</span><span class="n">pmu_counter</span><span class="p">]),</span> 
+        <span class="n">linestyle</span><span class="o">=</span><span class="s2">&quot;--&quot;</span><span class="p">,</span> 
+        <span class="n">label</span><span class="o">=</span><span class="s2">&quot;Fit: </span><span class="si">{:.2f}</span><span class="s2"> * x + </span><span class="si">{:.2f}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="o">*</span><span class="n">fit_parameters</span><span class="p">[</span><span class="n">pmu_counter</span><span class="p">])</span>
+    <span class="p">)</span>
+    <span class="n">ax</span><span class="o">.</span><span class="n">legend</span><span class="p">();</span>
 </pre></div>
 
     </div>
@@ -14782,7 +15098,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv .
 
 
 <div class="output_png output_subarea ">
-<img src="
+<img src="
 "
 >
 </div>
@@ -14797,8 +15113,9 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv .
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
 <p>Did you expect more?</p>
-<p>The reason is simple: Among the load and store instructions counted by <code>PM_LD_CMPL</code> and <code>PM_ST_CMPL</code> are vector instructions which can load and store multiple (two) values at a time. To see how many <em>bytes</em> are loaded and stored, we need to measure counters for vectorized loads and stores as well.</p>
-<p><a name="task2-b"></a><strong>TASK B</strong>: Please measure counters for <em>vectorized</em> loads and <em>vectorized</em> stores. See the TODOs in <a href="/edit/Tasks/poisson2d.vld.c"><code>poisson2d.vld.c</code></a> and <a href="/edit/Tasks/poisson2d.vst.c"><code>poisson2d.vst.c</code></a> (<em>Note: These vector counters can not be measured together and need separate files and runs</em>). Can you find out the name of the counters yourself, using <code>papi_native_avail | grep VECTOR_</code>?</p>
+<p>The reason is simple: Among the load and store instructions counted by <code>PM_LD_CMPL</code> and <code>PM_ST_CMPL</code> are vector instructions which can load and store multiple (in this case: two) values at a time. To see how many <em>bytes</em> are loaded and stored, we need to measure counters for vectorized loads and stores as well.</p>
+<h3 id="TASK-B">TASK B<a class="anchor-link" href="#TASK-B">&#182;</a></h3><p><a name="task2-b"></a></p>
+<p>Please measure counters for <em>vectorized</em> loads and <em>vectorized</em> stores. See the TODOs in <a href="poisson2d.vld.c"><code>poisson2d.vld.c</code></a> and <a href="poisson2d.vst.c"><code>poisson2d.vst.c</code></a> (<em>Note: These vector counters cannot be measured together and need separate files and runs</em>). Can you find out the names of the counters yourself, using <code>papi_native_avail | grep VECTOR_</code>?</p>
 <p>Compile, test, and bench-run your program again.</p>
 <p><a href="#toc">Back to top</a></p>
 
@@ -14807,7 +15124,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv .
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[2]:</div>
+<div class="prompt input_prompt">In&nbsp;[9]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="o">!</span>papi_native_avail <span class="p">|</span> grep VECTOR_
@@ -14827,9 +15144,9 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv .
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>| PM_VECTOR_FLOP_CMPL                                                          |
-| PM_VECTOR_LD_CMPL                                                            |
-| PM_VECTOR_ST_CMPL                                                            |
+<pre>| PM_VECTOR_FLOP_CMPL                                                          |
+| PM_VECTOR_LD_CMPL                                                            |
+| PM_VECTOR_ST_CMPL                                                            |
 </pre>
 </div>
 </div>
@@ -14848,7 +15165,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv .
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[3]:</div>
+<div class="prompt input_prompt">In&nbsp;[1]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="o">!</span>make bench_task3
@@ -14868,8 +15185,8 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv .
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>bsub -W 60 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vld.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.vld.bin.csv
-Job &lt;4097&gt; is submitted to default queue &lt;batch&gt;.
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vld.bin /gpfs/wolf/trn003/scratch/aherten//poisson2d.vld.bin.csv
+Job &lt;24641&gt; is submitted to default queue &lt;batch&gt;.
 &lt;&lt;Waiting for dispatch ...&gt;&gt;
 &lt;&lt;Starting on login1&gt;&gt;
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
@@ -14879,9 +15196,9 @@ iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,12,0.0012,174000,870,870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,16,0.0013,234000,1170,1170
+200,32,16,0.0012,234000,1170,1170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,20,0.0014,294000,1470,1470
+200,32,20,0.0013,294000,1470,1470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,24,0.0014,354000,1770,1770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
@@ -14895,11 +15212,11 @@ iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,44,0.0017,654000,3270,3270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,48,0.0017,714000,3570,3570
+200,32,48,0.0018,714000,3570,3570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,52,0.0018,774000,3870,3870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,56,0.0020,834000,4170,4170
+200,32,56,0.0019,834000,4170,4170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,60,0.0020,894000,4470,4470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
@@ -14909,117 +15226,117 @@ iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,72,0.0022,1074000,5370,5370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,76,0.0023,1134000,5670,5670
+200,32,76,0.0022,1134000,5670,5670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,80,0.0023,1194000,5970,5970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,84,0.0023,1254000,6270,6270
+200,32,84,0.0024,1254000,6270,6270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,88,0.0024,1314000,6570,6570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,92,0.0025,1374000,6870,6870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,96,0.0025,1434000,7170,7170
+200,32,96,0.0027,1434000,7170,7170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,100,0.0026,1494000,7470,7470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,104,0.0027,1554000,7770,7770
+200,32,104,0.0029,1554000,7770,7770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,108,0.0027,1614000,8070,8070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,112,0.0028,1674000,8370,8370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,116,0.0028,1734000,8670,8670
+200,32,116,0.0029,1734000,8670,8670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,120,0.0029,1794000,8970,8970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,124,0.0030,1854000,9270,9270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,128,0.0030,1914000,9570,9570
+200,32,128,0.0032,1914000,9570,9570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,132,0.0031,1974000,9870,9870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,136,0.0032,2034000,10170,10170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,140,0.0032,2094000,10470,10470
+200,32,140,0.0033,2094000,10470,10470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,144,0.0033,2154000,10770,10770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,148,0.0034,2214000,11070,11070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,152,0.0035,2274000,11370,11370
+200,32,152,0.0036,2274000,11370,11370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,156,0.0035,2334000,11670,11670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,160,0.0036,2394000,11970,11970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,164,0.0036,2454000,12270,12270
+200,32,164,0.0037,2454000,12270,12270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,168,0.0037,2514000,12570,12570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,172,0.0037,2574000,12870,12870
+200,32,172,0.0038,2574000,12870,12870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,176,0.0038,2634000,13170,13170
+200,32,176,0.0039,2634000,13170,13170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,180,0.0039,2694000,13470,13470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,184,0.0041,2754000,13770,13770
+200,32,184,0.0040,2754000,13770,13770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,188,0.0040,2814000,14070,14070
+200,32,188,0.0041,2814000,14070,14070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,192,0.0041,2874000,14370,14370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,196,0.0041,2934000,14670,14670
+200,32,196,0.0042,2934000,14670,14670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,200,0.0042,2994000,14970,14970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,204,0.0043,3054000,15270,15270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,208,0.0044,3114000,15570,15570
+200,32,208,0.0045,3114000,15570,15570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,212,0.0044,3174000,15870,15870
+200,32,212,0.0045,3174000,15870,15870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,216,0.0044,3234000,16170,16170
+200,32,216,0.0045,3234000,16170,16170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,220,0.0045,3294000,16470,16470
+200,32,220,0.0046,3294000,16470,16470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,224,0.0046,3354000,16770,16770
+200,32,224,0.0048,3354000,16770,16770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,228,0.0047,3414000,17070,17070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,232,0.0047,3474000,17370,17370
+200,32,232,0.0048,3474000,17370,17370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,236,0.0048,3534000,17670,17670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,240,0.0048,3594000,17970,17970
+200,32,240,0.0049,3594000,17970,17970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,244,0.0049,3654000,18270,18270
+200,32,244,0.0050,3654000,18270,18270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,248,0.0049,3714000,18570,18570
+200,32,248,0.0052,3714000,18570,18570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,252,0.0050,3774000,18870,18870
+200,32,252,0.0051,3774000,18870,18870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,256,0.0051,3834000,19170,19170
+200,32,256,0.0052,3834000,19170,19170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,260,0.0052,3894000,19470,19470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,264,0.0052,3954000,19770,19770
+200,32,264,0.0053,3954000,19770,19770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,268,0.0053,4014000,20070,20070
+200,32,268,0.0054,4014000,20070,20070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,272,0.0053,4074000,20370,20370
+200,32,272,0.0054,4074000,20370,20370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,276,0.0055,4134000,20670,20670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,280,0.0055,4194000,20970,20970
+200,32,280,0.0056,4194000,20970,20970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,284,0.0055,4254000,21270,21270
+200,32,284,0.0056,4254000,21270,21270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,288,0.0057,4314000,21570,21570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,292,0.0056,4374000,21870,21870
+200,32,292,0.0058,4374000,21870,21870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,296,0.0057,4434000,22170,22170
+200,32,296,0.0058,4434000,22170,22170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,300,0.0059,4494000,22470,22470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
@@ -15027,366 +15344,366 @@ iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,308,0.0060,4614000,23070,23070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,312,0.0060,4674000,23370,23370
+200,32,312,0.0061,4674000,23370,23370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,316,0.0061,4734000,23670,23670
+200,32,316,0.0062,4734000,23670,23670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,320,0.0061,4794000,23970,23970
+200,32,320,0.0062,4794000,23970,23970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,324,0.0062,4854000,24270,24270
+200,32,324,0.0063,4854000,24270,24270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,328,0.0062,4914000,24570,24570
+200,32,328,0.0063,4914000,24570,24570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,332,0.0063,4974000,24870,24870
+200,32,332,0.0064,4974000,24870,24870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,336,0.0063,5034000,25170,25170
+200,32,336,0.0065,5034000,25170,25170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,340,0.0066,5094000,25470,25470
+200,32,340,0.0065,5094000,25470,25470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,344,0.0065,5154000,25770,25770
+200,32,344,0.0066,5154000,25770,25770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,348,0.0067,5214000,26070,26070
+200,32,348,0.0069,5214000,26070,26070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,352,0.0068,5274000,26370,26370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,356,0.0067,5334000,26670,26670
+200,32,356,0.0070,5334000,26670,26670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,360,0.0067,5394000,26970,26970
+200,32,360,0.0069,5394000,26970,26970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,364,0.0068,5454000,27270,27270
+200,32,364,0.0070,5454000,27270,27270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,368,0.0069,5514000,27570,27570
+200,32,368,0.0070,5514000,27570,27570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,372,0.0069,5574000,27870,27870
+200,32,372,0.0071,5574000,27870,27870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,376,0.0070,5634000,28170,28170
+200,32,376,0.0073,5634000,28170,28170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,380,0.0071,5694000,28470,28470
+200,32,380,0.0073,5694000,28470,28470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,384,0.0071,5754000,28770,28770
+200,32,384,0.0073,5754000,28770,28770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,388,0.0073,5814000,29070,29070
+200,32,388,0.0074,5814000,29070,29070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,392,0.0074,5874000,29370,29370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,396,0.0073,5934000,29670,29670
+200,32,396,0.0076,5934000,29670,29670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,400,0.0074,5994000,29970,29970
+200,32,400,0.0075,5994000,29970,29970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,404,0.0074,6054000,30270,30270
+200,32,404,0.0076,6054000,30270,30270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,408,0.0075,6114000,30570,30570
+200,32,408,0.0077,6114000,30570,30570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,412,0.0076,6174000,30870,30870
+200,32,412,0.0078,6174000,30870,30870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,416,0.0076,6234000,31170,31170
+200,32,416,0.0079,6234000,31170,31170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,420,0.0080,6294000,31470,31470
+200,32,420,0.0079,6294000,31470,31470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,424,0.0079,6354000,31770,31770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,428,0.0078,6414000,32070,32070
+200,32,428,0.0080,6414000,32070,32070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,432,0.0079,6474000,32370,32370
+200,32,432,0.0080,6474000,32370,32370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,436,0.0080,6534000,32670,32670
+200,32,436,0.0081,6534000,32670,32670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,440,0.0080,6594000,32970,32970
+200,32,440,0.0082,6594000,32970,32970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,444,0.0083,6654000,33270,33270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,448,0.0082,6714000,33570,33570
+200,32,448,0.0084,6714000,33570,33570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,452,0.0082,6774000,33870,33870
+200,32,452,0.0084,6774000,33870,33870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,456,0.0083,6834000,34170,34170
+200,32,456,0.0084,6834000,34170,34170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,460,0.0086,6894000,34470,34470
+200,32,460,0.0085,6894000,34470,34470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,464,0.0084,6954000,34770,34770
+200,32,464,0.0086,6954000,34770,34770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,468,0.0085,7014000,35070,35070
+200,32,468,0.0087,7014000,35070,35070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,472,0.0086,7074000,35370,35370
+200,32,472,0.0088,7074000,35370,35370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,476,0.0086,7134000,35670,35670
+200,32,476,0.0088,7134000,35670,35670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,480,0.0087,7194000,35970,35970
+200,32,480,0.0089,7194000,35970,35970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,484,0.0088,7254000,36270,36270
+200,32,484,0.0090,7254000,36270,36270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,488,0.0088,7314000,36570,36570
+200,32,488,0.0091,7314000,36570,36570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,492,0.0089,7374000,36870,36870
+200,32,492,0.0091,7374000,36870,36870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,496,0.0091,7434000,37170,37170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,500,0.0092,7494000,37470,37470
+200,32,500,0.0094,7494000,37470,37470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,504,0.0091,7554000,37770,37770
+200,32,504,0.0093,7554000,37770,37770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,508,0.0092,7614000,38070,38070
+200,32,508,0.0095,7614000,38070,38070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,512,0.0092,7674000,38370,38370
+200,32,512,0.0096,7674000,38370,38370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,516,0.0093,7734000,38670,38670
+200,32,516,0.0095,7734000,38670,38670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,520,0.0093,7794000,38970,38970
+200,32,520,0.0095,7794000,38970,38970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,524,0.0094,7854000,39270,39270
+200,32,524,0.0097,7854000,39270,39270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
 200,32,528,0.0097,7914000,39570,39570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,532,0.0095,7974000,39870,39870
+200,32,532,0.0098,7974000,39870,39870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,536,0.0096,8034000,40170,40170
+200,32,536,0.0098,8034000,40170,40170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,540,0.0097,8094000,40470,40470
+200,32,540,0.0099,8094000,40470,40470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,544,0.0097,8154000,40770,40770
+200,32,544,0.0100,8154000,40770,40770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,548,0.0099,8214000,41070,41070
+200,32,548,0.0101,8214000,41070,41070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,552,0.0099,8274000,41370,41370
+200,32,552,0.0101,8274000,41370,41370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,556,0.0100,8334000,41670,41670
+200,32,556,0.0104,8334000,41670,41670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,560,0.0100,8394000,41970,41970
+200,32,560,0.0103,8394000,41970,41970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,564,0.0101,8454000,42270,42270
+200,32,564,0.0103,8454000,42270,42270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,568,0.0102,8514000,42570,42570
+200,32,568,0.0106,8514000,42570,42570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,572,0.0103,8574000,42870,42870
+200,32,572,0.0105,8574000,42870,42870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,576,0.0103,8634000,43170,43170
+200,32,576,0.0106,8634000,43170,43170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,580,0.0104,8694000,43470,43470
+200,32,580,0.0108,8694000,43470,43470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,584,0.0104,8754000,43770,43770
+200,32,584,0.0109,8754000,43770,43770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,588,0.0106,8814000,44070,44070
+200,32,588,0.0108,8814000,44070,44070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,592,0.0106,8874000,44370,44370
+200,32,592,0.0109,8874000,44370,44370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,596,0.0107,8934000,44670,44670
+200,32,596,0.0109,8934000,44670,44670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,600,0.0107,8994000,44970,44970
+200,32,600,0.0110,8994000,44970,44970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,604,0.0109,9054000,45270,45270
+200,32,604,0.0111,9054000,45270,45270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,608,0.0109,9114000,45570,45570
+200,32,608,0.0112,9114000,45570,45570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,612,0.0110,9174000,45870,45870
+200,32,612,0.0112,9174000,45870,45870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,616,0.0110,9234000,46170,46170
+200,32,616,0.0114,9234000,46170,46170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,620,0.0111,9294000,46470,46470
+200,32,620,0.0113,9294000,46470,46470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,624,0.0112,9354000,46770,46770
+200,32,624,0.0114,9354000,46770,46770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,628,0.0112,9414000,47070,47070
+200,32,628,0.0117,9414000,47070,47070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,632,0.0113,9474000,47370,47370
+200,32,632,0.0116,9474000,47370,47370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,636,0.0114,9534000,47670,47670
+200,32,636,0.0116,9534000,47670,47670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,640,0.0115,9594000,47970,47970
+200,32,640,0.0117,9594000,47970,47970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,644,0.0115,9654000,48270,48270
+200,32,644,0.0119,9654000,48270,48270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,648,0.0115,9714000,48570,48570
+200,32,648,0.0118,9714000,48570,48570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,652,0.0116,9774000,48870,48870
+200,32,652,0.0119,9774000,48870,48870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,656,0.0118,9834000,49170,49170
+200,32,656,0.0119,9834000,49170,49170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,660,0.0117,9894000,49470,49470
+200,32,660,0.0121,9894000,49470,49470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,664,0.0118,9954000,49770,49770
+200,32,664,0.0122,9954000,49770,49770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,668,0.0118,10014000,50070,50070
+200,32,668,0.0123,10014000,50070,50070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,672,0.0120,10074000,50370,50370
+200,32,672,0.0122,10074000,50370,50370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,676,0.0121,10134000,50670,50670
+200,32,676,0.0123,10134000,50670,50670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,680,0.0120,10194000,50970,50970
+200,32,680,0.0123,10194000,50970,50970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,684,0.0121,10254000,51270,51270
+200,32,684,0.0125,10254000,51270,51270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,688,0.0123,10314000,51570,51570
+200,32,688,0.0125,10314000,51570,51570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,692,0.0122,10374000,51870,51870
+200,32,692,0.0127,10374000,51870,51870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,696,0.0123,10434000,52170,52170
+200,32,696,0.0126,10434000,52170,52170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,700,0.0124,10494000,52470,52470
+200,32,700,0.0127,10494000,52470,52470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,704,0.0124,10554000,52770,52770
+200,32,704,0.0128,10554000,52770,52770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,708,0.0125,10614000,53070,53070
+200,32,708,0.0129,10614000,53070,53070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,712,0.0126,10674000,53370,53370
+200,32,712,0.0128,10674000,53370,53370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,716,0.0126,10734000,53670,53670
+200,32,716,0.0131,10734000,53670,53670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,720,0.0126,10794000,53970,53970
+200,32,720,0.0130,10794000,53970,53970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,724,0.0128,10854000,54270,54270
+200,32,724,0.0130,10854000,54270,54270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,728,0.0128,10914000,54570,54570
+200,32,728,0.0132,10914000,54570,54570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,732,0.0129,10974000,54870,54870
+200,32,732,0.0133,10974000,54870,54870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,736,0.0130,11034000,55170,55170
+200,32,736,0.0135,11034000,55170,55170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,740,0.0130,11094000,55470,55470
+200,32,740,0.0135,11094000,55470,55470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,744,0.0130,11154000,55770,55770
+200,32,744,0.0135,11154000,55770,55770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,748,0.0131,11214000,56070,56070
+200,32,748,0.0134,11214000,56070,56070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,752,0.0132,11274000,56370,56370
+200,32,752,0.0135,11274000,56370,56370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,756,0.0133,11334000,56670,56670
+200,32,756,0.0136,11334000,56670,56670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,760,0.0134,11394000,56970,56970
+200,32,760,0.0137,11394000,56970,56970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,764,0.0134,11454000,57270,57270
+200,32,764,0.0137,11454000,57270,57270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,768,0.0135,11514000,57570,57570
+200,32,768,0.0138,11514000,57570,57570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,772,0.0135,11574000,57870,57870
+200,32,772,0.0139,11574000,57870,57870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,776,0.0136,11634000,58170,58170
+200,32,776,0.0141,11634000,58170,58170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,780,0.0138,11694000,58470,58470
+200,32,780,0.0140,11694000,58470,58470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,784,0.0138,11754000,58770,58770
+200,32,784,0.0142,11754000,58770,58770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,788,0.0139,11814000,59070,59070
+200,32,788,0.0141,11814000,59070,59070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,792,0.0139,11874000,59370,59370
+200,32,792,0.0142,11874000,59370,59370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,796,0.0141,11934000,59670,59670
+200,32,796,0.0143,11934000,59670,59670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,800,0.0140,11994000,59970,59970
+200,32,800,0.0143,11994000,59970,59970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,804,0.0141,12054000,60270,60270
+200,32,804,0.0145,12054000,60270,60270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,808,0.0142,12114000,60570,60570
+200,32,808,0.0145,12114000,60570,60570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,812,0.0143,12174000,60870,60870
+200,32,812,0.0145,12174000,60870,60870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,816,0.0143,12234000,61170,61170
+200,32,816,0.0148,12234000,61170,61170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,820,0.0143,12294000,61470,61470
+200,32,820,0.0148,12294000,61470,61470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,824,0.0144,12354000,61770,61770
+200,32,824,0.0148,12354000,61770,61770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,828,0.0145,12414000,62070,62070
+200,32,828,0.0148,12414000,62070,62070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,832,0.0145,12474000,62370,62370
+200,32,832,0.0149,12474000,62370,62370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,836,0.0146,12534000,62670,62670
+200,32,836,0.0150,12534000,62670,62670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,840,0.0146,12594000,62970,62970
+200,32,840,0.0150,12594000,62970,62970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,844,0.0147,12654000,63270,63270
+200,32,844,0.0151,12654000,63270,63270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,848,0.0148,12714000,63570,63570
+200,32,848,0.0153,12714000,63570,63570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,852,0.0149,12774000,63870,63870
+200,32,852,0.0153,12774000,63870,63870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,856,0.0150,12834000,64170,64170
+200,32,856,0.0153,12834000,64170,64170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,860,0.0150,12894000,64470,64470
+200,32,860,0.0154,12894000,64470,64470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,864,0.0151,12954000,64770,64770
+200,32,864,0.0154,12954000,64770,64770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,868,0.0152,13014000,65070,65070
+200,32,868,0.0155,13014000,65070,65070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,872,0.0151,13074000,65370,65370
+200,32,872,0.0157,13074000,65370,65370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,876,0.0152,13134000,65670,65670
+200,32,876,0.0156,13134000,65670,65670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,880,0.0154,13194000,65970,65970
+200,32,880,0.0157,13194000,65970,65970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,884,0.0154,13254000,66270,66270
+200,32,884,0.0157,13254000,66270,66270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,888,0.0154,13314000,66570,66570
+200,32,888,0.0158,13314000,66570,66570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,892,0.0155,13374000,66870,66870
+200,32,892,0.0159,13374000,66870,66870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,896,0.0156,13434000,67170,67170
+200,32,896,0.0160,13434000,67170,67170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,900,0.0158,13494000,67470,67470
+200,32,900,0.0160,13494000,67470,67470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,904,0.0158,13554000,67770,67770
+200,32,904,0.0162,13554000,67770,67770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,908,0.0159,13614000,68070,68070
+200,32,908,0.0162,13614000,68070,68070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,912,0.0161,13674000,68370,68370
+200,32,912,0.0163,13674000,68370,68370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,916,0.0162,13734000,68670,68670
+200,32,916,0.0163,13734000,68670,68670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,920,0.0162,13794000,68970,68970
+200,32,920,0.0164,13794000,68970,68970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,924,0.0163,13854000,69270,69270
+200,32,924,0.0165,13854000,69270,69270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,928,0.0162,13914000,69570,69570
+200,32,928,0.0166,13914000,69570,69570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,932,0.0164,13974000,69870,69870
+200,32,932,0.0166,13974000,69870,69870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,936,0.0163,14034000,70170,70170
+200,32,936,0.0167,14034000,70170,70170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,940,0.0164,14094000,70470,70470
+200,32,940,0.0167,14094000,70470,70470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,944,0.0165,14154000,70770,70770
+200,32,944,0.0168,14154000,70770,70770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,948,0.0166,14214000,71070,71070
+200,32,948,0.0170,14214000,71070,71070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,952,0.0166,14274000,71370,71370
+200,32,952,0.0171,14274000,71370,71370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,956,0.0170,14334000,71670,71670
+200,32,956,0.0171,14334000,71670,71670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,960,0.0168,14394000,71970,71970
+200,32,960,0.0171,14394000,71970,71970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,964,0.0174,14454000,72270,72270
+200,32,964,0.0175,14454000,72270,72270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,968,0.0172,14514000,72570,72570
+200,32,968,0.0176,14514000,72570,72570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,972,0.0173,14574000,72870,72870
+200,32,972,0.0176,14574000,72870,72870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,976,0.0173,14634000,73170,73170
+200,32,976,0.0175,14634000,73170,73170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,980,0.0175,14694000,73470,73470
+200,32,980,0.0178,14694000,73470,73470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,984,0.0175,14754000,73770,73770
+200,32,984,0.0180,14754000,73770,73770
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,988,0.0176,14814000,74070,74070
+200,32,988,0.0178,14814000,74070,74070
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,992,0.0176,14874000,74370,74370
+200,32,992,0.0179,14874000,74370,74370
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,996,0.0178,14934000,74670,74670
+200,32,996,0.0181,14934000,74670,74670
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,1000,0.0179,14994000,74970,74970
+200,32,1000,0.0180,14994000,74970,74970
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,1004,0.0178,15054000,75270,75270
+200,32,1004,0.0182,15054000,75270,75270
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,1008,0.0179,15114000,75570,75570
+200,32,1008,0.0181,15114000,75570,75570
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,1012,0.0179,15174000,75870,75870
+200,32,1012,0.0183,15174000,75870,75870
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,1016,0.0181,15234000,76170,76170
+200,32,1016,0.0183,15234000,76170,76170
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,1020,0.0181,15294000,76470,76470
+200,32,1020,0.0186,15294000,76470,76470
 iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)
-200,32,1024,0.0179,15354000,76770,76770
-mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vld.bin.csv .
-bsub -W 60 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vst.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.vst.bin.csv
-Job &lt;4098&gt; is submitted to default queue &lt;batch&gt;.
+200,32,1024,0.0182,15354000,76770,76770
+mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.vld.bin.csv .
+bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vst.bin /gpfs/wolf/trn003/scratch/aherten//poisson2d.vst.bin.csv
+Job &lt;24642&gt; is submitted to default queue &lt;batch&gt;.
 &lt;&lt;Waiting for dispatch ...&gt;&gt;
 &lt;&lt;Starting on login1&gt;&gt;
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
@@ -15400,11 +15717,11 @@ iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,20,0.0013,54200,271,271
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,24,0.0014,66200,331,331
+200,32,24,0.0013,66200,331,331
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,28,0.0014,78200,391,391
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,32,0.0016,90200,451,451
+200,32,32,0.0015,90200,451,451
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,36,0.0015,102200,511,511
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
@@ -15420,109 +15737,109 @@ iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,60,0.0020,174200,871,871
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,64,0.0022,186200,931,931
+200,32,64,0.0020,186200,931,931
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,68,0.0022,198200,991,991
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,72,0.0021,210200,1051,1051
+200,32,72,0.0023,210200,1051,1051
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,76,0.0023,222200,1111,1111
+200,32,76,0.0022,222200,1111,1111
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,80,0.0023,234200,1171,1171
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,84,0.0023,246200,1231,1231
+200,32,84,0.0024,246200,1231,1231
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,88,0.0024,258200,1291,1291
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,92,0.0025,270200,1351,1351
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,96,0.0027,282200,1411,1411
+200,32,96,0.0025,282200,1411,1411
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,100,0.0026,294200,1471,1471
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,104,0.0027,306200,1531,1531
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,108,0.0027,318200,1591,1591
+200,32,108,0.0028,318200,1591,1591
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,112,0.0028,330200,1651,1651
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,116,0.0028,342200,1711,1711
+200,32,116,0.0029,342200,1711,1711
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,120,0.0030,354200,1771,1771
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,124,0.0030,366200,1831,1831
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,128,0.0030,378200,1891,1891
+200,32,128,0.0031,378200,1891,1891
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,132,0.0032,390200,1951,1951
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,136,0.0032,402200,2011,2011
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,140,0.0032,414200,2071,2071
+200,32,140,0.0033,414200,2071,2071
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,144,0.0033,426200,2131,2131
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,148,0.0033,438200,2191,2191
+200,32,148,0.0035,438200,2191,2191
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,152,0.0034,450200,2251,2251
+200,32,152,0.0035,450200,2251,2251
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,156,0.0035,462200,2311,2311
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,160,0.0036,474200,2371,2371
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,164,0.0036,486200,2431,2431
+200,32,164,0.0038,486200,2431,2431
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,168,0.0037,498200,2491,2491
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,172,0.0037,510200,2551,2551
+200,32,172,0.0038,510200,2551,2551
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,176,0.0039,522200,2611,2611
+200,32,176,0.0038,522200,2611,2611
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,180,0.0039,534200,2671,2671
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,184,0.0039,546200,2731,2731
+200,32,184,0.0040,546200,2731,2731
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,188,0.0040,558200,2791,2791
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,192,0.0040,570200,2851,2851
+200,32,192,0.0041,570200,2851,2851
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,196,0.0041,582200,2911,2911
+200,32,196,0.0042,582200,2911,2911
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,200,0.0042,594200,2971,2971
+200,32,200,0.0044,594200,2971,2971
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,204,0.0042,606200,3031,3031
+200,32,204,0.0043,606200,3031,3031
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,208,0.0043,618200,3091,3091
+200,32,208,0.0044,618200,3091,3091
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,212,0.0044,630200,3151,3151
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,216,0.0044,642200,3211,3211
+200,32,216,0.0045,642200,3211,3211
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,220,0.0046,654200,3271,3271
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,224,0.0046,666200,3331,3331
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,228,0.0046,678200,3391,3391
+200,32,228,0.0047,678200,3391,3391
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,232,0.0047,690200,3451,3451
+200,32,232,0.0048,690200,3451,3451
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,236,0.0047,702200,3511,3511
+200,32,236,0.0048,702200,3511,3511
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,240,0.0048,714200,3571,3571
+200,32,240,0.0049,714200,3571,3571
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,244,0.0049,726200,3631,3631
+200,32,244,0.0050,726200,3631,3631
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,248,0.0049,738200,3691,3691
+200,32,248,0.0050,738200,3691,3691
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,252,0.0050,750200,3751,3751
+200,32,252,0.0051,750200,3751,3751
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,256,0.0051,762200,3811,3811
+200,32,256,0.0052,762200,3811,3811
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,260,0.0051,774200,3871,3871
+200,32,260,0.0052,774200,3871,3871
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,264,0.0053,786200,3931,3931
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,268,0.0053,798200,3991,3991
+200,32,268,0.0054,798200,3991,3991
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,272,0.0054,810200,4051,4051
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
@@ -15530,378 +15847,378 @@ iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,280,0.0055,834200,4171,4171
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,284,0.0055,846200,4231,4231
+200,32,284,0.0056,846200,4231,4231
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,288,0.0056,858200,4291,4291
+200,32,288,0.0057,858200,4291,4291
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,292,0.0057,870200,4351,4351
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,296,0.0057,882200,4411,4411
+200,32,296,0.0058,882200,4411,4411
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,300,0.0058,894200,4471,4471
+200,32,300,0.0059,894200,4471,4471
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,304,0.0058,906200,4531,4531
+200,32,304,0.0059,906200,4531,4531
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,308,0.0059,918200,4591,4591
+200,32,308,0.0060,918200,4591,4591
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,312,0.0060,930200,4651,4651
+200,32,312,0.0061,930200,4651,4651
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,316,0.0060,942200,4711,4711
+200,32,316,0.0061,942200,4711,4711
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,320,0.0061,954200,4771,4771
+200,32,320,0.0062,954200,4771,4771
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,324,0.0061,966200,4831,4831
+200,32,324,0.0063,966200,4831,4831
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,328,0.0062,978200,4891,4891
+200,32,328,0.0063,978200,4891,4891
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,332,0.0063,990200,4951,4951
+200,32,332,0.0064,990200,4951,4951
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,336,0.0063,1002200,5011,5011
+200,32,336,0.0065,1002200,5011,5011
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,340,0.0064,1014200,5071,5071
+200,32,340,0.0066,1014200,5071,5071
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,344,0.0065,1026200,5131,5131
+200,32,344,0.0066,1026200,5131,5131
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,348,0.0066,1038200,5191,5191
+200,32,348,0.0067,1038200,5191,5191
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,352,0.0066,1050200,5251,5251
+200,32,352,0.0069,1050200,5251,5251
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,356,0.0067,1062200,5311,5311
+200,32,356,0.0068,1062200,5311,5311
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,360,0.0067,1074200,5371,5371
+200,32,360,0.0068,1074200,5371,5371
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,364,0.0068,1086200,5431,5431
+200,32,364,0.0069,1086200,5431,5431
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,368,0.0068,1098200,5491,5491
+200,32,368,0.0070,1098200,5491,5491
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,372,0.0069,1110200,5551,5551
+200,32,372,0.0071,1110200,5551,5551
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,376,0.0070,1122200,5611,5611
+200,32,376,0.0071,1122200,5611,5611
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,380,0.0071,1134200,5671,5671
+200,32,380,0.0072,1134200,5671,5671
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,384,0.0072,1146200,5731,5731
+200,32,384,0.0073,1146200,5731,5731
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,388,0.0072,1158200,5791,5791
+200,32,388,0.0073,1158200,5791,5791
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,392,0.0072,1170200,5851,5851
+200,32,392,0.0074,1170200,5851,5851
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,396,0.0073,1182200,5911,5911
+200,32,396,0.0075,1182200,5911,5911
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,400,0.0074,1194200,5971,5971
+200,32,400,0.0075,1194200,5971,5971
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,404,0.0074,1206200,6031,6031
+200,32,404,0.0076,1206200,6031,6031
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,408,0.0076,1218200,6091,6091
+200,32,408,0.0077,1218200,6091,6091
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,412,0.0076,1230200,6151,6151
+200,32,412,0.0077,1230200,6151,6151
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,416,0.0077,1242200,6211,6211
+200,32,416,0.0080,1242200,6211,6211
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,420,0.0077,1254200,6271,6271
+200,32,420,0.0078,1254200,6271,6271
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,424,0.0078,1266200,6331,6331
+200,32,424,0.0079,1266200,6331,6331
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,428,0.0078,1278200,6391,6391
+200,32,428,0.0080,1278200,6391,6391
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,432,0.0080,1290200,6451,6451
+200,32,432,0.0081,1290200,6451,6451
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,436,0.0079,1302200,6511,6511
+200,32,436,0.0082,1302200,6511,6511
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,440,0.0081,1314200,6571,6571
+200,32,440,0.0082,1314200,6571,6571
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,444,0.0081,1326200,6631,6631
+200,32,444,0.0083,1326200,6631,6631
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,448,0.0082,1338200,6691,6691
+200,32,448,0.0083,1338200,6691,6691
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,452,0.0082,1350200,6751,6751
+200,32,452,0.0084,1350200,6751,6751
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,456,0.0084,1362200,6811,6811
+200,32,456,0.0085,1362200,6811,6811
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,460,0.0084,1374200,6871,6871
+200,32,460,0.0085,1374200,6871,6871
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,464,0.0084,1386200,6931,6931
+200,32,464,0.0087,1386200,6931,6931
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,468,0.0085,1398200,6991,6991
+200,32,468,0.0086,1398200,6991,6991
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,472,0.0085,1410200,7051,7051
+200,32,472,0.0087,1410200,7051,7051
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,476,0.0086,1422200,7111,7111
+200,32,476,0.0088,1422200,7111,7111
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,480,0.0087,1434200,7171,7171
+200,32,480,0.0090,1434200,7171,7171
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,484,0.0088,1446200,7231,7231
+200,32,484,0.0089,1446200,7231,7231
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,488,0.0088,1458200,7291,7291
+200,32,488,0.0090,1458200,7291,7291
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,492,0.0089,1470200,7351,7351
+200,32,492,0.0092,1470200,7351,7351
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,496,0.0089,1482200,7411,7411
+200,32,496,0.0092,1482200,7411,7411
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,500,0.0090,1494200,7471,7471
+200,32,500,0.0092,1494200,7471,7471
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,504,0.0092,1506200,7531,7531
+200,32,504,0.0093,1506200,7531,7531
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,508,0.0093,1518200,7591,7591
+200,32,508,0.0094,1518200,7591,7591
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,512,0.0092,1530200,7651,7651
+200,32,512,0.0095,1530200,7651,7651
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,516,0.0093,1542200,7711,7711
+200,32,516,0.0096,1542200,7711,7711
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,520,0.0094,1554200,7771,7771
+200,32,520,0.0096,1554200,7771,7771
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,524,0.0094,1566200,7831,7831
+200,32,524,0.0096,1566200,7831,7831
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,528,0.0094,1578200,7891,7891
+200,32,528,0.0097,1578200,7891,7891
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
 200,32,532,0.0097,1590200,7951,7951
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,536,0.0096,1602200,8011,8011
+200,32,536,0.0098,1602200,8011,8011
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,540,0.0097,1614200,8071,8071
+200,32,540,0.0100,1614200,8071,8071
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,544,0.0097,1626200,8131,8131
+200,32,544,0.0099,1626200,8131,8131
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,548,0.0099,1638200,8191,8191
+200,32,548,0.0100,1638200,8191,8191
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,552,0.0099,1650200,8251,8251
+200,32,552,0.0101,1650200,8251,8251
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,556,0.0101,1662200,8311,8311
+200,32,556,0.0102,1662200,8311,8311
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,560,0.0100,1674200,8371,8371
+200,32,560,0.0102,1674200,8371,8371
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,564,0.0101,1686200,8431,8431
+200,32,564,0.0105,1686200,8431,8431
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,568,0.0102,1698200,8491,8491
+200,32,568,0.0104,1698200,8491,8491
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,572,0.0103,1710200,8551,8551
+200,32,572,0.0105,1710200,8551,8551
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,576,0.0103,1722200,8611,8611
+200,32,576,0.0105,1722200,8611,8611
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,580,0.0104,1734200,8671,8671
+200,32,580,0.0108,1734200,8671,8671
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,584,0.0104,1746200,8731,8731
+200,32,584,0.0108,1746200,8731,8731
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,588,0.0105,1758200,8791,8791
+200,32,588,0.0109,1758200,8791,8791
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,592,0.0107,1770200,8851,8851
+200,32,592,0.0109,1770200,8851,8851
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,596,0.0108,1782200,8911,8911
+200,32,596,0.0109,1782200,8911,8911
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,600,0.0107,1794200,8971,8971
+200,32,600,0.0111,1794200,8971,8971
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,604,0.0109,1806200,9031,9031
+200,32,604,0.0111,1806200,9031,9031
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,608,0.0109,1818200,9091,9091
+200,32,608,0.0112,1818200,9091,9091
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,612,0.0109,1830200,9151,9151
+200,32,612,0.0112,1830200,9151,9151
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,616,0.0110,1842200,9211,9211
+200,32,616,0.0114,1842200,9211,9211
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,620,0.0111,1854200,9271,9271
+200,32,620,0.0113,1854200,9271,9271
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,624,0.0112,1866200,9331,9331
+200,32,624,0.0114,1866200,9331,9331
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,628,0.0111,1878200,9391,9391
+200,32,628,0.0114,1878200,9391,9391
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,632,0.0112,1890200,9451,9451
+200,32,632,0.0116,1890200,9451,9451
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,636,0.0113,1902200,9511,9511
+200,32,636,0.0116,1902200,9511,9511
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,640,0.0116,1914200,9571,9571
+200,32,640,0.0117,1914200,9571,9571
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,644,0.0114,1926200,9631,9631
+200,32,644,0.0118,1926200,9631,9631
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,648,0.0115,1938200,9691,9691
+200,32,648,0.0118,1938200,9691,9691
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,652,0.0117,1950200,9751,9751
+200,32,652,0.0121,1950200,9751,9751
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,656,0.0117,1962200,9811,9811
+200,32,656,0.0121,1962200,9811,9811
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,660,0.0117,1974200,9871,9871
+200,32,660,0.0121,1974200,9871,9871
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,664,0.0118,1986200,9931,9931
+200,32,664,0.0121,1986200,9931,9931
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,668,0.0119,1998200,9991,9991
+200,32,668,0.0122,1998200,9991,9991
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,672,0.0120,2010200,10051,10051
+200,32,672,0.0122,2010200,10051,10051
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,676,0.0120,2022200,10111,10111
+200,32,676,0.0124,2022200,10111,10111
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,680,0.0120,2034200,10171,10171
+200,32,680,0.0123,2034200,10171,10171
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,684,0.0121,2046200,10231,10231
+200,32,684,0.0124,2046200,10231,10231
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,688,0.0122,2058200,10291,10291
+200,32,688,0.0126,2058200,10291,10291
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,692,0.0123,2070200,10351,10351
+200,32,692,0.0127,2070200,10351,10351
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,696,0.0124,2082200,10411,10411
+200,32,696,0.0126,2082200,10411,10411
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,700,0.0124,2094200,10471,10471
+200,32,700,0.0128,2094200,10471,10471
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,704,0.0125,2106200,10531,10531
+200,32,704,0.0127,2106200,10531,10531
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,708,0.0125,2118200,10591,10591
+200,32,708,0.0128,2118200,10591,10591
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,712,0.0125,2130200,10651,10651
+200,32,712,0.0129,2130200,10651,10651
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,716,0.0125,2142200,10711,10711
+200,32,716,0.0130,2142200,10711,10711
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,720,0.0126,2154200,10771,10771
+200,32,720,0.0130,2154200,10771,10771
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,724,0.0127,2166200,10831,10831
+200,32,724,0.0131,2166200,10831,10831
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,728,0.0128,2178200,10891,10891
+200,32,728,0.0131,2178200,10891,10891
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,732,0.0128,2190200,10951,10951
+200,32,732,0.0132,2190200,10951,10951
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,736,0.0130,2202200,11011,11011
+200,32,736,0.0134,2202200,11011,11011
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,740,0.0130,2214200,11071,11071
+200,32,740,0.0134,2214200,11071,11071
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,744,0.0130,2226200,11131,11131
+200,32,744,0.0134,2226200,11131,11131
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,748,0.0131,2238200,11191,11191
+200,32,748,0.0135,2238200,11191,11191
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,752,0.0133,2250200,11251,11251
+200,32,752,0.0136,2250200,11251,11251
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,756,0.0133,2262200,11311,11311
+200,32,756,0.0136,2262200,11311,11311
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,760,0.0133,2274200,11371,11371
+200,32,760,0.0137,2274200,11371,11371
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,764,0.0134,2286200,11431,11431
+200,32,764,0.0138,2286200,11431,11431
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,768,0.0135,2298200,11491,11491
+200,32,768,0.0138,2298200,11491,11491
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,772,0.0137,2310200,11551,11551
+200,32,772,0.0139,2310200,11551,11551
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,776,0.0136,2322200,11611,11611
+200,32,776,0.0139,2322200,11611,11611
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,780,0.0137,2334200,11671,11671
+200,32,780,0.0140,2334200,11671,11671
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,784,0.0137,2346200,11731,11731
+200,32,784,0.0141,2346200,11731,11731
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,788,0.0138,2358200,11791,11791
+200,32,788,0.0142,2358200,11791,11791
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,792,0.0139,2370200,11851,11851
+200,32,792,0.0142,2370200,11851,11851
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,796,0.0140,2382200,11911,11911
+200,32,796,0.0144,2382200,11911,11911
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,800,0.0140,2394200,11971,11971
+200,32,800,0.0144,2394200,11971,11971
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,804,0.0141,2406200,12031,12031
+200,32,804,0.0144,2406200,12031,12031
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,808,0.0143,2418200,12091,12091
+200,32,808,0.0146,2418200,12091,12091
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,812,0.0142,2430200,12151,12151
+200,32,812,0.0146,2430200,12151,12151
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,816,0.0143,2442200,12211,12211
+200,32,816,0.0146,2442200,12211,12211
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,820,0.0144,2454200,12271,12271
+200,32,820,0.0147,2454200,12271,12271
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,824,0.0144,2466200,12331,12331
+200,32,824,0.0148,2466200,12331,12331
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,828,0.0145,2478200,12391,12391
+200,32,828,0.0149,2478200,12391,12391
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,832,0.0146,2490200,12451,12451
+200,32,832,0.0149,2490200,12451,12451
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,836,0.0146,2502200,12511,12511
+200,32,836,0.0150,2502200,12511,12511
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,840,0.0147,2514200,12571,12571
+200,32,840,0.0151,2514200,12571,12571
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,844,0.0148,2526200,12631,12631
+200,32,844,0.0152,2526200,12631,12631
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,848,0.0149,2538200,12691,12691
+200,32,848,0.0151,2538200,12691,12691
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,852,0.0149,2550200,12751,12751
+200,32,852,0.0152,2550200,12751,12751
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,856,0.0150,2562200,12811,12811
+200,32,856,0.0153,2562200,12811,12811
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,860,0.0152,2574200,12871,12871
+200,32,860,0.0154,2574200,12871,12871
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,864,0.0151,2586200,12931,12931
+200,32,864,0.0155,2586200,12931,12931
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,868,0.0151,2598200,12991,12991
+200,32,868,0.0155,2598200,12991,12991
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,872,0.0151,2610200,13051,13051
+200,32,872,0.0156,2610200,13051,13051
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,876,0.0152,2622200,13111,13111
+200,32,876,0.0156,2622200,13111,13111
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,880,0.0155,2634200,13171,13171
+200,32,880,0.0157,2634200,13171,13171
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,884,0.0154,2646200,13231,13231
+200,32,884,0.0158,2646200,13231,13231
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,888,0.0155,2658200,13291,13291
+200,32,888,0.0159,2658200,13291,13291
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,892,0.0155,2670200,13351,13351
+200,32,892,0.0159,2670200,13351,13351
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,896,0.0156,2682200,13411,13411
+200,32,896,0.0160,2682200,13411,13411
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,900,0.0157,2694200,13471,13471
+200,32,900,0.0160,2694200,13471,13471
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,904,0.0159,2706200,13531,13531
+200,32,904,0.0162,2706200,13531,13531
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,908,0.0160,2718200,13591,13591
+200,32,908,0.0162,2718200,13591,13591
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,912,0.0161,2730200,13651,13651
+200,32,912,0.0163,2730200,13651,13651
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,916,0.0162,2742200,13711,13711
+200,32,916,0.0163,2742200,13711,13711
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,920,0.0161,2754200,13771,13771
+200,32,920,0.0164,2754200,13771,13771
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,924,0.0162,2766200,13831,13831
+200,32,924,0.0165,2766200,13831,13831
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,928,0.0163,2778200,13891,13891
+200,32,928,0.0166,2778200,13891,13891
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,932,0.0165,2790200,13951,13951
+200,32,932,0.0168,2790200,13951,13951
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,936,0.0165,2802200,14011,14011
+200,32,936,0.0167,2802200,14011,14011
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,940,0.0165,2814200,14071,14071
+200,32,940,0.0169,2814200,14071,14071
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,944,0.0166,2826200,14131,14131
+200,32,944,0.0169,2826200,14131,14131
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,948,0.0166,2838200,14191,14191
+200,32,948,0.0169,2838200,14191,14191
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,952,0.0168,2850200,14251,14251
+200,32,952,0.0170,2850200,14251,14251
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,956,0.0167,2862200,14311,14311
+200,32,956,0.0170,2862200,14311,14311
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,960,0.0168,2874200,14371,14371
+200,32,960,0.0171,2874200,14371,14371
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,964,0.0173,2886200,14431,14431
+200,32,964,0.0175,2886200,14431,14431
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,968,0.0172,2898200,14491,14491
+200,32,968,0.0175,2898200,14491,14491
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,972,0.0172,2910200,14551,14551
+200,32,972,0.0176,2910200,14551,14551
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,976,0.0173,2922200,14611,14611
+200,32,976,0.0176,2922200,14611,14611
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,980,0.0175,2934200,14671,14671
+200,32,980,0.0178,2934200,14671,14671
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,984,0.0176,2946200,14731,14731
+200,32,984,0.0178,2946200,14731,14731
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,988,0.0176,2958200,14791,14791
+200,32,988,0.0179,2958200,14791,14791
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,992,0.0177,2970200,14851,14851
+200,32,992,0.0178,2970200,14851,14851
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,996,0.0178,2982200,14911,14911
+200,32,996,0.0181,2982200,14911,14911
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,1000,0.0177,2994200,14971,14971
+200,32,1000,0.0180,2994200,14971,14971
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,1004,0.0179,3006200,15031,15031
+200,32,1004,0.0181,3006200,15031,15031
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,1008,0.0179,3018200,15091,15091
+200,32,1008,0.0182,3018200,15091,15091
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,1012,0.0180,3030200,15151,15151
+200,32,1012,0.0183,3030200,15151,15151
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,1016,0.0180,3042200,15211,15211
+200,32,1016,0.0183,3042200,15211,15211
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,1020,0.0182,3054200,15271,15271
+200,32,1020,0.0184,3054200,15271,15271
 iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)
-200,32,1024,0.0178,3066200,15331,15331
-mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vst.bin.csv .
+200,32,1024,0.0182,3066200,15331,15331
+mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.vst.bin.csv .
 </pre>
 </div>
 </div>
@@ -15914,14 +16231,14 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vst.bin.csv .
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
 <p>Let's plot it again, as soon as the run finishes! Non-interactively, call <code>graph_task2b</code>.</p>
-<p><em>We need to read in two CSV files now, which we combine to one common dataframe <code>df_vldvst</code>.</em></p>
+<p><em>Because we couldn't measure the two vector counters at the same time, we have two CSV files to read in now. We combine them into one common dataframe <code>df_vldvst</code> in the following.</em></p>
 
 </div>
 </div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[8]:</div>
+<div class="prompt input_prompt">In&nbsp;[31]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_vld</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&quot;poisson2d.vld.bin.csv&quot;</span><span class="p">,</span> <span class="n">skiprows</span><span class="o">=</span><span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">50000</span><span class="p">,</span> <span class="mi">2</span><span class="p">))</span>
@@ -15936,11 +16253,10 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vst.bin.csv .
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[9]:</div>
+<div class="prompt input_prompt">In&nbsp;[32]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">common</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="n">df_vldvst</span><span class="p">,</span> <span class="s2">&quot;PM_VECTOR_LD_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Vector Loads / Loop Iteration&quot;</span><span class="p">)</span>
-<span class="n">common</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="n">df_vldvst</span><span class="p">,</span> <span class="s2">&quot;PM_VECTOR_ST_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Vector Stores / Loop Iteration&quot;</span><span class="p">)</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_vldvst</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df_vldvst</span><span class="p">[</span><span class="s2">&quot;nx&quot;</span><span class="p">]</span> <span class="o">*</span> <span class="n">df_vldvst</span><span class="p">[</span><span class="s2">&quot;ny&quot;</span><span class="p">]</span> 
 <span class="n">df_vldvst</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
 </pre></div>
 
@@ -15954,7 +16270,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vst.bin.csv .
 
 <div class="output_area">
 
-    <div class="prompt output_prompt">Out[9]:</div>
+    <div class="prompt output_prompt">Out[32]:</div>
 
 
 
@@ -15987,8 +16303,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vst.bin.csv .
       <th>PM_VECTOR_ST_CMPL (total)</th>
       <th>PM_VECTOR_ST_CMPL (min)</th>
       <th>PM_VECTOR_ST_CMPL (max)</th>
-      <th>Vector Loads / Loop Iteration</th>
-      <th>Vector Stores / Loop Iteration</th>
+      <th>Grid Points</th>
     </tr>
   </thead>
   <tbody>
@@ -16004,8 +16319,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vst.bin.csv .
       <td>200</td>
       <td>1</td>
       <td>1</td>
-      <td>0.000000</td>
-      <td>0.007812</td>
+      <td>128</td>
     </tr>
     <tr>
       <th>1</th>
@@ -16019,8 +16333,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vst.bin.csv .
       <td>18200</td>
       <td>91</td>
       <td>91</td>
-      <td>2.226562</td>
-      <td>0.355469</td>
+      <td>256</td>
     </tr>
     <tr>
       <th>2</th>
@@ -16034,38 +16347,35 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vst.bin.csv .
       <td>30200</td>
       <td>151</td>
       <td>151</td>
-      <td>2.265625</td>
-      <td>0.393229</td>
+      <td>384</td>
     </tr>
     <tr>
       <th>3</th>
       <td>16</td>
       <td>200</td>
       <td>32</td>
-      <td>0.0013</td>
+      <td>0.0012</td>
       <td>234000</td>
       <td>1170</td>
       <td>1170</td>
       <td>42200</td>
       <td>211</td>
       <td>211</td>
-      <td>2.285156</td>
-      <td>0.412109</td>
+      <td>512</td>
     </tr>
     <tr>
       <th>4</th>
       <td>20</td>
       <td>200</td>
       <td>32</td>
-      <td>0.0014</td>
+      <td>0.0013</td>
       <td>294000</td>
       <td>1470</td>
       <td>1470</td>
       <td>54200</td>
       <td>271</td>
       <td>271</td>
-      <td>2.296875</td>
-      <td>0.423438</td>
+      <td>640</td>
     </tr>
   </tbody>
 </table>
@@ -16080,12 +16390,103 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vst.bin.csv .
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[10]:</div>
+<div class="prompt input_prompt">In&nbsp;[33]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">fig</span><span class="p">,</span> <span class="p">(</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">)</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">sharex</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+<span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_VECTOR_LD_CMPL (min)&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax1</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+<span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_VECTOR_ST_CMPL (min)&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax2</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+
+
+<div class="output_png output_subarea ">
+<img src="
+"
+>
+</div>
+
+</div>
+
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>Here, too, there seems to be a linear correlation. Let's do our fitting and plot directly.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[34]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">_fit</span><span class="p">,</span> <span class="n">_cov</span> <span class="o">=</span> <span class="n">common</span><span class="o">.</span><span class="n">print_and_return_fit</span><span class="p">(</span>
+    <span class="p">[</span><span class="s2">&quot;PM_VECTOR_LD_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;PM_VECTOR_ST_CMPL (min)&quot;</span><span class="p">],</span> 
+    <span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">),</span> 
+    <span class="n">linear_function</span><span class="p">,</span>
+    <span class="n">format_value</span><span class="o">=</span><span class="s2">&quot;.4f&quot;</span><span class="p">,</span>
+<span class="p">)</span>
+<span class="n">fit_parameters</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">fit_parameters</span><span class="p">,</span> <span class="o">**</span><span class="n">_fit</span><span class="p">}</span>
+<span class="n">fit_covariance</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">fit_covariance</span><span class="p">,</span> <span class="o">**</span><span class="n">_cov</span><span class="p">}</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>Counter PM_VECTOR_LD_CMPL (min) is proportional to the grid points (nx*ny) by a factor of 2.3439 (± 0.000111)
+Counter PM_VECTOR_ST_CMPL (min) is proportional to the grid points (nx*ny) by a factor of 0.4688 (± 0.000012)
+</pre>
+</div>
+</div>
+
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[35]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">fig</span><span class="p">,</span> <span class="p">(</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">)</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">sharex</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
-<span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Vector Loads / Loop Iteration&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax1</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
-<span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Vector Stores / Loop Iteration&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax2</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+<span class="k">for</span> <span class="n">ax</span><span class="p">,</span> <span class="n">pmu_counter</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">([</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">],</span> <span class="p">[</span><span class="s2">&quot;PM_VECTOR_LD_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;PM_VECTOR_ST_CMPL (min)&quot;</span><span class="p">]):</span>
+    <span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="n">pmu_counter</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">ax</span><span class="o">=</span><span class="n">ax</span><span class="p">,</span> <span class="n">legend</span><span class="o">=</span><span class="kc">True</span><span class="p">);</span>
+    <span class="n">ax</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span>
+        <span class="n">df_vldvst</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">],</span> 
+        <span class="n">linear_function</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">],</span> <span class="o">*</span><span class="n">fit_parameters</span><span class="p">[</span><span class="n">pmu_counter</span><span class="p">]),</span> 
+        <span class="n">linestyle</span><span class="o">=</span><span class="s2">&quot;--&quot;</span><span class="p">,</span> 
+        <span class="n">label</span><span class="o">=</span><span class="s2">&quot;Fit: </span><span class="si">{:.2f}</span><span class="s2"> * x + </span><span class="si">{:.2f}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="o">*</span><span class="n">fit_parameters</span><span class="p">[</span><span class="n">pmu_counter</span><span class="p">])</span>
+    <span class="p">)</span>
+    <span class="n">ax</span><span class="o">.</span><span class="n">legend</span><span class="p">();</span>
 </pre></div>
 
     </div>
@@ -16104,7 +16505,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vst.bin.csv .
 
 
 <div class="output_png output_subarea ">
-<img src="
+<img src="
 "
 >
 </div>
@@ -16137,14 +16538,14 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[83]:</div>
+<div class="prompt input_prompt">In&nbsp;[37]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_byte</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">()</span>
-<span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Loads / Loop Iteration&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Vector Loads / Loop Iteration&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Loads / Loop Iteration&quot;</span><span class="p">])</span><span class="o">*</span><span class="mi">8</span>
-<span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Stores / Loop Iteration&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Vector Stores / Loop Iteration&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Stores / Loop Iteration&quot;</span><span class="p">])</span><span class="o">*</span><span class="mi">8</span>
+<span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Loads&quot;</span><span class="p">]</span>  <span class="o">=</span> <span class="p">(</span><span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_VECTOR_LD_CMPL (min)&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_LD_CMPL (min)&quot;</span><span class="p">])</span><span class="o">*</span><span class="mi">8</span>
+<span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Stores&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="n">df_vldvst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_VECTOR_ST_CMPL (min)&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="n">df_ldst</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_ST_CMPL (min)&quot;</span><span class="p">])</span><span class="o">*</span><span class="mi">8</span>
 <span class="n">ax</span> <span class="o">=</span> <span class="n">df_byte</span><span class="o">.</span><span class="n">plot</span><span class="p">()</span>
-<span class="n">ax</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s2">&quot;Bytes / Loop Iteration&quot;</span><span class="p">);</span>
+<span class="n">ax</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s2">&quot;Bytes&quot;</span><span class="p">);</span>
 </pre></div>
 
     </div>
@@ -16163,7 +16564,7 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 
 
 <div class="output_png output_subarea ">
-<img src="
+<img src="
 "
 >
 </div>
@@ -16173,16 +16574,27 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 </div>
 </div>
 
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>Let's quantify the difference by, again, fitting a linear function to the data.</p>
+
+</div>
+</div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[12]:</div>
+<div class="prompt input_prompt">In&nbsp;[38]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
-<span class="n">mean_byte_ld</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">polyfit</span><span class="p">(</span><span class="n">df_byte</span><span class="p">[</span><span class="n">df_byte</span><span class="o">.</span><span class="n">index</span> <span class="o">&gt;</span> <span class="mi">200</span><span class="p">]</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">df_byte</span><span class="p">[</span><span class="n">df_byte</span><span class="o">.</span><span class="n">index</span> <span class="o">&gt;</span> <span class="mi">200</span><span class="p">][</span><span class="s2">&quot;Loads / Loop Iteration&quot;</span><span class="p">],</span> <span class="mi">0</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
-<span class="n">mean_byte_st</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">polyfit</span><span class="p">(</span><span class="n">df_byte</span><span class="p">[</span><span class="n">df_byte</span><span class="o">.</span><span class="n">index</span> <span class="o">&gt;</span> <span class="mi">200</span><span class="p">]</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">df_byte</span><span class="p">[</span><span class="n">df_byte</span><span class="o">.</span><span class="n">index</span> <span class="o">&gt;</span> <span class="mi">200</span><span class="p">][</span><span class="s2">&quot;Stores / Loop Iteration&quot;</span><span class="p">],</span> <span class="mi">0</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
-<span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Mean byte loaded: </span><span class="si">{}</span><span class="se">\t</span><span class="s2">Mean byte stored: </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">mean_byte_ld</span><span class="p">,</span> <span class="n">mean_byte_st</span><span class="p">))</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">_fit</span><span class="p">,</span> <span class="n">_cov</span> <span class="o">=</span> <span class="n">common</span><span class="o">.</span><span class="n">print_and_return_fit</span><span class="p">(</span>
+    <span class="p">[</span><span class="s2">&quot;Loads&quot;</span><span class="p">,</span> <span class="s2">&quot;Stores&quot;</span><span class="p">],</span> 
+    <span class="n">df_byte</span><span class="p">,</span> 
+    <span class="n">linear_function</span>
+<span class="p">)</span>
+<span class="n">fit_parameters</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">fit_parameters</span><span class="p">,</span> <span class="o">**</span><span class="n">_fit</span><span class="p">}</span>
+<span class="n">fit_covariance</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">fit_covariance</span><span class="p">,</span> <span class="o">**</span><span class="n">_cov</span><span class="p">}</span>
 </pre></div>
 
     </div>
@@ -16199,7 +16611,8 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>Mean byte loaded: 37.52662546714877	Mean byte stored: 8.428951320998907
+<pre>Counter  Loads is proportional to the grid points (nx*ny) by a factor of 37.5010 (± 0.000592)
+Counter Stores is proportional to the grid points (nx*ny) by a factor of  8.4379 (± 0.000247)
 </pre>
 </div>
 </div>
@@ -16207,6 +16620,14 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 </div>
 </div>
 
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>Analogously to the proportionality factors, this much is loaded/stored per grid point.</p>
+
+</div>
+</div>
 </div>
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
@@ -16218,11 +16639,11 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[13]:</div>
+<div class="prompt input_prompt">In&nbsp;[50]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_bandwidth</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">()</span>
-<span class="n">df_bandwidth</span><span class="p">[</span><span class="s2">&quot;Bandwidth / Byte/Cycle&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Loads / Loop Iteration&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Stores / Loop Iteration&quot;</span><span class="p">])</span> <span class="o">/</span> <span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Cycles / Loop Iteration&quot;</span><span class="p">]</span>
+<span class="n">df_bandwidth</span><span class="p">[</span><span class="s2">&quot;Bandwidth / Byte/Cycle&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Loads&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Stores&quot;</span><span class="p">])</span> <span class="o">/</span> <span class="n">df</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;PM_RUN_CYC (min)&quot;</span><span class="p">]</span>
 </pre></div>
 
     </div>
@@ -16233,14 +16654,14 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<p>Let's display it as a function of <code>nx</code>. And also compare it to the available L1 cache bandwidth in a second (sub-)plot. Non-interactive users, call <code>make graph_task2c</code>.</p>
+<p>Let's display it as a function of grid points. And also compare it to the available L1 cache bandwidth in a second (sub-)plot. Non-interactive users, call <code>make graph_task2c</code>.</p>
 
 </div>
 </div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[15]:</div>
+<div class="prompt input_prompt">In&nbsp;[51]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">fig</span><span class="p">,</span> <span class="p">(</span><span class="n">ax1</span><span class="p">,</span> <span class="n">ax2</span><span class="p">)</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="n">nrows</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">sharex</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
@@ -16267,7 +16688,7 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 
 
 <div class="output_png output_subarea ">
-<img src="
+<img src="
 "
 >
 </div>
@@ -16291,7 +16712,7 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 <div class="text_cell_render border-box-sizing rendered_html">
 <h2 id="Task-E1:-Measuring-FlOps">Task E1: Measuring FlOps<a class="anchor-link" href="#Task-E1:-Measuring-FlOps">&#182;</a></h2><p><a name="taske1"></a></p>
 <p>If you still have time, feel free to work on the following extended task.</p>
-<p><strong>TASK</strong>: Please measure counters for <em>vectorized</em> floating point operations and <em>scalar</em> floating point operations. The two counters can also not be measured during the same run. So please see the TODOs in <a href="/edit/Tasks/poisson2d.sflops.c"><code>poisson2d.sflops.c</code></a> and <a href="/edit/Tasks/poisson2d.vflops.c"><code>poisson2d.vflops.c</code></a>. By now you should be able to find out the names of the counters by yourself (<em>Hint: they include the words scalar and vector…</em>).</p>
+<p><strong>TASK</strong>: Please measure counters for <em>vectorized</em> floating point operations and <em>scalar</em> floating point operations. The two counters also cannot be measured during the same run. So please see the TODOs in <a href="/edit/Tasks/poisson2d.sflops.c"><code>poisson2d.sflops.c</code></a> and <a href="/edit/Tasks/poisson2d.vflops.c"><code>poisson2d.vflops.c</code></a>. By now you should be able to find out the names of the counters by yourself (<em>Hint: they include the words »scalar« and »vector«…</em>).</p>
 <p>As usual, compile, test, and bench-run your program.</p>
 <p><a href="#toc">Back to top</a></p>
 
@@ -16300,7 +16721,7 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[42]:</div>
+<div class="prompt input_prompt">In&nbsp;[4]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="o">!</span>make bench_task4
@@ -16320,8 +16741,8 @@ n_\text{ld}^\text{scalar} &amp;= n_\text{ld}^\text{total} - n_\text{ld}^\text{ve
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>bsub -W 60 -nnodes 1 -Is -P GEN110 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.sflop.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.sflop.bin.csv
-Job &lt;4299&gt; is submitted to default queue &lt;batch&gt;.
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.sflop.bin /gpfs/wolf/trn003/scratch/aherten//poisson2d.sflop.bin.csv
+Job &lt;24645&gt; is submitted to default queue &lt;batch&gt;.
 &lt;&lt;Waiting for dispatch ...&gt;&gt;
 &lt;&lt;Starting on login1&gt;&gt;
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
@@ -16335,7 +16756,7 @@ iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCA
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,20,0.0013,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,24,0.0014,0,0,0
+200,32,24,0.0013,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,28,0.0014,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
@@ -16351,21 +16772,21 @@ iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCA
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,52,0.0018,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,56,0.0019,0,0,0
+200,32,56,0.0022,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,60,0.0020,0,0,0
+200,32,60,0.0019,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,64,0.0021,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,68,0.0022,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,72,0.0022,0,0,0
+200,32,72,0.0021,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,76,0.0022,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,80,0.0023,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,84,0.0024,0,0,0
+200,32,84,0.0025,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,88,0.0024,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
@@ -16373,39 +16794,39 @@ iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCA
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,96,0.0025,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,100,0.0028,0,0,0
+200,32,100,0.0026,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,104,0.0027,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,108,0.0027,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,112,0.0029,0,0,0
+200,32,112,0.0028,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,116,0.0028,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,120,0.0029,0,0,0
+200,32,120,0.0031,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,124,0.0030,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,128,0.0031,0,0,0
+200,32,128,0.0030,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,132,0.0031,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,136,0.0032,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,140,0.0033,0,0,0
+200,32,140,0.0032,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,144,0.0034,0,0,0
+200,32,144,0.0033,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,148,0.0034,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,152,0.0034,0,0,0
+200,32,152,0.0035,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,156,0.0035,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,160,0.0036,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,164,0.0037,0,0,0
+200,32,164,0.0036,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,168,0.0037,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
@@ -16415,13 +16836,13 @@ iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCA
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,180,0.0039,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,184,0.0039,0,0,0
+200,32,184,0.0040,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,188,0.0040,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,192,0.0041,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,196,0.0041,0,0,0
+200,32,196,0.0042,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,200,0.0042,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
@@ -16433,9 +16854,9 @@ iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCA
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,216,0.0045,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,220,0.0046,0,0,0
+200,32,220,0.0045,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,224,0.0047,0,0,0
+200,32,224,0.0046,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,228,0.0047,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
@@ -16447,11 +16868,11 @@ iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCA
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,244,0.0049,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,248,0.0050,0,0,0
+200,32,248,0.0051,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,252,0.0050,0,0,0
+200,32,252,0.0051,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,256,0.0051,0,0,0
+200,32,256,0.0053,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,260,0.0052,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
@@ -16459,79 +16880,79 @@ iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCA
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,268,0.0054,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,272,0.0055,0,0,0
+200,32,272,0.0054,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,276,0.0055,0,0,0
+200,32,276,0.0054,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,280,0.0055,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,284,0.0056,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,288,0.0057,0,0,0
+200,32,288,0.0056,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,292,0.0057,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,296,0.0058,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,300,0.0059,0,0,0
+200,32,300,0.0058,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,304,0.0059,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,308,0.0059,0,0,0
+200,32,308,0.0060,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,312,0.0060,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,316,0.0061,0,0,0
+200,32,316,0.0062,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,320,0.0061,0,0,0
+200,32,320,0.0062,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,324,0.0062,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,328,0.0063,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,332,0.0065,0,0,0
+200,32,332,0.0064,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,336,0.0064,0,0,0
+200,32,336,0.0065,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,340,0.0065,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,344,0.0065,0,0,0
+200,32,344,0.0066,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,348,0.0066,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,352,0.0067,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,356,0.0067,0,0,0
+200,32,356,0.0068,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,360,0.0068,0,0,0
+200,32,360,0.0069,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,364,0.0069,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,368,0.0070,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,372,0.0070,0,0,0
+200,32,372,0.0072,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,376,0.0071,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,380,0.0072,0,0,0
+200,32,380,0.0071,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,384,0.0072,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,388,0.0072,0,0,0
+200,32,388,0.0073,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,392,0.0075,0,0,0
+200,32,392,0.0074,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,396,0.0074,0,0,0
+200,32,396,0.0076,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,400,0.0075,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,404,0.0075,0,0,0
+200,32,404,0.0076,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,408,0.0076,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,412,0.0077,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,416,0.0077,0,0,0
+200,32,416,0.0078,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,420,0.0078,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
@@ -16541,27 +16962,27 @@ iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCA
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,432,0.0080,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,436,0.0080,0,0,0
+200,32,436,0.0081,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,440,0.0081,0,0,0
+200,32,440,0.0082,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,444,0.0083,0,0,0
+200,32,444,0.0082,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,448,0.0084,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,452,0.0084,0,0,0
+200,32,452,0.0083,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,456,0.0084,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,460,0.0085,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,464,0.0086,0,0,0
+200,32,464,0.0085,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,468,0.0086,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,472,0.0088,0,0,0
+200,32,472,0.0087,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,476,0.0087,0,0,0
+200,32,476,0.0089,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,480,0.0088,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
@@ -16571,7 +16992,7 @@ iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCA
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,492,0.0090,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,496,0.0090,0,0,0
+200,32,496,0.0091,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,500,0.0092,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
@@ -16579,266 +17000,266 @@ iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCA
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,508,0.0093,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,512,0.0092,0,0,0
+200,32,512,0.0094,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,516,0.0093,0,0,0
+200,32,516,0.0094,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,520,0.0094,0,0,0
+200,32,520,0.0095,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,524,0.0094,0,0,0
+200,32,524,0.0096,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,528,0.0094,0,0,0
+200,32,528,0.0096,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,532,0.0095,0,0,0
+200,32,532,0.0098,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,536,0.0096,0,0,0
+200,32,536,0.0097,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
 200,32,540,0.0098,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,544,0.0097,0,0,0
+200,32,544,0.0099,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,548,0.0098,0,0,0
+200,32,548,0.0100,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,552,0.0099,0,0,0
+200,32,552,0.0101,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,556,0.0099,0,0,0
+200,32,556,0.0101,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,560,0.0100,0,0,0
+200,32,560,0.0102,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,564,0.0102,0,0,0
+200,32,564,0.0103,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,568,0.0102,0,0,0
+200,32,568,0.0104,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,572,0.0103,0,0,0
+200,32,572,0.0105,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,576,0.0103,0,0,0
+200,32,576,0.0105,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,580,0.0105,0,0,0
+200,32,580,0.0106,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,584,0.0104,0,0,0
+200,32,584,0.0107,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,588,0.0106,0,0,0
+200,32,588,0.0107,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,592,0.0107,0,0,0
+200,32,592,0.0108,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,596,0.0106,0,0,0
+200,32,596,0.0109,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,600,0.0107,0,0,0
+200,32,600,0.0110,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,604,0.0109,0,0,0
+200,32,604,0.0111,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,608,0.0109,0,0,0
+200,32,608,0.0111,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,612,0.0109,0,0,0
+200,32,612,0.0112,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,616,0.0110,0,0,0
+200,32,616,0.0112,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,620,0.0117,0,0,0
+200,32,620,0.0113,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,624,0.0112,0,0,0
+200,32,624,0.0114,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,628,0.0111,0,0,0
+200,32,628,0.0115,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,632,0.0112,0,0,0
+200,32,632,0.0115,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,636,0.0113,0,0,0
+200,32,636,0.0115,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,640,0.0115,0,0,0
+200,32,640,0.0116,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,644,0.0114,0,0,0
+200,32,644,0.0118,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,648,0.0115,0,0,0
+200,32,648,0.0117,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,652,0.0116,0,0,0
+200,32,652,0.0119,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,656,0.0117,0,0,0
+200,32,656,0.0119,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,660,0.0117,0,0,0
+200,32,660,0.0121,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,664,0.0118,0,0,0
+200,32,664,0.0120,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,668,0.0119,0,0,0
+200,32,668,0.0122,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,672,0.0119,0,0,0
+200,32,672,0.0121,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,676,0.0119,0,0,0
+200,32,676,0.0124,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,680,0.0120,0,0,0
+200,32,680,0.0123,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,684,0.0121,0,0,0
+200,32,684,0.0125,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,688,0.0122,0,0,0
+200,32,688,0.0124,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,692,0.0122,0,0,0
+200,32,692,0.0125,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,696,0.0123,0,0,0
+200,32,696,0.0126,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,700,0.0124,0,0,0
+200,32,700,0.0127,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,704,0.0124,0,0,0
+200,32,704,0.0126,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,708,0.0125,0,0,0
+200,32,708,0.0127,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,712,0.0125,0,0,0
+200,32,712,0.0129,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,716,0.0126,0,0,0
+200,32,716,0.0128,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,720,0.0126,0,0,0
+200,32,720,0.0129,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,724,0.0127,0,0,0
+200,32,724,0.0132,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,728,0.0128,0,0,0
+200,32,728,0.0131,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,732,0.0128,0,0,0
+200,32,732,0.0131,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,736,0.0129,0,0,0
+200,32,736,0.0133,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,740,0.0130,0,0,0
+200,32,740,0.0133,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,744,0.0130,0,0,0
+200,32,744,0.0133,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,748,0.0131,0,0,0
+200,32,748,0.0134,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,752,0.0131,0,0,0
+200,32,752,0.0136,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,756,0.0132,0,0,0
+200,32,756,0.0136,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,760,0.0133,0,0,0
+200,32,760,0.0136,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,764,0.0134,0,0,0
+200,32,764,0.0136,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,768,0.0134,0,0,0
+200,32,768,0.0138,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,772,0.0136,0,0,0
+200,32,772,0.0138,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,776,0.0136,0,0,0
+200,32,776,0.0139,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,780,0.0136,0,0,0
+200,32,780,0.0139,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,784,0.0137,0,0,0
+200,32,784,0.0140,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,788,0.0138,0,0,0
+200,32,788,0.0140,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,792,0.0139,0,0,0
+200,32,792,0.0141,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,796,0.0139,0,0,0
+200,32,796,0.0142,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,800,0.0140,0,0,0
+200,32,800,0.0143,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,804,0.0141,0,0,0
+200,32,804,0.0143,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,808,0.0142,0,0,0
+200,32,808,0.0144,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,812,0.0142,0,0,0
+200,32,812,0.0144,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,816,0.0143,0,0,0
+200,32,816,0.0145,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,820,0.0143,0,0,0
+200,32,820,0.0146,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,824,0.0144,0,0,0
+200,32,824,0.0148,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,828,0.0145,0,0,0
+200,32,828,0.0147,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,832,0.0145,0,0,0
+200,32,832,0.0148,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,836,0.0146,0,0,0
+200,32,836,0.0149,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,840,0.0147,0,0,0
+200,32,840,0.0150,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,844,0.0147,0,0,0
+200,32,844,0.0150,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,848,0.0148,0,0,0
+200,32,848,0.0150,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,852,0.0149,0,0,0
+200,32,852,0.0151,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,856,0.0149,0,0,0
+200,32,856,0.0152,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,860,0.0150,0,0,0
+200,32,860,0.0152,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,864,0.0150,0,0,0
+200,32,864,0.0153,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,868,0.0152,0,0,0
+200,32,868,0.0154,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,872,0.0151,0,0,0
+200,32,872,0.0156,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,876,0.0153,0,0,0
+200,32,876,0.0156,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,880,0.0153,0,0,0
+200,32,880,0.0156,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,884,0.0153,0,0,0
+200,32,884,0.0157,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,888,0.0155,0,0,0
+200,32,888,0.0157,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,892,0.0156,0,0,0
+200,32,892,0.0158,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,896,0.0156,0,0,0
+200,32,896,0.0159,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,900,0.0158,0,0,0
+200,32,900,0.0159,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,904,0.0158,0,0,0
+200,32,904,0.0161,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,908,0.0159,0,0,0
+200,32,908,0.0162,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,912,0.0159,0,0,0
+200,32,912,0.0164,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,916,0.0162,0,0,0
+200,32,916,0.0163,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,920,0.0162,0,0,0
+200,32,920,0.0164,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,924,0.0162,0,0,0
+200,32,924,0.0165,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,928,0.0162,0,0,0
+200,32,928,0.0166,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,932,0.0163,0,0,0
+200,32,932,0.0166,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,936,0.0164,0,0,0
+200,32,936,0.0167,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,940,0.0165,0,0,0
+200,32,940,0.0167,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,944,0.0165,0,0,0
+200,32,944,0.0168,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,948,0.0166,0,0,0
+200,32,948,0.0169,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,952,0.0167,0,0,0
+200,32,952,0.0172,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,956,0.0168,0,0,0
+200,32,956,0.0171,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,960,0.0168,0,0,0
+200,32,960,0.0172,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,964,0.0172,0,0,0
+200,32,964,0.0175,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,968,0.0173,0,0,0
+200,32,968,0.0175,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,972,0.0173,0,0,0
+200,32,972,0.0176,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,976,0.0173,0,0,0
+200,32,976,0.0177,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,980,0.0175,0,0,0
+200,32,980,0.0178,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,984,0.0176,0,0,0
+200,32,984,0.0178,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,988,0.0175,0,0,0
+200,32,988,0.0179,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,992,0.0176,0,0,0
+200,32,992,0.0179,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,996,0.0178,0,0,0
+200,32,996,0.0182,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,1000,0.0177,0,0,0
+200,32,1000,0.0181,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,1004,0.0178,0,0,0
+200,32,1004,0.0182,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,1008,0.0178,0,0,0
+200,32,1008,0.0182,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,1012,0.0181,0,0,0
+200,32,1012,0.0184,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,1016,0.0180,0,0,0
+200,32,1016,0.0184,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,1020,0.0182,0,0,0
+200,32,1020,0.0186,0,0,0
 iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)
-200,32,1024,0.0179,0,0,0
-mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.sflop.bin.csv .
-bsub -W 60 -nnodes 1 -Is -P GEN110 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vflop.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.vflop.bin.csv
-Job &lt;4300&gt; is submitted to default queue &lt;batch&gt;.
+200,32,1024,0.0182,0,0,0
+mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.sflop.bin.csv .
+bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vflop.bin /gpfs/wolf/trn003/scratch/aherten//poisson2d.vflop.bin.csv
+Job &lt;24646&gt; is submitted to default queue &lt;batch&gt;.
 &lt;&lt;Waiting for dispatch ...&gt;&gt;
 &lt;&lt;Starting on login1&gt;&gt;
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
@@ -16852,7 +17273,7 @@ iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VEC
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,20,0.0013,438000,2190,2190
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,24,0.0014,534000,2670,2670
+200,32,24,0.0013,534000,2670,2670
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,28,0.0014,630000,3150,3150
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
@@ -16864,29 +17285,29 @@ iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VEC
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,44,0.0017,1014000,5070,5070
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,48,0.0018,1110000,5550,5550
+200,32,48,0.0017,1110000,5550,5550
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,52,0.0018,1206000,6030,6030
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,56,0.0020,1302000,6510,6510
+200,32,56,0.0019,1302000,6510,6510
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,60,0.0020,1398000,6990,6990
+200,32,60,0.0019,1398000,6990,6990
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,64,0.0021,1494000,7470,7470
+200,32,64,0.0020,1494000,7470,7470
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,68,0.0022,1590000,7950,7950
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,72,0.0022,1686000,8430,8430
+200,32,72,0.0021,1686000,8430,8430
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,76,0.0022,1782000,8910,8910
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,80,0.0023,1878000,9390,9390
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,84,0.0024,1974000,9870,9870
+200,32,84,0.0025,1974000,9870,9870
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,88,0.0024,2070000,10350,10350
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,92,0.0025,2166000,10830,10830
+200,32,92,0.0026,2166000,10830,10830
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,96,0.0025,2262000,11310,11310
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
@@ -16894,13 +17315,13 @@ iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VEC
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,104,0.0027,2454000,12270,12270
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,108,0.0028,2550000,12750,12750
+200,32,108,0.0027,2550000,12750,12750
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,112,0.0028,2646000,13230,13230
+200,32,112,0.0029,2646000,13230,13230
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,116,0.0029,2742000,13710,13710
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,120,0.0032,2838000,14190,14190
+200,32,120,0.0029,2838000,14190,14190
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,124,0.0030,2934000,14670,14670
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
@@ -16910,15 +17331,15 @@ iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VEC
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,136,0.0032,3222000,16110,16110
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,140,0.0033,3318000,16590,16590
+200,32,140,0.0032,3318000,16590,16590
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,144,0.0033,3414000,17070,17070
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,148,0.0034,3510000,17550,17550
+200,32,148,0.0036,3510000,17550,17550
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,152,0.0034,3606000,18030,18030
+200,32,152,0.0035,3606000,18030,18030
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,156,0.0036,3702000,18510,18510
+200,32,156,0.0035,3702000,18510,18510
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,160,0.0036,3798000,18990,18990
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
@@ -16928,13 +17349,13 @@ iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VEC
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,172,0.0038,4086000,20430,20430
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,176,0.0039,4182000,20910,20910
+200,32,176,0.0038,4182000,20910,20910
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,180,0.0039,4278000,21390,21390
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,184,0.0040,4374000,21870,21870
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,188,0.0040,4470000,22350,22350
+200,32,188,0.0041,4470000,22350,22350
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,192,0.0041,4566000,22830,22830
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
@@ -16944,25 +17365,25 @@ iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VEC
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,204,0.0043,4854000,24270,24270
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,208,0.0043,4950000,24750,24750
+200,32,208,0.0044,4950000,24750,24750
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,212,0.0044,5046000,25230,25230
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,216,0.0045,5142000,25710,25710
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,220,0.0047,5238000,26190,26190
+200,32,220,0.0046,5238000,26190,26190
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,224,0.0046,5334000,26670,26670
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,228,0.0047,5430000,27150,27150
+200,32,228,0.0048,5430000,27150,27150
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,232,0.0047,5526000,27630,27630
+200,32,232,0.0049,5526000,27630,27630
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,236,0.0048,5622000,28110,28110
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,240,0.0049,5718000,28590,28590
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,244,0.0050,5814000,29070,29070
+200,32,244,0.0049,5814000,29070,29070
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,248,0.0050,5910000,29550,29550
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
@@ -16972,19 +17393,19 @@ iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VEC
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,260,0.0052,6198000,30990,30990
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,264,0.0052,6294000,31470,31470
+200,32,264,0.0053,6294000,31470,31470
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,268,0.0053,6390000,31950,31950
+200,32,268,0.0054,6390000,31950,31950
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,272,0.0054,6486000,32430,32430
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,276,0.0058,6582000,32910,32910
+200,32,276,0.0054,6582000,32910,32910
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,280,0.0055,6678000,33390,33390
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,284,0.0056,6774000,33870,33870
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,288,0.0056,6870000,34350,34350
+200,32,288,0.0057,6870000,34350,34350
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,292,0.0057,6966000,34830,34830
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
@@ -16992,23 +17413,23 @@ iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VEC
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,300,0.0059,7158000,35790,35790
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,304,0.0060,7254000,36270,36270
+200,32,304,0.0059,7254000,36270,36270
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,308,0.0060,7350000,36750,36750
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,312,0.0061,7446000,37230,37230
+200,32,312,0.0062,7446000,37230,37230
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,316,0.0061,7542000,37710,37710
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,320,0.0062,7638000,38190,38190
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,324,0.0063,7734000,38670,38670
+200,32,324,0.0062,7734000,38670,38670
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,328,0.0064,7830000,39150,39150
+200,32,328,0.0063,7830000,39150,39150
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,332,0.0064,7926000,39630,39630
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,336,0.0064,8022000,40110,40110
+200,32,336,0.0065,8022000,40110,40110
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,340,0.0065,8118000,40590,40590
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
@@ -17016,21 +17437,21 @@ iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VEC
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,348,0.0066,8310000,41550,41550
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,352,0.0068,8406000,42030,42030
+200,32,352,0.0067,8406000,42030,42030
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,356,0.0069,8502000,42510,42510
+200,32,356,0.0068,8502000,42510,42510
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,360,0.0068,8598000,42990,42990
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,364,0.0069,8694000,43470,43470
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,368,0.0069,8790000,43950,43950
+200,32,368,0.0070,8790000,43950,43950
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,372,0.0070,8886000,44430,44430
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,376,0.0071,8982000,44910,44910
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,380,0.0071,9078000,45390,45390
+200,32,380,0.0072,9078000,45390,45390
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,384,0.0072,9174000,45870,45870
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
@@ -17048,23 +17469,23 @@ iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VEC
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,412,0.0077,9846000,49230,49230
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,416,0.0077,9942000,49710,49710
+200,32,416,0.0079,9942000,49710,49710
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,420,0.0078,10038000,50190,50190
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,424,0.0079,10134000,50670,50670
+200,32,424,0.0080,10134000,50670,50670
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,428,0.0079,10230000,51150,51150
+200,32,428,0.0080,10230000,51150,51150
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,432,0.0080,10326000,51630,51630
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,436,0.0080,10422000,52110,52110
+200,32,436,0.0083,10422000,52110,52110
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,440,0.0081,10518000,52590,52590
+200,32,440,0.0082,10518000,52590,52590
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,444,0.0082,10614000,53070,53070
+200,32,444,0.0083,10614000,53070,53070
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,448,0.0082,10710000,53550,53550
+200,32,448,0.0083,10710000,53550,53550
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,452,0.0083,10806000,54030,54030
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
@@ -17076,284 +17497,284 @@ iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VEC
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,468,0.0086,11190000,55950,55950
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,472,0.0088,11286000,56430,56430
+200,32,472,0.0087,11286000,56430,56430
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,476,0.0089,11382000,56910,56910
+200,32,476,0.0087,11382000,56910,56910
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,480,0.0088,11478000,57390,57390
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,484,0.0088,11574000,57870,57870
+200,32,484,0.0089,11574000,57870,57870
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,488,0.0089,11670000,58350,58350
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,492,0.0090,11766000,58830,58830
+200,32,492,0.0091,11766000,58830,58830
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,496,0.0090,11862000,59310,59310
+200,32,496,0.0091,11862000,59310,59310
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,500,0.0091,11958000,59790,59790
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
 200,32,504,0.0092,12054000,60270,60270
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,508,0.0094,12150000,60750,60750
+200,32,508,0.0093,12150000,60750,60750
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,512,0.0092,12246000,61230,61230
+200,32,512,0.0094,12246000,61230,61230
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,516,0.0093,12342000,61710,61710
+200,32,516,0.0096,12342000,61710,61710
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,520,0.0093,12438000,62190,62190
+200,32,520,0.0096,12438000,62190,62190
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,524,0.0094,12534000,62670,62670
+200,32,524,0.0095,12534000,62670,62670
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,528,0.0094,12630000,63150,63150
+200,32,528,0.0098,12630000,63150,63150
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,532,0.0095,12726000,63630,63630
+200,32,532,0.0097,12726000,63630,63630
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,536,0.0096,12822000,64110,64110
+200,32,536,0.0097,12822000,64110,64110
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,540,0.0100,12918000,64590,64590
+200,32,540,0.0098,12918000,64590,64590
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,544,0.0097,13014000,65070,65070
+200,32,544,0.0100,13014000,65070,65070
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,548,0.0098,13110000,65550,65550
+200,32,548,0.0102,13110000,65550,65550
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,552,0.0099,13206000,66030,66030
+200,32,552,0.0102,13206000,66030,66030
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,556,0.0100,13302000,66510,66510
+200,32,556,0.0101,13302000,66510,66510
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,560,0.0101,13398000,66990,66990
+200,32,560,0.0103,13398000,66990,66990
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,564,0.0102,13494000,67470,67470
+200,32,564,0.0103,13494000,67470,67470
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,568,0.0103,13590000,67950,67950
+200,32,568,0.0104,13590000,67950,67950
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,572,0.0103,13686000,68430,68430
+200,32,572,0.0105,13686000,68430,68430
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,576,0.0103,13782000,68910,68910
+200,32,576,0.0105,13782000,68910,68910
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,580,0.0105,13878000,69390,69390
+200,32,580,0.0107,13878000,69390,69390
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,584,0.0105,13974000,69870,69870
+200,32,584,0.0108,13974000,69870,69870
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,588,0.0106,14070000,70350,70350
+200,32,588,0.0107,14070000,70350,70350
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,592,0.0106,14166000,70830,70830
+200,32,592,0.0108,14166000,70830,70830
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,596,0.0106,14262000,71310,71310
+200,32,596,0.0109,14262000,71310,71310
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,600,0.0108,14358000,71790,71790
+200,32,600,0.0110,14358000,71790,71790
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,604,0.0109,14454000,72270,72270
+200,32,604,0.0110,14454000,72270,72270
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,608,0.0109,14550000,72750,72750
+200,32,608,0.0111,14550000,72750,72750
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,612,0.0109,14646000,73230,73230
+200,32,612,0.0114,14646000,73230,73230
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,616,0.0111,14742000,73710,73710
+200,32,616,0.0112,14742000,73710,73710
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,620,0.0111,14838000,74190,74190
+200,32,620,0.0113,14838000,74190,74190
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,624,0.0112,14934000,74670,74670
+200,32,624,0.0114,14934000,74670,74670
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,628,0.0112,15030000,75150,75150
+200,32,628,0.0116,15030000,75150,75150
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,632,0.0112,15126000,75630,75630
+200,32,632,0.0115,15126000,75630,75630
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,636,0.0114,15222000,76110,76110
+200,32,636,0.0117,15222000,76110,76110
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,640,0.0114,15318000,76590,76590
+200,32,640,0.0116,15318000,76590,76590
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,644,0.0114,15414000,77070,77070
+200,32,644,0.0118,15414000,77070,77070
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,648,0.0115,15510000,77550,77550
+200,32,648,0.0117,15510000,77550,77550
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,652,0.0117,15606000,78030,78030
+200,32,652,0.0119,15606000,78030,78030
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,656,0.0117,15702000,78510,78510
+200,32,656,0.0119,15702000,78510,78510
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,660,0.0117,15798000,78990,78990
+200,32,660,0.0120,15798000,78990,78990
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,664,0.0118,15894000,79470,79470
+200,32,664,0.0120,15894000,79470,79470
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,668,0.0120,15990000,79950,79950
+200,32,668,0.0121,15990000,79950,79950
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,672,0.0120,16086000,80430,80430
+200,32,672,0.0121,16086000,80430,80430
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,676,0.0121,16182000,80910,80910
+200,32,676,0.0123,16182000,80910,80910
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,680,0.0120,16278000,81390,81390
+200,32,680,0.0122,16278000,81390,81390
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,684,0.0121,16374000,81870,81870
+200,32,684,0.0125,16374000,81870,81870
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,688,0.0122,16470000,82350,82350
+200,32,688,0.0124,16470000,82350,82350
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,692,0.0122,16566000,82830,82830
+200,32,692,0.0126,16566000,82830,82830
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,696,0.0124,16662000,83310,83310
+200,32,696,0.0125,16662000,83310,83310
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,700,0.0124,16758000,83790,83790
+200,32,700,0.0127,16758000,83790,83790
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,704,0.0124,16854000,84270,84270
+200,32,704,0.0128,16854000,84270,84270
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,708,0.0125,16950000,84750,84750
+200,32,708,0.0128,16950000,84750,84750
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,712,0.0125,17046000,85230,85230
+200,32,712,0.0128,17046000,85230,85230
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,716,0.0126,17142000,85710,85710
+200,32,716,0.0128,17142000,85710,85710
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,720,0.0126,17238000,86190,86190
+200,32,720,0.0129,17238000,86190,86190
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,724,0.0127,17334000,86670,86670
+200,32,724,0.0130,17334000,86670,86670
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,728,0.0128,17430000,87150,87150
+200,32,728,0.0130,17430000,87150,87150
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,732,0.0130,17526000,87630,87630
+200,32,732,0.0132,17526000,87630,87630
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,736,0.0129,17622000,88110,88110
+200,32,736,0.0132,17622000,88110,88110
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,740,0.0129,17718000,88590,88590
+200,32,740,0.0133,17718000,88590,88590
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,744,0.0130,17814000,89070,89070
+200,32,744,0.0133,17814000,89070,89070
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,748,0.0131,17910000,89550,89550
+200,32,748,0.0134,17910000,89550,89550
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,752,0.0132,18006000,90030,90030
+200,32,752,0.0134,18006000,90030,90030
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,756,0.0132,18102000,90510,90510
+200,32,756,0.0136,18102000,90510,90510
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,760,0.0133,18198000,90990,90990
+200,32,760,0.0136,18198000,90990,90990
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,764,0.0134,18294000,91470,91470
+200,32,764,0.0136,18294000,91470,91470
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,768,0.0135,18390000,91950,91950
+200,32,768,0.0137,18390000,91950,91950
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,772,0.0136,18486000,92430,92430
+200,32,772,0.0139,18486000,92430,92430
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,776,0.0136,18582000,92910,92910
+200,32,776,0.0139,18582000,92910,92910
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,780,0.0137,18678000,93390,93390
+200,32,780,0.0139,18678000,93390,93390
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,784,0.0137,18774000,93870,93870
+200,32,784,0.0140,18774000,93870,93870
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,788,0.0138,18870000,94350,94350
+200,32,788,0.0140,18870000,94350,94350
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,792,0.0138,18966000,94830,94830
+200,32,792,0.0142,18966000,94830,94830
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,796,0.0140,19062000,95310,95310
+200,32,796,0.0142,19062000,95310,95310
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,800,0.0140,19158000,95790,95790
+200,32,800,0.0144,19158000,95790,95790
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,804,0.0140,19254000,96270,96270
+200,32,804,0.0143,19254000,96270,96270
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,808,0.0141,19350000,96750,96750
+200,32,808,0.0144,19350000,96750,96750
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,812,0.0142,19446000,97230,97230
+200,32,812,0.0145,19446000,97230,97230
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,816,0.0143,19542000,97710,97710
+200,32,816,0.0145,19542000,97710,97710
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,820,0.0143,19638000,98190,98190
+200,32,820,0.0146,19638000,98190,98190
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,824,0.0144,19734000,98670,98670
+200,32,824,0.0147,19734000,98670,98670
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,828,0.0146,19830000,99150,99150
+200,32,828,0.0147,19830000,99150,99150
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,832,0.0146,19926000,99630,99630
+200,32,832,0.0148,19926000,99630,99630
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,836,0.0146,20022000,100110,100110
+200,32,836,0.0151,20022000,100110,100110
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,840,0.0147,20118000,100590,100590
+200,32,840,0.0150,20118000,100590,100590
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,844,0.0147,20214000,101070,101070
+200,32,844,0.0150,20214000,101070,101070
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,848,0.0148,20310000,101550,101550
+200,32,848,0.0151,20310000,101550,101550
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,852,0.0148,20406000,102030,102030
+200,32,852,0.0152,20406000,102030,102030
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,856,0.0150,20502000,102510,102510
+200,32,856,0.0152,20502000,102510,102510
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,860,0.0150,20598000,102990,102990
+200,32,860,0.0152,20598000,102990,102990
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,864,0.0151,20694000,103470,103470
+200,32,864,0.0153,20694000,103470,103470
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,868,0.0151,20790000,103950,103950
+200,32,868,0.0154,20790000,103950,103950
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,872,0.0152,20886000,104430,104430
+200,32,872,0.0155,20886000,104430,104430
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,876,0.0153,20982000,104910,104910
+200,32,876,0.0155,20982000,104910,104910
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,880,0.0154,21078000,105390,105390
+200,32,880,0.0157,21078000,105390,105390
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,884,0.0154,21174000,105870,105870
+200,32,884,0.0157,21174000,105870,105870
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,888,0.0154,21270000,106350,106350
+200,32,888,0.0158,21270000,106350,106350
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,892,0.0155,21366000,106830,106830
+200,32,892,0.0158,21366000,106830,106830
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,896,0.0157,21462000,107310,107310
+200,32,896,0.0159,21462000,107310,107310
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,900,0.0156,21558000,107790,107790
+200,32,900,0.0161,21558000,107790,107790
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,904,0.0158,21654000,108270,108270
+200,32,904,0.0162,21654000,108270,108270
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,908,0.0159,21750000,108750,108750
+200,32,908,0.0161,21750000,108750,108750
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,912,0.0159,21846000,109230,109230
+200,32,912,0.0163,21846000,109230,109230
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,916,0.0161,21942000,109710,109710
+200,32,916,0.0164,21942000,109710,109710
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,920,0.0161,22038000,110190,110190
+200,32,920,0.0165,22038000,110190,110190
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,924,0.0162,22134000,110670,110670
+200,32,924,0.0164,22134000,110670,110670
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,928,0.0164,22230000,111150,111150
+200,32,928,0.0166,22230000,111150,111150
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,932,0.0164,22326000,111630,111630
+200,32,932,0.0166,22326000,111630,111630
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,936,0.0164,22422000,112110,112110
+200,32,936,0.0167,22422000,112110,112110
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,940,0.0164,22518000,112590,112590
+200,32,940,0.0168,22518000,112590,112590
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,944,0.0165,22614000,113070,113070
+200,32,944,0.0168,22614000,113070,113070
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,948,0.0167,22710000,113550,113550
+200,32,948,0.0169,22710000,113550,113550
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,952,0.0168,22806000,114030,114030
+200,32,952,0.0170,22806000,114030,114030
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,956,0.0168,22902000,114510,114510
+200,32,956,0.0170,22902000,114510,114510
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,960,0.0168,22998000,114990,114990
+200,32,960,0.0171,22998000,114990,114990
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,964,0.0174,23094000,115470,115470
+200,32,964,0.0176,23094000,115470,115470
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,968,0.0172,23190000,115950,115950
+200,32,968,0.0176,23190000,115950,115950
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,972,0.0173,23286000,116430,116430
+200,32,972,0.0177,23286000,116430,116430
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,976,0.0172,23382000,116910,116910
+200,32,976,0.0177,23382000,116910,116910
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,980,0.0174,23478000,117390,117390
+200,32,980,0.0178,23478000,117390,117390
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,984,0.0174,23574000,117870,117870
+200,32,984,0.0178,23574000,117870,117870
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,988,0.0176,23670000,118350,118350
+200,32,988,0.0179,23670000,118350,118350
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,992,0.0176,23766000,118830,118830
+200,32,992,0.0180,23766000,118830,118830
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,996,0.0179,23862000,119310,119310
+200,32,996,0.0181,23862000,119310,119310
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,1000,0.0177,23958000,119790,119790
+200,32,1000,0.0182,23958000,119790,119790
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,1004,0.0178,24054000,120270,120270
+200,32,1004,0.0182,24054000,120270,120270
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,1008,0.0178,24150000,120750,120750
+200,32,1008,0.0182,24150000,120750,120750
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,1012,0.0180,24246000,121230,121230
+200,32,1012,0.0184,24246000,121230,121230
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,1016,0.0180,24342000,121710,121710
+200,32,1016,0.0185,24342000,121710,121710
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,1020,0.0181,24438000,122190,122190
+200,32,1020,0.0184,24438000,122190,122190
 iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)
-200,32,1024,0.0178,24534000,122670,122670
-mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vflop.bin.csv .
+200,32,1024,0.0182,24534000,122670,122670
+mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.vflop.bin.csv .
 </pre>
 </div>
 </div>
@@ -17364,35 +17785,153 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vflop.bin.csv .
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[47]:</div>
+<div class="prompt input_prompt">In&nbsp;[39]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_sflop</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&quot;poisson2d.sflop.bin.csv&quot;</span><span class="p">,</span> <span class="n">skiprows</span><span class="o">=</span><span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">50000</span><span class="p">,</span> <span class="mi">2</span><span class="p">))</span>
 <span class="n">df_vflop</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&quot;poisson2d.vflop.bin.csv&quot;</span><span class="p">,</span> <span class="n">skiprows</span><span class="o">=</span><span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">50000</span><span class="p">,</span> <span class="mi">2</span><span class="p">))</span>
 <span class="n">df_flop</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">concat</span><span class="p">([</span><span class="n">df_sflop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">),</span> <span class="n">df_vflop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[[</span><span class="s1">&#39;PM_VECTOR_FLOP_CMPL (total)&#39;</span><span class="p">,</span> <span class="s1">&#39;PM_VECTOR_FLOP_CMPL (min)&#39;</span><span class="p">,</span> <span class="s1">&#39; PM_VECTOR_FLOP_CMPL (max)&#39;</span><span class="p">]]],</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span>
+<span class="n">df_flop</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
 </pre></div>
 
     </div>
 </div>
 </div>
 
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt output_prompt">Out[39]:</div>
+
+
+
+<div class="output_html rendered_html output_subarea output_execute_result">
+<div>
+<style scoped>
+    .dataframe tbody tr th:only-of-type {
+        vertical-align: middle;
+    }
+
+    .dataframe tbody tr th {
+        vertical-align: top;
+    }
+
+    .dataframe thead th {
+        text-align: right;
+    }
+</style>
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>nx</th>
+      <th>iter</th>
+      <th>ny</th>
+      <th>Runtime</th>
+      <th>PM_SCALAR_FLOP_CMPL (total)</th>
+      <th>PM_SCALAR_FLOP_CMPL (min)</th>
+      <th>PM_SCALAR_FLOP_CMPL (max)</th>
+      <th>PM_VECTOR_FLOP_CMPL (total)</th>
+      <th>PM_VECTOR_FLOP_CMPL (min)</th>
+      <th>PM_VECTOR_FLOP_CMPL (max)</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>4</td>
+      <td>200</td>
+      <td>32</td>
+      <td>0.0010</td>
+      <td>96000</td>
+      <td>480</td>
+      <td>480</td>
+      <td>0</td>
+      <td>0</td>
+      <td>0</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>8</td>
+      <td>200</td>
+      <td>32</td>
+      <td>0.0011</td>
+      <td>0</td>
+      <td>0</td>
+      <td>0</td>
+      <td>150000</td>
+      <td>750</td>
+      <td>750</td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td>12</td>
+      <td>200</td>
+      <td>32</td>
+      <td>0.0012</td>
+      <td>0</td>
+      <td>0</td>
+      <td>0</td>
+      <td>246000</td>
+      <td>1230</td>
+      <td>1230</td>
+    </tr>
+    <tr>
+      <th>3</th>
+      <td>16</td>
+      <td>200</td>
+      <td>32</td>
+      <td>0.0012</td>
+      <td>0</td>
+      <td>0</td>
+      <td>0</td>
+      <td>342000</td>
+      <td>1710</td>
+      <td>1710</td>
+    </tr>
+    <tr>
+      <th>4</th>
+      <td>20</td>
+      <td>200</td>
+      <td>32</td>
+      <td>0.0013</td>
+      <td>0</td>
+      <td>0</td>
+      <td>0</td>
+      <td>438000</td>
+      <td>2190</td>
+      <td>2190</td>
+    </tr>
+  </tbody>
+</table>
+</div>
+</div>
+
+</div>
+
+</div>
+</div>
+
 </div>
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<p>The name of the vector counter is a bit misleading; not floating point operations are measured but floating point instructions. To get <em>real</em> floating point operations, each value needs to be multiplied by the vector width (2). We can plot the values afterwards (non-interactive: <code>make graph_task4</code>).</p>
+<p>Again, the name of the vector counter is a bit misleading: it counts floating point instructions, not floating point operations. To get <em>real</em> floating point operations, each value needs to be multiplied by the vector width (2). We can plot the values afterwards (non-interactive: <code>make graph_task4</code>).</p>
 
 </div>
 </div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[49]:</div>
+<div class="prompt input_prompt">In&nbsp;[40]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">common</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="n">df_flop</span><span class="p">,</span> <span class="s2">&quot;PM_SCALAR_FLOP_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Scalar FlOps / Loop Iteration&quot;</span><span class="p">)</span>
-<span class="n">common</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="n">df_flop</span><span class="p">,</span> <span class="s2">&quot;PM_VECTOR_FLOP_CMPL (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Vector Instructions / Loop Iteration&quot;</span><span class="p">)</span>
-<span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;Vector FlOps / Loop Iteration&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;Vector Instructions / Loop Iteration&quot;</span><span class="p">]</span> <span class="o">*</span> <span class="mi">2</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;nx&quot;</span><span class="p">]</span> <span class="o">*</span> <span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;ny&quot;</span><span class="p">]</span>
+<span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;Vector FlOps (min)&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;PM_VECTOR_FLOP_CMPL (min)&quot;</span><span class="p">]</span> <span class="o">*</span> <span class="mi">2</span>
+<span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;Scalar FlOps (min)&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df_flop</span><span class="p">[</span><span class="s2">&quot;PM_SCALAR_FLOP_CMPL (min)&quot;</span><span class="p">]</span>
 </pre></div>
 
     </div>
@@ -17402,10 +17941,10 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vflop.bin.csv .
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[50]:</div>
+<div class="prompt input_prompt">In&nbsp;[41]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_flop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[[</span><span class="s2">&quot;Scalar FlOps / Loop Iteration&quot;</span><span class="p">,</span> <span class="s2">&quot;Vector FlOps / Loop Iteration&quot;</span><span class="p">]]</span><span class="o">.</span><span class="n">plot</span><span class="p">();</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_flop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[[</span><span class="s2">&quot;Scalar FlOps (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Vector FlOps (min)&quot;</span><span class="p">]]</span><span class="o">.</span><span class="n">plot</span><span class="p">();</span>
 </pre></div>
 
     </div>
@@ -17424,7 +17963,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vflop.bin.csv .
 
 
 <div class="output_png output_subarea ">
-<img src="
+<img src="
 "
 >
 </div>
@@ -17434,6 +17973,52 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vflop.bin.csv .
 </div>
 </div>
 
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[43]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">_fit</span><span class="p">,</span> <span class="n">_cov</span> <span class="o">=</span> <span class="n">common</span><span class="o">.</span><span class="n">print_and_return_fit</span><span class="p">(</span>
+    <span class="p">[</span><span class="s2">&quot;Scalar FlOps (min)&quot;</span><span class="p">,</span> <span class="s2">&quot;Vector FlOps (min)&quot;</span><span class="p">],</span> 
+    <span class="n">df_flop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">),</span> 
+    <span class="n">linear_function</span>
+<span class="p">)</span>
+<span class="n">fit_parameters</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">fit_parameters</span><span class="p">,</span> <span class="o">**</span><span class="n">_fit</span><span class="p">}</span>
+<span class="n">fit_covariance</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">fit_covariance</span><span class="p">,</span> <span class="o">**</span><span class="n">_cov</span><span class="p">}</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>Counter Scalar FlOps (min) is proportional to the grid points (nx*ny) by a factor of -0.0003 (± 0.0002)
+Counter Vector FlOps (min) is proportional to the grid points (nx*ny) by a factor of  7.5004 (± 0.0002)
+</pre>
+</div>
+</div>
+
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<p>Interesting! We seem to be using the vector registers of our system very well. Basically all operations are vector operations!</p>
+
+</div>
+</div>
 </div>
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
@@ -17449,13 +18034,13 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vflop.bin.csv .
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[66]:</div>
+<div class="prompt input_prompt">In&nbsp;[56]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">I_flop_scalar</span> <span class="o">=</span> <span class="n">df_flop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Scalar FlOps / Loop Iteration&quot;</span><span class="p">]</span>
-<span class="n">I_flop_vector</span> <span class="o">=</span> <span class="n">df_flop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;nx&quot;</span><span class="p">)[</span><span class="s2">&quot;Vector FlOps / Loop Iteration&quot;</span><span class="p">]</span>
-<span class="n">I_mem_load</span>    <span class="o">=</span> <span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Loads / Loop Iteration&quot;</span><span class="p">]</span>
-<span class="n">I_mem_store</span>   <span class="o">=</span> <span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Stores / Loop Iteration&quot;</span><span class="p">]</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">I_flop_scalar</span> <span class="o">=</span> <span class="n">df_flop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;Scalar FlOps (min)&quot;</span><span class="p">]</span>
+<span class="n">I_flop_vector</span> <span class="o">=</span> <span class="n">df_flop</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="s2">&quot;Grid Points&quot;</span><span class="p">)[</span><span class="s2">&quot;Vector FlOps (min)&quot;</span><span class="p">]</span>
+<span class="n">I_mem_load</span>    <span class="o">=</span> <span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Loads&quot;</span><span class="p">]</span>
+<span class="n">I_mem_store</span>   <span class="o">=</span> <span class="n">df_byte</span><span class="p">[</span><span class="s2">&quot;Stores&quot;</span><span class="p">]</span>
 </pre></div>
 
     </div>
@@ -17465,7 +18050,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vflop.bin.csv .
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[75]:</div>
+<div class="prompt input_prompt">In&nbsp;[57]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_ai</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">()</span>
@@ -17490,7 +18075,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vflop.bin.csv .
 
 
 <div class="output_png output_subarea ">
-<img src="
+<img src="
 "
 >
 </div>
@@ -17514,6 +18099,7 @@ mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vflop.bin.csv .
 <div class="text_cell_render border-box-sizing rendered_html">
 <h2 id="Task-E2:-Measuring-a-Larger-Range">Task E2: Measuring a Larger Range<a class="anchor-link" href="#Task-E2:-Measuring-a-Larger-Range">&#182;</a></h2><p><a name="taske2"></a></p>
 <p>If you still still have time, you might venture into your own benchmarking adventure.</p>
+<p>Maybe you have already noticed, for instance in Task 2 C, that the behaviour of the graph changes at the far right, towards very large numbers of grid points. Something is happening there!</p>
 <p><strong>TASK</strong>: Revisit the counters measured above for a larger range of <code>nx</code>. Right now, we only studied <code>nx</code> until 1000. New effects appear above that value – partly only well above, though ($nx &gt; 15000$).</p>
 <p>You're on your own here. Edit the <code>bench.sh</code> script to change the range and the stepping increments.</p>
 <p><strong>Good luck!</strong></p>
diff --git a/2-Performance_Counters/Handson/Solutions/Hands-On-Performance-Counters.ipynb b/2-Performance_Counters/Handson/Solutions/Hands-On-Performance-Counters.ipynb
index ae4037283522b909c8fe6ef29d2108cf3af5cc07..91f993b1553d71a39298693fdaa16fc55240d18b 100644
--- a/2-Performance_Counters/Handson/Solutions/Hands-On-Performance-Counters.ipynb
+++ b/2-Performance_Counters/Handson/Solutions/Hands-On-Performance-Counters.ipynb
@@ -1,4402 +1 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Hands-On: Performance Counters\n",
-    "\n",
-    "This Notebook is part of the exercises for the SC18 Tutorial »Application Porting and Optimization on GPU-accelerated POWER Architectures«. It is to be run on a POWER9 machine; in the tutorial: on Ascent, the POWER9 training cluster of Oak Ridge National Lab.\n",
-    "\n",
-    "This Notebook can be run interactively on Ascent. If this capability is unavailable to you, use it as a description for executing the tasks on Ascent via a shell access. During data evaluation, the Notebook mentions the corresponding commands to execute in case you are not able to run the Notebook interactively directly on Ascent.\n",
-    "\n",
-    "## Table of Contents\n",
-    "<a name=\"toc\"></a>\n",
-    "\n",
-    "* [Task 1: Measuring Cycles and Instructions](#task1)\n",
-    "* [Task 2: Measuring Loads and Stores](#task2)\n",
-    "  - [A: Loads and Stores](#task2-a)\n",
-    "  - [B: More Loads and Stores](#task2-b)\n",
-    "  - [C: Bandwidth](#task2-c)\n",
-    "* [Task E1: Measuring FLOP](#taske1)\n",
-    "* [Task E2: Measuring a Greater Range](#taske2)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Task 1: Measuring Cycles and Instructions\n",
-    "<a name=\"task1\"></a>\n",
-    "\n",
-    "Throughout this exercise, the core loop of the Jacobi algorithm is instrumented and analyzed. The part in question is\n",
-    "\n",
-    "```c\n",
-    "for (int iy = iy_start; iy < iy_end; iy++)\n",
-    "{\n",
-    "    for( int ix = ix_start; ix < ix_end; ix++ )\n",
-    "    {\n",
-    "        Anew[iy*nx+ix] = -0.25 * (rhs[iy*nx+ix] - (A[ iy   *nx+ix+1] + A[ iy   *nx+ix-1]\n",
-    "                                                +  A[(iy-1)*nx+ix  ] + A[(iy+1)*nx+ix  ]));\n",
-    "        error = fmaxr( error, fabsr(Anew[iy*nx+ix]-A[iy*nx+ix]));\n",
-    "    }\n",
-    "}\n",
-    "```\n",
-    "\n",
-    "After `PAPI_add_named_event()` is used to add named PMU events outside of the relaxation iteration, `PAPI_start()`\n",
-    "and `PAPI_stop()` can be used to count how often a PMU event is incremented.\n",
-    "\n",
-    "For the first task, we will measure quantities often used to characterize an application, cycles and instructions.\n",
-    "\n",
-    "**TASK**: Please measure counters for completed instructions and run cycles. See the TODOs in [`poisson2d.ins_cyc.c`](/edit/Tasks/poisson2d.ins_cyc.c). Either edit with Jupyter capabilities by clicking on the link of the file or use a dedicated editor (`vim` is available). The names of the counters to be implemented are `PM_INST_CMPL` and `PM_RUN_CYC`.\n",
-    "\n",
-    "After changing the source code, compile it with `make task1` or by executing the following cell (we need to change directories first, though).\n",
-    "\n",
-    "[Back to top](#toc)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "/autofs/nccsopen-svm1_home/aherten/OpenPOWER-SC18/2-PAPI/Compiling/Solutions\n"
-     ]
-    }
-   ],
-   "source": [
-    "%cd Tasks/\n",
-    "# Use `%cd Solutions` to look at the solutions for each task"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "gcc -DUSE_DOUBLE -Ofast -std=c99 -lm -lpapi  poisson2d.ins_cyc.c -o poisson2d.ins_cyc.bin\r\n"
-     ]
-    }
-   ],
-   "source": [
-    "!make task1"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Make sure your program is measuring correctly, by invoking it, for instance with these arguments: `./poisson2d.ins_cyc.bin 100 64 32` – see the next cell. The `100` specifies the number of iterations to perform, `64` and `32` are the size of the grid in y and x direction, respectively."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\r\n",
-      "100,64,32,0.0011,3324000,33229,34329,1902422,18803,27821\r\n"
-     ]
-    }
-   ],
-   "source": [
-    "!./poisson2d.ins_cyc.bin 100 64 32\n",
-    "# alternatively call !make run_task1"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "For the following runs, we are going to use Ascent's compute backend nodes which are not shared amongst users and also have six GPUs available. We use the available batch scheduler *IBM Spectrum LSF* for this. For convenience, a call to the batch submission system is stored in the environment variable `$SC18_SUBMIT_CMD`. You are welcome to adapt it once you get more familiar with the system.\n",
-    "\n",
-    "For now, we want to run our first benchmarking run and measure cycles and instructions for different data sizes, as a function of `nx`. The Makefile holds a target for this, call it with `make bench_task1`:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 80,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "gcc -DUSE_DOUBLE -Ofast -std=c99 -lm -lpapi  poisson2d.ins_cyc.c -o poisson2d.ins_cyc.bin\n",
-      "bsub -W 60 -nnodes 1 -Is -P GEN110 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.ins_cyc.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv\n",
-      "Job <4318> is submitted to default queue <batch>.\n",
-      "<<Waiting for dispatch ...>>\n",
-      "<<Starting on login1>>\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,4,0.0012,548153,2735,3888,266504,1243,4753\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,8,0.0014,1082153,5405,6558,668070,3227,6573\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,12,0.0014,1442153,7205,8358,872094,4181,12974\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,16,0.0015,1802153,9005,10158,1074585,5230,7975\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,20,0.0015,2162153,10805,11958,1281118,6236,14107\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,24,0.0016,2522153,12605,13758,1479347,7222,10037\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,28,0.0019,2882153,14405,15558,1682827,8251,11219\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,32,0.0017,3242153,16205,17358,1871170,9210,12109\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,36,0.0018,3602153,18005,19158,2075730,10193,13063\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,40,0.0019,3962153,19805,20958,2272736,11258,14491\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,44,0.0019,4322153,21605,22758,2491982,12249,17554\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,48,0.0020,4682153,23405,24558,2692600,13292,16003\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,52,0.0020,5042153,25205,26358,2878730,14277,17055\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,56,0.0021,5402153,27005,28158,3084915,15295,18583\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,60,0.0022,5762153,28805,29958,3291836,16330,19233\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,64,0.0023,6122153,30605,31758,3622134,17946,20887\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,68,0.0024,6482153,32405,33558,3930512,19200,22297\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,72,0.0027,6842153,34205,35358,4270649,20402,22797\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,76,0.0025,7202153,36005,37158,4209408,20894,24035\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,80,0.0025,7562153,37805,38958,4410712,21911,24986\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,84,0.0026,7922153,39605,40758,4631259,23020,25649\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,88,0.0027,8282153,41405,42558,4814218,23914,26743\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,92,0.0027,8642153,43205,44358,5039020,24944,37612\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,96,0.0030,9002153,45005,46158,5247046,26072,29012\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,100,0.0029,9362153,46805,47958,5426721,26963,29831\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,104,0.0029,9722153,48605,49758,5619647,27963,31679\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,108,0.0030,10082153,50405,51558,5828776,28956,31626\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,112,0.0031,10442153,52205,53358,6033005,30029,32674\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,116,0.0031,10802153,54005,55158,6244763,30994,35257\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,120,0.0032,11162153,55805,56958,6425499,31972,34572\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,124,0.0033,11522153,57605,58758,6654149,33094,35931\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,128,0.0033,11882153,59405,60558,6851733,34090,36755\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,132,0.0034,12242153,61205,62358,7052529,35058,39834\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,136,0.0035,12602153,63005,64158,7241645,36039,38957\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,140,0.0035,12962153,64805,65958,7438548,37024,39702\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,144,0.0036,13322153,66605,67758,7649807,38039,46041\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,148,0.0037,13682153,68405,69558,7837686,39006,41671\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,152,0.0037,14042153,70205,71358,8039582,40031,42707\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,156,0.0038,14402153,72005,73158,8272212,41195,43645\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,160,0.0040,14762153,73805,74958,8471858,42200,44594\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,164,0.0039,15122153,75605,76758,8657085,43103,45699\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,168,0.0039,15482153,77405,78558,8856462,44110,46863\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,172,0.0040,15842153,79205,80358,9050337,45084,47600\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,176,0.0041,16202153,81005,82158,9267755,46142,55546\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,180,0.0042,16562153,82805,83958,9452041,47058,49763\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,184,0.0042,16922153,84605,85758,9655929,48043,50875\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,188,0.0043,17282153,86405,87558,9906002,49331,52491\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,192,0.0043,17642153,88205,89358,10089481,50268,52937\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,196,0.0044,18002153,90005,91158,10292606,51256,54507\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,200,0.0045,18362153,91805,92958,10466174,52144,54851\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,204,0.0045,18722153,93605,94758,10710242,53145,77999\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,208,0.0046,19082153,95405,96558,10872705,54177,57081\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,212,0.0047,19442153,97205,98358,11284063,56244,58937\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,216,0.0047,19802153,99005,100158,11267668,56162,58869\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,220,0.0048,20162153,100805,101958,11510801,57350,60362\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,224,0.0051,20522153,102605,103758,11730908,58406,61013\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,228,0.0050,20882153,104405,105558,11891323,59260,62051\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,232,0.0050,21242153,106205,107358,12083458,60220,63113\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,236,0.0050,21602153,108005,109158,12290078,61234,68599\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,240,0.0051,21962153,109805,110958,12547828,62267,88616\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,244,0.0052,22322153,111605,112758,12674066,63146,66333\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,248,0.0052,22682153,113405,114558,12882346,64155,67081\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,252,0.0053,23042153,115205,116358,13140221,65490,68231\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,256,0.0054,23402153,117005,118158,13331460,66431,69187\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,260,0.0054,23762153,118805,119958,13531478,67456,70141\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,264,0.0055,24122153,120605,121758,13710546,68246,81094\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,268,0.0055,24482153,122405,123558,13890638,69208,72412\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,272,0.0056,24842153,124205,125358,14130816,70366,88752\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,276,0.0057,25202153,126005,127158,14355067,71208,93990\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,280,0.0057,25562153,127805,128958,14513593,72251,85857\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,284,0.0059,25922153,129605,130758,14800806,73802,76775\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,288,0.0059,26282153,131405,132558,14959572,74579,77267\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,292,0.0059,26642153,133205,134358,15130033,75389,78361\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,296,0.0060,27002153,135005,136158,15314583,76370,79151\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,300,0.0061,27362153,136805,137958,15515700,77373,80055\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,304,0.0061,27722153,138605,139758,15739536,78395,81351\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,308,0.0062,28082153,140405,141558,15910915,79341,82085\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,312,0.0063,28442153,142205,143358,16119259,80297,83271\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,316,0.0063,28802153,144005,145158,16376727,81668,84481\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,320,0.0064,29162153,145805,146958,16575917,82685,85800\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,324,0.0065,29522153,147605,148758,16752101,83529,86861\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,328,0.0065,29882153,149405,150558,16931954,84456,87199\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,332,0.0066,30242153,151205,152358,17129562,85462,88022\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,336,0.0067,30602153,153005,154158,17522378,87337,90235\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,340,0.0067,30962153,154805,155958,17525540,87379,89947\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,344,0.0068,31322153,156605,157758,17811817,88413,169057\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,348,0.0069,31682153,158405,159558,17999372,89772,92601\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,352,0.0069,32042153,160205,161358,18204371,90776,101494\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,356,0.0070,32402153,162005,163158,18393456,91621,107055\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,360,0.0070,32762153,163805,164958,18567077,92476,114024\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,364,0.0072,33122153,165605,166758,18749614,93562,96291\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,368,0.0073,33482153,167405,168558,18957503,94465,97467\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,372,0.0072,33842153,169205,170358,19137907,95471,98421\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,376,0.0073,34202153,171005,172158,19350029,96457,99505\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,380,0.0075,34562153,172805,173958,19657158,97897,122483\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,384,0.0075,34922153,174605,175758,20019224,98872,199167\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,388,0.0075,35282153,176405,177558,19999785,99747,102911\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,392,0.0077,35642153,178205,179358,20188679,100586,121054\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,396,0.0076,36002153,180005,181158,20368637,101583,105060\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,400,0.0077,36362153,181805,182958,20628698,102607,152896\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,404,0.0078,36722153,183605,184758,20759711,103503,111551\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,408,0.0078,37082153,185405,186558,21008339,104552,136230\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,412,0.0080,37442153,187205,188358,21248565,105961,109252\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,416,0.0080,37802153,189005,190158,21446394,106998,110446\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,420,0.0081,38162153,190805,191958,21618503,107795,119989\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,424,0.0081,38522153,192605,193758,21778142,108604,112064\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,428,0.0081,38882153,194405,195558,21989784,109653,120306\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,432,0.0082,39242153,196205,197358,22191881,110730,113916\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,436,0.0083,39602153,198005,199158,22373426,111587,115657\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,440,0.0084,39962153,199805,200958,22596402,112638,130342\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,444,0.0084,40322153,201605,202758,22868323,114041,124888\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,448,0.0085,40682153,203405,204558,23084361,115132,128588\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,452,0.0086,41042153,205205,206358,23255449,115787,156348\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,456,0.0088,41402153,207005,208158,23400730,116742,119985\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,460,0.0087,41762153,208805,209958,23616057,117782,125672\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,464,0.0088,42122153,210605,211758,23845815,118769,150383\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,468,0.0089,42482153,212405,213558,23982677,119580,123029\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,472,0.0090,42842153,214205,215358,24183894,120688,124270\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,476,0.0090,43202153,216005,217158,24479273,122149,125974\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,480,0.0091,43562153,217805,218958,24768939,123125,164217\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,484,0.0092,43922153,219605,220758,24828983,123895,127390\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,488,0.0091,44282153,221405,222558,25011559,124768,128788\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,492,0.0092,44642153,223205,224358,25219550,125760,132732\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,496,0.0093,45002153,225005,226158,25447017,126853,140428\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,500,0.0093,45362153,226805,227958,25586059,127650,131094\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,504,0.0094,45722153,228605,229758,25796559,128739,131932\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,508,0.0095,46082153,230405,231558,26122261,130275,141242\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,512,0.0095,46442153,232205,233358,26303806,130890,135216\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,516,0.0096,46802153,234005,235158,26441241,131860,137807\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,520,0.0097,47162153,235805,236958,26620814,132726,144193\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,524,0.0097,47522153,237605,238758,26895547,133979,180810\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,528,0.0098,47882153,239405,240558,27103175,134594,195038\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,532,0.0099,48242153,241205,242358,27216804,135653,148537\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,536,0.0100,48602153,243005,244158,27609711,137157,225927\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,540,0.0101,48962153,244805,245958,27856165,138525,222412\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,544,0.0101,49322153,246605,247758,27949313,139206,146089\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,548,0.0102,49682153,248405,249558,28071639,140106,144061\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,552,0.0102,50042153,250205,251358,28221254,140771,147826\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,556,0.0103,50402153,252005,253158,28466442,141994,145849\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,560,0.0105,50762153,253805,254958,28785863,142904,194917\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,564,0.0105,51122153,255605,256758,28851831,143902,156411\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,568,0.0106,51482153,257405,258558,29223120,145608,162476\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,572,0.0108,51842153,259205,260358,29438332,146788,151895\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,576,0.0108,52202153,261005,262158,29557331,147210,151262\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,580,0.0108,52562153,262805,263958,29704990,148198,158557\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,584,0.0108,52922153,264605,265758,29996452,149016,250006\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,588,0.0109,53282153,266405,267558,30123135,150270,154069\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,592,0.0110,53642153,268205,269358,30283611,150978,165439\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,596,0.0110,54002153,270005,271158,30512807,152128,156216\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,600,0.0111,54362153,271805,272958,30713954,153227,157015\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,604,0.0113,54722153,273605,274758,31116246,155098,162946\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,608,0.0113,55082153,275405,276558,31292429,155792,166047\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,612,0.0113,55442153,277205,278358,31367681,156312,187819\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,616,0.0114,55802153,279005,280158,31509163,156923,173955\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,620,0.0115,56162153,280805,281958,31751550,158349,162413\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,624,0.0116,56522153,282605,283758,32010052,159426,164990\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,628,0.0116,56882153,284405,285558,32270071,160471,206182\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,632,0.0118,57242153,286205,287358,32379821,161317,166154\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,636,0.0118,57602153,288005,289158,32621237,162719,174455\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,640,0.0118,57962153,289805,290958,32760054,163283,174727\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,644,0.0119,58322153,291605,292758,32895462,163973,168568\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,648,0.0119,58682153,293405,294558,33046462,164805,176098\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,652,0.0120,59042153,295205,296358,33305627,166069,179927\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,656,0.0121,59402153,297005,298158,33611780,166989,248127\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,660,0.0121,59762153,298805,299958,33791922,168433,184984\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,664,0.0121,60122153,300605,301758,33927065,169140,182483\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,668,0.0124,60482153,302405,303558,34476798,171567,188679\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,672,0.0123,60842153,304205,305358,34350802,171240,175365\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,676,0.0123,61202153,306005,307158,34529315,172118,202239\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,680,0.0124,61562153,307805,308958,34716545,172878,244909\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,684,0.0126,61922153,309605,310758,35111667,174820,186347\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,688,0.0126,62282153,311405,312558,35200811,175517,179013\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,692,0.0126,62642153,313205,314358,35391859,176015,252609\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,696,0.0127,63002153,315005,316158,35696188,177815,200506\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,700,0.0128,63362153,316805,317958,35825556,178736,191521\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,704,0.0129,63722153,318605,319758,36008866,179237,218743\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,708,0.0129,64082153,320405,321558,36282257,180511,214158\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,712,0.0129,64442153,322205,323358,36251857,180793,191833\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,716,0.0131,64802153,324005,325158,36828270,182903,229477\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,720,0.0130,65162153,325805,326958,36775140,183107,213910\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,724,0.0131,65522153,327605,328758,36946255,184028,240244\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,728,0.0132,65882153,329405,330558,37189420,185485,206103\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,732,0.0133,66242153,331205,332358,37526856,187108,192940\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,736,0.0134,66602153,333005,334158,37747623,188004,201070\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,740,0.0134,66962153,334805,335958,37844347,188709,198675\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,744,0.0134,67322153,336605,337758,37874634,189009,203611\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,748,0.0136,67682153,338405,339558,38360815,190893,193995\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,752,0.0137,68042153,340205,341358,38702052,192377,222451\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,756,0.0136,68402153,342005,343158,38548177,192033,249435\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,760,0.0138,68762153,343805,344958,39152996,194437,272148\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,764,0.0138,69122153,345605,346758,39070056,194876,204988\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,768,0.0138,69482153,347405,348558,39192485,195337,208507\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,772,0.0139,69842153,349205,350358,39509976,197063,216644\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,776,0.0140,70202153,351005,352158,39643299,197720,238164\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,780,0.0141,70562153,352805,353958,40047395,199611,212284\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,784,0.0142,70922153,354605,355758,40474213,201350,218018\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,788,0.0143,71282153,356405,357558,40369690,200941,270257\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,792,0.0143,71642153,358205,359358,40667289,202430,244792\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,796,0.0145,72002153,360005,361158,41245212,205315,244622\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,800,0.0144,72362153,361805,362958,41042713,204407,249254\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,804,0.0145,72722153,363605,364758,41137099,205254,211445\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,808,0.0145,73082153,365405,366558,41267168,205869,210553\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,812,0.0146,73442153,367205,368358,41538016,207083,242270\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,816,0.0147,73802153,369005,370158,41856937,208198,257079\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,820,0.0149,74162153,370805,371958,42581251,211598,220361\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,824,0.0148,74522153,372605,373758,42106929,210144,214780\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,828,0.0151,74882153,374405,375558,42954101,213100,216189\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,832,0.0150,75242153,376205,377358,42591682,212393,217281\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,836,0.0150,75602153,378005,379158,42833889,213607,225147\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,840,0.0151,75962153,379805,380958,42888365,213833,258282\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,844,0.0151,76322153,381605,382758,43234463,215605,228741\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,848,0.0152,76682153,383405,384558,43340508,216058,240778\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,852,0.0154,77042153,385205,386358,43964132,218702,263707\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,856,0.0155,77402153,387005,388158,43738562,218168,230126\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,860,0.0154,77762153,388805,389958,44071523,219837,238185\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,864,0.0155,78122153,390605,391758,44411093,221177,232408\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,868,0.0157,78482153,392405,393558,44526424,222013,237960\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,872,0.0158,78842153,394205,395358,45188815,224084,346189\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,876,0.0156,79202153,396005,397158,44700630,222996,237268\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,880,0.0158,79562153,397805,398958,45208957,224813,328325\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,884,0.0159,79922153,399605,400758,45474656,226439,239215\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,888,0.0160,80282153,401405,402558,45766475,227867,240911\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,892,0.0160,80642153,403205,404358,45940503,228819,243891\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,896,0.0161,81002153,405005,406158,45973712,229111,241548\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,900,0.0162,81362153,406805,407958,46447521,230613,346027\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,904,0.0163,81722153,408605,409758,46859527,233117,305572\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,908,0.0164,82082153,410405,411558,47123610,234871,284329\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,912,0.0166,82442153,412205,413358,47816182,237201,366650\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,916,0.0166,82802153,414005,415158,47456504,236767,248921\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,920,0.0165,83162153,415805,416958,47592162,237459,265738\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,924,0.0167,83522153,417605,418758,48057683,239541,276783\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,928,0.0167,83882153,419405,420558,48171706,239841,277682\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,932,0.0170,84242153,421205,422358,48721591,242883,245719\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,936,0.0169,84602153,423005,424158,48377712,241387,254877\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,940,0.0169,84962153,424805,425958,48721762,242855,255300\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,944,0.0170,85322153,426605,427758,49035991,243372,370914\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,948,0.0171,85682153,428405,429558,49070436,244800,262067\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,952,0.0171,86042153,430205,431358,49234273,245636,258683\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,956,0.0172,86402153,432005,433158,49586922,247001,316148\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,960,0.0172,86762153,433805,434958,49640943,247637,284307\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,964,0.0177,87122153,435605,436758,51436885,256453,266477\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,968,0.0178,87482153,437405,438558,51146832,254991,267861\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,972,0.0177,87842153,439205,440358,51377929,256333,274159\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,976,0.0179,88202153,441005,442158,51360933,256336,265049\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,980,0.0179,88562153,442805,443958,51845435,258521,293602\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,984,0.0180,88922153,444605,445758,52129373,259818,262711\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,988,0.0181,89282153,446405,447558,52262963,260903,278224\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,992,0.0182,89642153,448205,449358,52407317,261432,272849\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,996,0.0184,90002153,450005,451158,53286503,265403,275404\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,1000,0.0182,90362153,451805,452958,53051777,264487,273734\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,1004,0.0183,90722153,453605,454758,53153647,264834,340140\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,1008,0.0183,91082153,455405,456558,53025643,264711,274578\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,1012,0.0185,91442153,457205,458358,53709439,267192,353247\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,1016,0.0186,91802153,459005,460158,54036527,268786,339099\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,1020,0.0186,92162153,460805,461958,54154888,269844,327020\n",
-      "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n",
-      "200,32,1024,0.0183,92522153,462605,463758,52875104,262839,332332\n",
-      "mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ins_cyc.bin.csv .\n"
-     ]
-    }
-   ],
-   "source": [
-    "!make bench_task1"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Once the run is completed, let's have a look at the data!\n",
-    "\n",
-    "This can be done best in the interactive version of the Jupyter Notebook. In case this version of the description is unavailable to you, call the Makefile target `make graph_task1` (either with X forwarding, or download the resulting PDF)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import seaborn as sns\n",
-    "import pandas as pd\n",
-    "import matplotlib.pyplot as plt\n",
-    "import common\n",
-    "%matplotlib inline\n",
-    "sns.set()\n",
-    "plt.rcParams['figure.figsize'] = [14, 6]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 77,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>iter</th>\n",
-       "      <th>ny</th>\n",
-       "      <th>nx</th>\n",
-       "      <th>Runtime</th>\n",
-       "      <th>PM_INST_CMPL (total)</th>\n",
-       "      <th>PM_INST_CMPL (min)</th>\n",
-       "      <th>PM_INST_CMPL (max)</th>\n",
-       "      <th>PM_RUN_CYC (total)</th>\n",
-       "      <th>PM_RUN_CYC (min)</th>\n",
-       "      <th>PM_RUN_CYC (max)</th>\n",
-       "      <th>Instructions / Loop Iteration</th>\n",
-       "      <th>Cycles / Loop Iteration</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>200</td>\n",
-       "      <td>32</td>\n",
-       "      <td>4</td>\n",
-       "      <td>0.0012</td>\n",
-       "      <td>548153</td>\n",
-       "      <td>2735</td>\n",
-       "      <td>3888</td>\n",
-       "      <td>266883</td>\n",
-       "      <td>1237</td>\n",
-       "      <td>4793</td>\n",
-       "      <td>21.367188</td>\n",
-       "      <td>9.664062</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>200</td>\n",
-       "      <td>32</td>\n",
-       "      <td>8</td>\n",
-       "      <td>0.0014</td>\n",
-       "      <td>1082153</td>\n",
-       "      <td>5405</td>\n",
-       "      <td>6558</td>\n",
-       "      <td>668819</td>\n",
-       "      <td>3214</td>\n",
-       "      <td>6623</td>\n",
-       "      <td>21.113281</td>\n",
-       "      <td>12.554688</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>200</td>\n",
-       "      <td>32</td>\n",
-       "      <td>12</td>\n",
-       "      <td>0.0014</td>\n",
-       "      <td>1442153</td>\n",
-       "      <td>7205</td>\n",
-       "      <td>8358</td>\n",
-       "      <td>872913</td>\n",
-       "      <td>4187</td>\n",
-       "      <td>11640</td>\n",
-       "      <td>18.763021</td>\n",
-       "      <td>10.903646</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>200</td>\n",
-       "      <td>32</td>\n",
-       "      <td>16</td>\n",
-       "      <td>0.0015</td>\n",
-       "      <td>1802153</td>\n",
-       "      <td>9005</td>\n",
-       "      <td>10158</td>\n",
-       "      <td>1077532</td>\n",
-       "      <td>5254</td>\n",
-       "      <td>8147</td>\n",
-       "      <td>17.587891</td>\n",
-       "      <td>10.261719</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>200</td>\n",
-       "      <td>32</td>\n",
-       "      <td>20</td>\n",
-       "      <td>0.0016</td>\n",
-       "      <td>2162153</td>\n",
-       "      <td>10805</td>\n",
-       "      <td>11958</td>\n",
-       "      <td>1277957</td>\n",
-       "      <td>6209</td>\n",
-       "      <td>9015</td>\n",
-       "      <td>16.882812</td>\n",
-       "      <td>9.701562</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   iter  ny  nx  Runtime  PM_INST_CMPL (total)  PM_INST_CMPL (min)  \\\n",
-       "0   200  32   4   0.0012                548153                2735   \n",
-       "1   200  32   8   0.0014               1082153                5405   \n",
-       "2   200  32  12   0.0014               1442153                7205   \n",
-       "3   200  32  16   0.0015               1802153                9005   \n",
-       "4   200  32  20   0.0016               2162153               10805   \n",
-       "\n",
-       "    PM_INST_CMPL (max)  PM_RUN_CYC (total)  PM_RUN_CYC (min)  \\\n",
-       "0                 3888              266883              1237   \n",
-       "1                 6558              668819              3214   \n",
-       "2                 8358              872913              4187   \n",
-       "3                10158             1077532              5254   \n",
-       "4                11958             1277957              6209   \n",
-       "\n",
-       "    PM_RUN_CYC (max)  Instructions / Loop Iteration  Cycles / Loop Iteration  \n",
-       "0               4793                      21.367188                 9.664062  \n",
-       "1               6623                      21.113281                12.554688  \n",
-       "2              11640                      18.763021                10.903646  \n",
-       "3               8147                      17.587891                10.261719  \n",
-       "4               9015                      16.882812                 9.701562  "
-      ]
-     },
-     "execution_count": 77,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "plt.rcParams['figure.figsize'] = [14, 6]\n",
-    "df = pd.read_csv(\"poisson2d.ins_cyc.bin.csv\", skiprows=range(2, 50000, 2))  # Read in the CSV file from the bench run; parse with Pandas\n",
-    "common.normalize(df, \"PM_INST_CMPL (min)\", \"Instructions / Loop Iteration\")  # Normalize to each grid cell\n",
-    "common.normalize(df, \"PM_RUN_CYC (min)\", \"Cycles / Loop Iteration\")\n",
-    "df.head()  # Display the head of the Pandas dataframe"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 78,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/png": "\n",
-      "text/plain": [
-       "<Figure size 1008x432 with 2 Axes>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "# Plot Cycles and Instructions - both per grid cell\n",
-    "fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n",
-    "df.set_index(\"nx\")[\"Cycles / Loop Iteration\"].plot(ax=ax1, legend=True);\n",
-    "df.set_index(\"nx\")[\"Instructions / Loop Iteration\"].plot(ax=ax2, legend=True);"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "What is your result? What value do the graphs come asymptotically close to?\n",
-    "\n",
-    "We will revisit the graph in a little while.\n",
-    "\n",
-    "[Back to top](#toc)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Task 2: Measuring Loads and Stores\n",
-    "<a name=\"task2\"></a>\n",
-    "\n",
-    "Looking at the source code, how many loads and stores from / to memory do you expect? Have a look at the loop which we instrumented.\n",
-    "\n",
-    "Let's compare your estimate to what the system actually does!\n",
-    "\n",
-    "<a name=\"task2-a\"></a>**TASK A**: Please measure counters for loads and stores. See the TODOs in [`poisson2d.ld_st.c`](/edit/Tasks/poisson2d.ld_st.c). This time, implement `PM_LD_CMPL` and `PM_ST_CMPL`.\n",
-    "\n",
-    "Compile with `make task2`, test your program with a single run with `make run_task2`, and then finally submit a benchmarking run to the batch system with `make bench_task2`. The following cell will take care of all this.\n",
-    "\n",
-    "[Back to top](#toc)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "gcc -DUSE_DOUBLE -Ofast -std=c99 -lm -lpapi  poisson2d.ld_st.c -o poisson2d.ld_st.bin\n",
-      "bsub -W 60 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.ld_st.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv\n",
-      "Job <4032> is submitted to default queue <batch>.\n",
-      "<<Waiting for dispatch ...>>\n",
-      "<<Starting on login1>>\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,4,0.0012,95115,474,789,21343,106,249\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,8,0.0014,137115,684,999,33343,166,309\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,12,0.0014,197115,984,1299,45343,226,369\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,16,0.0015,257115,1284,1599,63343,316,459\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,20,0.0016,317115,1584,1899,75343,376,519\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,24,0.0016,377115,1884,2199,93343,466,609\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,28,0.0017,437115,2184,2499,105343,526,669\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,32,0.0017,497115,2484,2799,123343,616,759\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,36,0.0018,557115,2784,3099,135343,676,819\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,40,0.0020,617115,3084,3399,153343,766,909\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,44,0.0019,677115,3384,3699,165343,826,969\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,48,0.0020,737115,3684,3999,183343,916,1059\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,52,0.0021,797115,3984,4299,195343,976,1119\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,56,0.0021,857115,4284,4599,213343,1066,1209\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,60,0.0023,917115,4584,4899,225343,1126,1269\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,64,0.0023,977115,4884,5199,243343,1216,1359\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,68,0.0024,1037115,5184,5499,255343,1276,1419\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,72,0.0025,1097115,5484,5799,273343,1366,1509\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,76,0.0025,1157115,5784,6099,285343,1426,1569\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,80,0.0025,1217115,6084,6399,303343,1516,1659\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,84,0.0026,1277115,6384,6699,315343,1576,1719\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,88,0.0027,1337115,6684,6999,333343,1666,1809\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,92,0.0027,1397115,6984,7299,345343,1726,1869\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,96,0.0028,1457115,7284,7599,363343,1816,1959\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,100,0.0029,1517115,7584,7899,375343,1876,2019\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,104,0.0029,1577115,7884,8199,393343,1966,2109\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,108,0.0030,1637115,8184,8499,405343,2026,2169\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,112,0.0030,1697115,8484,8799,423343,2116,2259\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,116,0.0031,1757115,8784,9099,435343,2176,2319\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,120,0.0033,1817115,9084,9399,453343,2266,2409\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,124,0.0032,1877115,9384,9699,465343,2326,2469\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,128,0.0033,1937115,9684,9999,483343,2416,2559\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,132,0.0034,1997115,9984,10299,495343,2476,2619\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,136,0.0035,2057115,10284,10599,513343,2566,2709\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,140,0.0035,2117115,10584,10899,525343,2626,2769\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,144,0.0036,2177115,10884,11199,543343,2716,2859\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,148,0.0036,2237115,11184,11499,555343,2776,2919\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,152,0.0037,2297115,11484,11799,573343,2866,3009\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,156,0.0038,2357115,11784,12099,585343,2926,3069\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,160,0.0038,2417115,12084,12399,603343,3016,3159\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,164,0.0039,2477115,12384,12699,615343,3076,3219\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,168,0.0039,2537115,12684,12999,633343,3166,3309\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,172,0.0040,2597115,12984,13299,645343,3226,3369\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,176,0.0041,2657115,13284,13599,663343,3316,3459\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,180,0.0041,2717115,13584,13899,675343,3376,3519\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,184,0.0042,2777115,13884,14199,693343,3466,3609\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,188,0.0043,2837115,14184,14499,705343,3526,3669\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,192,0.0043,2897115,14484,14799,723343,3616,3759\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,196,0.0044,2957115,14784,15099,735343,3676,3819\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,200,0.0045,3017115,15084,15399,753343,3766,3909\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,204,0.0045,3077115,15384,15699,765343,3826,3969\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,208,0.0046,3137115,15684,15999,783343,3916,4059\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,212,0.0047,3197115,15984,16299,795343,3976,4119\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,216,0.0047,3257115,16284,16599,813343,4066,4209\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,220,0.0048,3317115,16584,16899,825343,4126,4269\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,224,0.0049,3377115,16884,17199,843343,4216,4359\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,228,0.0049,3437115,17184,17499,855343,4276,4419\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,232,0.0050,3497115,17484,17799,873343,4366,4509\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,236,0.0051,3557115,17784,18099,885343,4426,4569\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,240,0.0052,3617115,18084,18399,903343,4516,4659\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,244,0.0052,3677115,18384,18699,915343,4576,4719\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,248,0.0052,3737115,18684,18999,933343,4666,4809\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,252,0.0054,3797115,18984,19299,945343,4726,4869\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,256,0.0054,3857115,19284,19599,963343,4816,4959\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,260,0.0054,3917115,19584,19899,975343,4876,5019\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,264,0.0055,3977115,19884,20199,993343,4966,5109\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,268,0.0056,4037115,20184,20499,1005343,5026,5169\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,272,0.0056,4097115,20484,20799,1023343,5116,5259\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,276,0.0057,4157115,20784,21099,1035343,5176,5319\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,280,0.0057,4217115,21084,21399,1053343,5266,5409\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,284,0.0058,4277115,21384,21699,1065343,5326,5469\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,288,0.0059,4337115,21684,21999,1083343,5416,5559\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,292,0.0059,4397115,21984,22299,1095343,5476,5619\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,296,0.0061,4457115,22284,22599,1113343,5566,5709\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,300,0.0061,4517115,22584,22899,1125343,5626,5769\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,304,0.0061,4577115,22884,23199,1143343,5716,5859\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,308,0.0062,4637115,23184,23499,1155343,5776,5919\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,312,0.0063,4697115,23484,23799,1173343,5866,6009\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,316,0.0064,4757115,23784,24099,1185343,5926,6069\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,320,0.0064,4817115,24084,24399,1203343,6016,6159\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,324,0.0065,4877115,24384,24699,1215343,6076,6219\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,328,0.0065,4937115,24684,24999,1233343,6166,6309\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,332,0.0066,4997115,24984,25299,1245343,6226,6369\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,336,0.0066,5057115,25284,25599,1263343,6316,6459\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,340,0.0068,5117115,25584,25899,1275343,6376,6519\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,344,0.0068,5177115,25884,26199,1293343,6466,6609\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,348,0.0069,5237115,26184,26499,1305343,6526,6669\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,352,0.0071,5297115,26484,26799,1323343,6616,6759\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,356,0.0070,5357115,26784,27099,1335343,6676,6819\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,360,0.0070,5417115,27084,27399,1353343,6766,6909\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,364,0.0071,5477115,27384,27699,1365343,6826,6969\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,368,0.0072,5537115,27684,27999,1383343,6916,7059\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,372,0.0073,5597115,27984,28299,1395343,6976,7119\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,376,0.0073,5657115,28284,28599,1413343,7066,7209\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,380,0.0074,5717115,28584,28899,1425343,7126,7269\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,384,0.0074,5777115,28884,29199,1443343,7216,7359\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,388,0.0075,5837115,29184,29499,1455343,7276,7419\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,392,0.0076,5897115,29484,29799,1473343,7366,7509\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,396,0.0076,5957115,29784,30099,1485343,7426,7569\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,400,0.0078,6017115,30084,30399,1503343,7516,7659\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,404,0.0078,6077115,30384,30699,1515343,7576,7719\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,408,0.0078,6137115,30684,30999,1533343,7666,7809\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,412,0.0079,6197115,30984,31299,1545343,7726,7869\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,416,0.0080,6257115,31284,31599,1563343,7816,7959\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,420,0.0080,6317115,31584,31899,1575343,7876,8019\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,424,0.0081,6377115,31884,32199,1593343,7966,8109\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,428,0.0081,6437115,32184,32499,1605343,8026,8169\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,432,0.0082,6497115,32484,32799,1623343,8116,8259\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,436,0.0083,6557115,32784,33099,1635343,8176,8319\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,440,0.0083,6617115,33084,33399,1653343,8266,8409\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,444,0.0084,6677115,33384,33699,1665343,8326,8469\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,448,0.0085,6737115,33684,33999,1683343,8416,8559\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,452,0.0085,6797115,33984,34299,1695343,8476,8619\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,456,0.0086,6857115,34284,34599,1713343,8566,8709\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,460,0.0087,6917115,34584,34899,1725343,8626,8769\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,464,0.0088,6977115,34884,35199,1743343,8716,8859\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,468,0.0088,7037115,35184,35499,1755343,8776,8919\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,472,0.0089,7097115,35484,35799,1773343,8866,9009\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,476,0.0090,7157115,35784,36099,1785343,8926,9069\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,480,0.0090,7217115,36084,36399,1803343,9016,9159\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,484,0.0091,7277115,36384,36699,1815343,9076,9219\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,488,0.0091,7337115,36684,36999,1833343,9166,9309\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,492,0.0092,7397115,36984,37299,1845343,9226,9369\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,496,0.0093,7457115,37284,37599,1863343,9316,9459\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,500,0.0093,7517115,37584,37899,1875343,9376,9519\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,504,0.0094,7577115,37884,38199,1893343,9466,9609\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,508,0.0095,7637115,38184,38499,1905343,9526,9669\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,512,0.0095,7697115,38484,38799,1923343,9616,9759\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,516,0.0096,7757115,38784,39099,1938343,9691,9834\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,520,0.0097,7817115,39084,39399,1953343,9766,9909\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,524,0.0097,7877115,39384,39699,1968343,9841,9984\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,528,0.0098,7937115,39684,39999,1983343,9916,10059\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,532,0.0099,7997115,39984,40299,1998343,9991,10134\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,536,0.0100,8057115,40284,40599,2013343,10066,10209\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,540,0.0101,8117115,40584,40899,2028343,10141,10284\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,544,0.0101,8177115,40884,41199,2043343,10216,10359\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,548,0.0102,8237115,41184,41499,2058343,10291,10434\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,552,0.0103,8297115,41484,41799,2073343,10366,10509\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,556,0.0104,8357115,41784,42099,2088343,10441,10584\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,560,0.0104,8417115,42084,42399,2103343,10516,10659\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,564,0.0105,8477115,42384,42699,2118343,10591,10734\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,568,0.0106,8537115,42684,42999,2133343,10666,10809\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,572,0.0106,8597115,42984,43299,2148343,10741,10884\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,576,0.0107,8657115,43284,43599,2163343,10816,10959\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,580,0.0109,8717115,43584,43899,2178343,10891,11034\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,584,0.0108,8777115,43884,44199,2193343,10966,11109\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,588,0.0110,8837115,44184,44499,2208343,11041,11184\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,592,0.0110,8897115,44484,44799,2223343,11116,11259\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,596,0.0111,8957115,44784,45099,2238343,11191,11334\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,600,0.0111,9017115,45084,45399,2253343,11266,11409\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,604,0.0112,9077115,45384,45699,2268343,11341,11484\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,608,0.0113,9137115,45684,45999,2283343,11416,11559\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,612,0.0113,9197115,45984,46299,2298343,11491,11634\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,616,0.0114,9257115,46284,46599,2313343,11566,11709\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,620,0.0115,9317115,46584,46899,2328343,11641,11784\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,624,0.0115,9377115,46884,47199,2343343,11716,11859\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,628,0.0115,9437115,47184,47499,2358343,11791,11934\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,632,0.0117,9497115,47484,47799,2373343,11866,12009\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,636,0.0118,9557115,47784,48099,2388343,11941,12084\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,640,0.0119,9617115,48084,48399,2403343,12016,12159\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,644,0.0118,9677115,48384,48699,2418343,12091,12234\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,648,0.0119,9737115,48684,48999,2433343,12166,12309\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,652,0.0121,9797115,48984,49299,2448343,12241,12384\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,656,0.0121,9857115,49284,49599,2463343,12316,12459\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,660,0.0122,9917115,49584,49899,2478343,12391,12534\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,664,0.0122,9977115,49884,50199,2493343,12466,12609\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,668,0.0123,10037115,50184,50499,2508343,12541,12684\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,672,0.0123,10097115,50484,50799,2523343,12616,12759\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,676,0.0125,10157115,50784,51099,2538343,12691,12834\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,680,0.0124,10217115,51084,51399,2553343,12766,12909\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,684,0.0125,10277115,51384,51699,2568343,12841,12984\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,688,0.0126,10337115,51684,51999,2583343,12916,13059\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,692,0.0126,10397115,51984,52299,2598343,12991,13134\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,696,0.0127,10457115,52284,52599,2613343,13066,13209\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,700,0.0128,10517115,52584,52899,2628343,13141,13284\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,704,0.0129,10577115,52884,53199,2643343,13216,13359\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,708,0.0129,10637115,53184,53499,2658343,13291,13434\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,712,0.0129,10697115,53484,53799,2673343,13366,13509\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,716,0.0130,10757115,53784,54099,2688343,13441,13584\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,720,0.0130,10817115,54084,54399,2703343,13516,13659\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,724,0.0132,10877115,54384,54699,2718343,13591,13734\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,728,0.0131,10937115,54684,54999,2733343,13666,13809\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,732,0.0133,10997115,54984,55299,2748343,13741,13884\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,736,0.0135,11057115,55284,55599,2763343,13816,13959\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,740,0.0134,11117115,55584,55899,2778343,13891,14034\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,744,0.0134,11177115,55884,56199,2793343,13966,14109\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,748,0.0135,11237115,56184,56499,2808343,14041,14184\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,752,0.0136,11297115,56484,56799,2823343,14116,14259\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,756,0.0136,11357115,56784,57099,2838343,14191,14334\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,760,0.0138,11417115,57084,57399,2853343,14266,14409\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,764,0.0139,11477115,57384,57699,2868343,14341,14484\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,768,0.0138,11537115,57684,57999,2883343,14416,14559\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,772,0.0140,11597115,57984,58299,2898343,14491,14634\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,776,0.0140,11657115,58284,58599,2913343,14566,14709\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,780,0.0142,11717115,58584,58899,2928343,14641,14784\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,784,0.0141,11777115,58884,59199,2943343,14716,14859\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,788,0.0143,11837115,59184,59499,2958343,14791,14934\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,792,0.0143,11897115,59484,59799,2973343,14866,15009\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,796,0.0146,11957115,59784,60099,2988343,14941,15084\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,800,0.0144,12017115,60084,60399,3003343,15016,15159\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,804,0.0145,12077115,60384,60699,3018343,15091,15234\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,808,0.0146,12137115,60684,60999,3033343,15166,15309\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,812,0.0146,12197115,60984,61299,3048343,15241,15384\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,816,0.0146,12257115,61284,61599,3063343,15316,15459\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,820,0.0148,12317115,61584,61899,3078343,15391,15534\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,824,0.0149,12377115,61884,62199,3093343,15466,15609\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,828,0.0149,12437115,62184,62499,3108343,15541,15684\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,832,0.0149,12497115,62484,62799,3123343,15616,15759\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,836,0.0151,12557115,62784,63099,3138343,15691,15834\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,840,0.0150,12617115,63084,63399,3153343,15766,15909\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,844,0.0152,12677115,63384,63699,3168343,15841,15984\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,848,0.0152,12737115,63684,63999,3183343,15916,16059\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,852,0.0153,12797115,63984,64299,3198343,15991,16134\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,856,0.0153,12857115,64284,64599,3213343,16066,16209\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,860,0.0155,12917115,64584,64899,3228343,16141,16284\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,864,0.0156,12977115,64884,65199,3243343,16216,16359\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,868,0.0157,13037115,65184,65499,3258343,16291,16434\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,872,0.0156,13097115,65484,65799,3273343,16366,16509\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,876,0.0157,13157115,65784,66099,3288343,16441,16584\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,880,0.0158,13217115,66084,66399,3303343,16516,16659\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,884,0.0158,13277115,66384,66699,3318343,16591,16734\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,888,0.0159,13337115,66684,66999,3333343,16666,16809\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,892,0.0160,13397115,66984,67299,3348343,16741,16884\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,896,0.0161,13457115,67284,67599,3363343,16816,16959\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,900,0.0162,13517115,67584,67899,3378343,16891,17034\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,904,0.0163,13577115,67884,68199,3393343,16966,17109\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,908,0.0164,13637115,68184,68499,3408343,17041,17184\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,912,0.0165,13697115,68484,68799,3423343,17116,17259\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,916,0.0165,13757115,68784,69099,3438343,17191,17334\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,920,0.0165,13817115,69084,69399,3453343,17266,17409\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,924,0.0168,13877115,69384,69699,3468343,17341,17484\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,928,0.0167,13937115,69684,69999,3483343,17416,17559\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,932,0.0169,13997115,69984,70299,3498343,17491,17634\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,936,0.0168,14057115,70284,70599,3513343,17566,17709\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,940,0.0169,14117115,70584,70899,3528343,17641,17784\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,944,0.0169,14177115,70884,71199,3543343,17716,17859\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,948,0.0170,14237115,71184,71499,3558343,17791,17934\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,952,0.0171,14297115,71484,71799,3573343,17866,18009\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,956,0.0173,14357115,71784,72099,3588343,17941,18084\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,960,0.0172,14417115,72084,72399,3603343,18016,18159\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,964,0.0177,14477115,72384,72699,3618343,18091,18234\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,968,0.0177,14537115,72684,72999,3633343,18166,18309\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,972,0.0177,14597115,72984,73299,3648343,18241,18384\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,976,0.0179,14657115,73284,73599,3663343,18316,18459\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,980,0.0180,14717115,73584,73899,3678343,18391,18534\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,984,0.0180,14777115,73884,74199,3693343,18466,18609\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,988,0.0180,14837115,74184,74499,3708343,18541,18684\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,992,0.0181,14897115,74484,74799,3723343,18616,18759\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,996,0.0184,14957115,74784,75099,3738343,18691,18834\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,1000,0.0182,15017115,75084,75399,3753343,18766,18909\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,1004,0.0183,15077115,75384,75699,3768343,18841,18984\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,1008,0.0184,15137115,75684,75999,3783343,18916,19059\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,1012,0.0185,15197115,75984,76299,3798343,18991,19134\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,1016,0.0185,15257115,76284,76599,3813343,19066,19209\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,1020,0.0186,15317115,76584,76899,3828343,19141,19284\n",
-      "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n",
-      "200,32,1024,0.0183,15377115,76884,77199,3843343,19216,19359\n",
-      "mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.ld_st.bin.csv .\n"
-     ]
-    }
-   ],
-   "source": [
-    "!make bench_task2"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Once the run has finished, let's plot it again with the following cell (non-interactive: `make graph_task2a`)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>iter</th>\n",
-       "      <th>ny</th>\n",
-       "      <th>nx</th>\n",
-       "      <th>Runtime</th>\n",
-       "      <th>PM_LD_CMPL (total)</th>\n",
-       "      <th>PM_LD_CMPL (min)</th>\n",
-       "      <th>PM_LD_CMPL (max)</th>\n",
-       "      <th>PM_ST_CMPL (total)</th>\n",
-       "      <th>PM_ST_CMPL (min)</th>\n",
-       "      <th>PM_ST_CMPL (max)</th>\n",
-       "      <th>Loads / Loop Iteration</th>\n",
-       "      <th>Stores / Loop Iteration</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>200</td>\n",
-       "      <td>32</td>\n",
-       "      <td>4</td>\n",
-       "      <td>0.0012</td>\n",
-       "      <td>95115</td>\n",
-       "      <td>474</td>\n",
-       "      <td>789</td>\n",
-       "      <td>21343</td>\n",
-       "      <td>106</td>\n",
-       "      <td>249</td>\n",
-       "      <td>3.703125</td>\n",
-       "      <td>0.828125</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>200</td>\n",
-       "      <td>32</td>\n",
-       "      <td>8</td>\n",
-       "      <td>0.0014</td>\n",
-       "      <td>137115</td>\n",
-       "      <td>684</td>\n",
-       "      <td>999</td>\n",
-       "      <td>33343</td>\n",
-       "      <td>166</td>\n",
-       "      <td>309</td>\n",
-       "      <td>2.671875</td>\n",
-       "      <td>0.648438</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>200</td>\n",
-       "      <td>32</td>\n",
-       "      <td>12</td>\n",
-       "      <td>0.0014</td>\n",
-       "      <td>197115</td>\n",
-       "      <td>984</td>\n",
-       "      <td>1299</td>\n",
-       "      <td>45343</td>\n",
-       "      <td>226</td>\n",
-       "      <td>369</td>\n",
-       "      <td>2.562500</td>\n",
-       "      <td>0.588542</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>200</td>\n",
-       "      <td>32</td>\n",
-       "      <td>16</td>\n",
-       "      <td>0.0015</td>\n",
-       "      <td>257115</td>\n",
-       "      <td>1284</td>\n",
-       "      <td>1599</td>\n",
-       "      <td>63343</td>\n",
-       "      <td>316</td>\n",
-       "      <td>459</td>\n",
-       "      <td>2.507812</td>\n",
-       "      <td>0.617188</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>200</td>\n",
-       "      <td>32</td>\n",
-       "      <td>20</td>\n",
-       "      <td>0.0016</td>\n",
-       "      <td>317115</td>\n",
-       "      <td>1584</td>\n",
-       "      <td>1899</td>\n",
-       "      <td>75343</td>\n",
-       "      <td>376</td>\n",
-       "      <td>519</td>\n",
-       "      <td>2.475000</td>\n",
-       "      <td>0.587500</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   iter  ny  nx  Runtime  PM_LD_CMPL (total)  PM_LD_CMPL (min)  \\\n",
-       "0   200  32   4   0.0012               95115               474   \n",
-       "1   200  32   8   0.0014              137115               684   \n",
-       "2   200  32  12   0.0014              197115               984   \n",
-       "3   200  32  16   0.0015              257115              1284   \n",
-       "4   200  32  20   0.0016              317115              1584   \n",
-       "\n",
-       "    PM_LD_CMPL (max)  PM_ST_CMPL (total)  PM_ST_CMPL (min)   PM_ST_CMPL (max)  \\\n",
-       "0                789               21343               106                249   \n",
-       "1                999               33343               166                309   \n",
-       "2               1299               45343               226                369   \n",
-       "3               1599               63343               316                459   \n",
-       "4               1899               75343               376                519   \n",
-       "\n",
-       "   Loads / Loop Iteration  Stores / Loop Iteration  \n",
-       "0                3.703125                 0.828125  \n",
-       "1                2.671875                 0.648438  \n",
-       "2                2.562500                 0.588542  \n",
-       "3                2.507812                 0.617188  \n",
-       "4                2.475000                 0.587500  "
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df_ldst = pd.read_csv(\"poisson2d.ld_st.bin.csv\", skiprows=range(2, 50000, 2))\n",
-    "common.normalize(df_ldst, \"PM_LD_CMPL (min)\", \"Loads / Loop Iteration\")\n",
-    "common.normalize(df_ldst, \"PM_ST_CMPL (min)\", \"Stores / Loop Iteration\")\n",
-    "df_ldst.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 79,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/png": "\n",
-      "text/plain": [
-       "<Figure size 1008x432 with 2 Axes>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n",
-    "df_ldst.set_index(\"nx\")[\"Loads / Loop Iteration\"].plot(ax=ax1, legend=True);\n",
-    "df_ldst.set_index(\"nx\")[\"Stores / Loop Iteration\"].plot(ax=ax2, legend=True);"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Did you expect more?\n",
-    "\n",
-    "The reason is simple: Among the load and store instructions counted by `PM_LD_CMPL` and `PM_ST_CMPL` are vector instructions which can load and store multiple (two) values at a time. To see how many *bytes* are loaded and stored, we need to measure counters for vectorized loads and stores as well.\n",
-    "\n",
-    "<a name=\"task2-b\"></a>**TASK B**: Please measure counters for _vectorized_ loads and _vectorized_ stores. See the TODOs in [`poisson2d.vld.c`](/edit/Tasks/poisson2d.vld.c) and [`poisson2d.vst.c`](/edit/Tasks/poisson2d.vst.c) (*Note: These vector counters cannot be measured together and need separate files and runs*). Can you find out the names of the counters yourself, using `papi_native_avail | grep VECTOR_`?\n",
-    "\n",
-    "Compile, test, and bench-run your program again.\n",
-    "\n",
-    "[Back to top](#toc)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "| PM_VECTOR_FLOP_CMPL                                                          |\r\n",
-      "| PM_VECTOR_LD_CMPL                                                            |\r\n",
-      "| PM_VECTOR_ST_CMPL                                                            |\r\n"
-     ]
-    }
-   ],
-   "source": [
-    "!papi_native_avail | grep VECTOR_"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "`make bench_task3` will submit benchmark runs of both vectorized counters to the batch system (as two subsequent runs of the individual files)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "bsub -W 60 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vld.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.vld.bin.csv\n",
-      "Job <4097> is submitted to default queue <batch>.\n",
-      "<<Waiting for dispatch ...>>\n",
-      "<<Starting on login1>>\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,4,0.0010,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,8,0.0011,114000,570,570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,12,0.0012,174000,870,870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,16,0.0013,234000,1170,1170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,20,0.0014,294000,1470,1470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,24,0.0014,354000,1770,1770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,28,0.0014,414000,2070,2070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,32,0.0015,474000,2370,2370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,36,0.0016,534000,2670,2670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,40,0.0016,594000,2970,2970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,44,0.0017,654000,3270,3270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,48,0.0017,714000,3570,3570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,52,0.0018,774000,3870,3870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,56,0.0020,834000,4170,4170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,60,0.0020,894000,4470,4470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,64,0.0021,954000,4770,4770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,68,0.0022,1014000,5070,5070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,72,0.0022,1074000,5370,5370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,76,0.0023,1134000,5670,5670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,80,0.0023,1194000,5970,5970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,84,0.0023,1254000,6270,6270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,88,0.0024,1314000,6570,6570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,92,0.0025,1374000,6870,6870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,96,0.0025,1434000,7170,7170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,100,0.0026,1494000,7470,7470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,104,0.0027,1554000,7770,7770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,108,0.0027,1614000,8070,8070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,112,0.0028,1674000,8370,8370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,116,0.0028,1734000,8670,8670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,120,0.0029,1794000,8970,8970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,124,0.0030,1854000,9270,9270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,128,0.0030,1914000,9570,9570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,132,0.0031,1974000,9870,9870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,136,0.0032,2034000,10170,10170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,140,0.0032,2094000,10470,10470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,144,0.0033,2154000,10770,10770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,148,0.0034,2214000,11070,11070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,152,0.0035,2274000,11370,11370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,156,0.0035,2334000,11670,11670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,160,0.0036,2394000,11970,11970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,164,0.0036,2454000,12270,12270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,168,0.0037,2514000,12570,12570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,172,0.0037,2574000,12870,12870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,176,0.0038,2634000,13170,13170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,180,0.0039,2694000,13470,13470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,184,0.0041,2754000,13770,13770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,188,0.0040,2814000,14070,14070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,192,0.0041,2874000,14370,14370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,196,0.0041,2934000,14670,14670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,200,0.0042,2994000,14970,14970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,204,0.0043,3054000,15270,15270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,208,0.0044,3114000,15570,15570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,212,0.0044,3174000,15870,15870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,216,0.0044,3234000,16170,16170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,220,0.0045,3294000,16470,16470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,224,0.0046,3354000,16770,16770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,228,0.0047,3414000,17070,17070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,232,0.0047,3474000,17370,17370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,236,0.0048,3534000,17670,17670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,240,0.0048,3594000,17970,17970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,244,0.0049,3654000,18270,18270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,248,0.0049,3714000,18570,18570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,252,0.0050,3774000,18870,18870\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,256,0.0051,3834000,19170,19170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,260,0.0052,3894000,19470,19470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,264,0.0052,3954000,19770,19770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,268,0.0053,4014000,20070,20070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,272,0.0053,4074000,20370,20370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,276,0.0055,4134000,20670,20670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,280,0.0055,4194000,20970,20970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,284,0.0055,4254000,21270,21270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,288,0.0057,4314000,21570,21570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,292,0.0056,4374000,21870,21870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,296,0.0057,4434000,22170,22170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,300,0.0059,4494000,22470,22470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,304,0.0059,4554000,22770,22770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,308,0.0060,4614000,23070,23070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,312,0.0060,4674000,23370,23370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,316,0.0061,4734000,23670,23670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,320,0.0061,4794000,23970,23970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,324,0.0062,4854000,24270,24270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,328,0.0062,4914000,24570,24570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,332,0.0063,4974000,24870,24870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,336,0.0063,5034000,25170,25170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,340,0.0066,5094000,25470,25470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,344,0.0065,5154000,25770,25770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,348,0.0067,5214000,26070,26070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,352,0.0068,5274000,26370,26370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,356,0.0067,5334000,26670,26670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,360,0.0067,5394000,26970,26970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,364,0.0068,5454000,27270,27270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,368,0.0069,5514000,27570,27570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,372,0.0069,5574000,27870,27870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,376,0.0070,5634000,28170,28170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,380,0.0071,5694000,28470,28470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,384,0.0071,5754000,28770,28770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,388,0.0073,5814000,29070,29070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,392,0.0074,5874000,29370,29370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,396,0.0073,5934000,29670,29670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,400,0.0074,5994000,29970,29970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,404,0.0074,6054000,30270,30270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,408,0.0075,6114000,30570,30570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,412,0.0076,6174000,30870,30870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,416,0.0076,6234000,31170,31170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,420,0.0080,6294000,31470,31470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,424,0.0079,6354000,31770,31770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,428,0.0078,6414000,32070,32070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,432,0.0079,6474000,32370,32370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,436,0.0080,6534000,32670,32670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,440,0.0080,6594000,32970,32970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,444,0.0083,6654000,33270,33270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,448,0.0082,6714000,33570,33570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,452,0.0082,6774000,33870,33870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,456,0.0083,6834000,34170,34170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,460,0.0086,6894000,34470,34470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,464,0.0084,6954000,34770,34770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,468,0.0085,7014000,35070,35070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,472,0.0086,7074000,35370,35370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,476,0.0086,7134000,35670,35670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,480,0.0087,7194000,35970,35970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,484,0.0088,7254000,36270,36270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,488,0.0088,7314000,36570,36570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,492,0.0089,7374000,36870,36870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,496,0.0091,7434000,37170,37170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,500,0.0092,7494000,37470,37470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,504,0.0091,7554000,37770,37770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,508,0.0092,7614000,38070,38070\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,512,0.0092,7674000,38370,38370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,516,0.0093,7734000,38670,38670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,520,0.0093,7794000,38970,38970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,524,0.0094,7854000,39270,39270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,528,0.0097,7914000,39570,39570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,532,0.0095,7974000,39870,39870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,536,0.0096,8034000,40170,40170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,540,0.0097,8094000,40470,40470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,544,0.0097,8154000,40770,40770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,548,0.0099,8214000,41070,41070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,552,0.0099,8274000,41370,41370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,556,0.0100,8334000,41670,41670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,560,0.0100,8394000,41970,41970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,564,0.0101,8454000,42270,42270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,568,0.0102,8514000,42570,42570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,572,0.0103,8574000,42870,42870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,576,0.0103,8634000,43170,43170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,580,0.0104,8694000,43470,43470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,584,0.0104,8754000,43770,43770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,588,0.0106,8814000,44070,44070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,592,0.0106,8874000,44370,44370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,596,0.0107,8934000,44670,44670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,600,0.0107,8994000,44970,44970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,604,0.0109,9054000,45270,45270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,608,0.0109,9114000,45570,45570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,612,0.0110,9174000,45870,45870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,616,0.0110,9234000,46170,46170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,620,0.0111,9294000,46470,46470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,624,0.0112,9354000,46770,46770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,628,0.0112,9414000,47070,47070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,632,0.0113,9474000,47370,47370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,636,0.0114,9534000,47670,47670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,640,0.0115,9594000,47970,47970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,644,0.0115,9654000,48270,48270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,648,0.0115,9714000,48570,48570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,652,0.0116,9774000,48870,48870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,656,0.0118,9834000,49170,49170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,660,0.0117,9894000,49470,49470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,664,0.0118,9954000,49770,49770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,668,0.0118,10014000,50070,50070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,672,0.0120,10074000,50370,50370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,676,0.0121,10134000,50670,50670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,680,0.0120,10194000,50970,50970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,684,0.0121,10254000,51270,51270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,688,0.0123,10314000,51570,51570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,692,0.0122,10374000,51870,51870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,696,0.0123,10434000,52170,52170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,700,0.0124,10494000,52470,52470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,704,0.0124,10554000,52770,52770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,708,0.0125,10614000,53070,53070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,712,0.0126,10674000,53370,53370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,716,0.0126,10734000,53670,53670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,720,0.0126,10794000,53970,53970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,724,0.0128,10854000,54270,54270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,728,0.0128,10914000,54570,54570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,732,0.0129,10974000,54870,54870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,736,0.0130,11034000,55170,55170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,740,0.0130,11094000,55470,55470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,744,0.0130,11154000,55770,55770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,748,0.0131,11214000,56070,56070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,752,0.0132,11274000,56370,56370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,756,0.0133,11334000,56670,56670\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,760,0.0134,11394000,56970,56970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,764,0.0134,11454000,57270,57270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,768,0.0135,11514000,57570,57570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,772,0.0135,11574000,57870,57870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,776,0.0136,11634000,58170,58170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,780,0.0138,11694000,58470,58470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,784,0.0138,11754000,58770,58770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,788,0.0139,11814000,59070,59070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,792,0.0139,11874000,59370,59370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,796,0.0141,11934000,59670,59670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,800,0.0140,11994000,59970,59970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,804,0.0141,12054000,60270,60270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,808,0.0142,12114000,60570,60570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,812,0.0143,12174000,60870,60870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,816,0.0143,12234000,61170,61170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,820,0.0143,12294000,61470,61470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,824,0.0144,12354000,61770,61770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,828,0.0145,12414000,62070,62070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,832,0.0145,12474000,62370,62370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,836,0.0146,12534000,62670,62670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,840,0.0146,12594000,62970,62970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,844,0.0147,12654000,63270,63270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,848,0.0148,12714000,63570,63570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,852,0.0149,12774000,63870,63870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,856,0.0150,12834000,64170,64170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,860,0.0150,12894000,64470,64470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,864,0.0151,12954000,64770,64770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,868,0.0152,13014000,65070,65070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,872,0.0151,13074000,65370,65370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,876,0.0152,13134000,65670,65670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,880,0.0154,13194000,65970,65970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,884,0.0154,13254000,66270,66270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,888,0.0154,13314000,66570,66570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,892,0.0155,13374000,66870,66870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,896,0.0156,13434000,67170,67170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,900,0.0158,13494000,67470,67470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,904,0.0158,13554000,67770,67770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,908,0.0159,13614000,68070,68070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,912,0.0161,13674000,68370,68370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,916,0.0162,13734000,68670,68670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,920,0.0162,13794000,68970,68970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,924,0.0163,13854000,69270,69270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,928,0.0162,13914000,69570,69570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,932,0.0164,13974000,69870,69870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,936,0.0163,14034000,70170,70170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,940,0.0164,14094000,70470,70470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,944,0.0165,14154000,70770,70770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,948,0.0166,14214000,71070,71070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,952,0.0166,14274000,71370,71370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,956,0.0170,14334000,71670,71670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,960,0.0168,14394000,71970,71970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,964,0.0174,14454000,72270,72270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,968,0.0172,14514000,72570,72570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,972,0.0173,14574000,72870,72870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,976,0.0173,14634000,73170,73170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,980,0.0175,14694000,73470,73470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,984,0.0175,14754000,73770,73770\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,988,0.0176,14814000,74070,74070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,992,0.0176,14874000,74370,74370\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,996,0.0178,14934000,74670,74670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,1000,0.0179,14994000,74970,74970\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,1004,0.0178,15054000,75270,75270\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,1008,0.0179,15114000,75570,75570\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,1012,0.0179,15174000,75870,75870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,1016,0.0181,15234000,76170,76170\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,1020,0.0181,15294000,76470,76470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n",
-      "200,32,1024,0.0179,15354000,76770,76770\n",
-      "mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vld.bin.csv .\n",
-      "bsub -W 60 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vst.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.vst.bin.csv\n",
-      "Job <4098> is submitted to default queue <batch>.\n",
-      "<<Waiting for dispatch ...>>\n",
-      "<<Starting on login1>>\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,4,0.0010,200,1,1\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,8,0.0011,18200,91,91\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,12,0.0012,30200,151,151\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,16,0.0012,42200,211,211\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,20,0.0013,54200,271,271\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,24,0.0014,66200,331,331\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,28,0.0014,78200,391,391\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,32,0.0016,90200,451,451\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,36,0.0015,102200,511,511\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,40,0.0016,114200,571,571\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,44,0.0017,126200,631,631\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,48,0.0017,138200,691,691\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,52,0.0018,150200,751,751\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,56,0.0019,162200,811,811\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,60,0.0020,174200,871,871\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,64,0.0022,186200,931,931\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,68,0.0022,198200,991,991\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,72,0.0021,210200,1051,1051\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,76,0.0023,222200,1111,1111\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,80,0.0023,234200,1171,1171\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,84,0.0023,246200,1231,1231\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,88,0.0024,258200,1291,1291\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,92,0.0025,270200,1351,1351\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,96,0.0027,282200,1411,1411\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,100,0.0026,294200,1471,1471\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,104,0.0027,306200,1531,1531\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,108,0.0027,318200,1591,1591\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,112,0.0028,330200,1651,1651\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,116,0.0028,342200,1711,1711\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,120,0.0030,354200,1771,1771\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,124,0.0030,366200,1831,1831\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,128,0.0030,378200,1891,1891\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,132,0.0032,390200,1951,1951\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,136,0.0032,402200,2011,2011\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,140,0.0032,414200,2071,2071\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,144,0.0033,426200,2131,2131\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,148,0.0033,438200,2191,2191\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,152,0.0034,450200,2251,2251\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,156,0.0035,462200,2311,2311\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,160,0.0036,474200,2371,2371\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,164,0.0036,486200,2431,2431\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,168,0.0037,498200,2491,2491\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,172,0.0037,510200,2551,2551\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,176,0.0039,522200,2611,2611\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,180,0.0039,534200,2671,2671\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,184,0.0039,546200,2731,2731\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,188,0.0040,558200,2791,2791\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,192,0.0040,570200,2851,2851\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,196,0.0041,582200,2911,2911\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,200,0.0042,594200,2971,2971\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,204,0.0042,606200,3031,3031\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,208,0.0043,618200,3091,3091\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,212,0.0044,630200,3151,3151\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,216,0.0044,642200,3211,3211\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,220,0.0046,654200,3271,3271\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,224,0.0046,666200,3331,3331\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,228,0.0046,678200,3391,3391\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,232,0.0047,690200,3451,3451\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,236,0.0047,702200,3511,3511\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,240,0.0048,714200,3571,3571\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,244,0.0049,726200,3631,3631\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,248,0.0049,738200,3691,3691\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,252,0.0050,750200,3751,3751\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,256,0.0051,762200,3811,3811\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,260,0.0051,774200,3871,3871\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,264,0.0053,786200,3931,3931\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,268,0.0053,798200,3991,3991\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,272,0.0054,810200,4051,4051\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,276,0.0055,822200,4111,4111\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,280,0.0055,834200,4171,4171\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,284,0.0055,846200,4231,4231\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,288,0.0056,858200,4291,4291\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,292,0.0057,870200,4351,4351\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,296,0.0057,882200,4411,4411\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,300,0.0058,894200,4471,4471\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,304,0.0058,906200,4531,4531\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,308,0.0059,918200,4591,4591\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,312,0.0060,930200,4651,4651\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,316,0.0060,942200,4711,4711\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,320,0.0061,954200,4771,4771\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,324,0.0061,966200,4831,4831\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,328,0.0062,978200,4891,4891\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,332,0.0063,990200,4951,4951\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,336,0.0063,1002200,5011,5011\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,340,0.0064,1014200,5071,5071\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,344,0.0065,1026200,5131,5131\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,348,0.0066,1038200,5191,5191\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,352,0.0066,1050200,5251,5251\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,356,0.0067,1062200,5311,5311\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,360,0.0067,1074200,5371,5371\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,364,0.0068,1086200,5431,5431\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,368,0.0068,1098200,5491,5491\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,372,0.0069,1110200,5551,5551\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,376,0.0070,1122200,5611,5611\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,380,0.0071,1134200,5671,5671\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,384,0.0072,1146200,5731,5731\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,388,0.0072,1158200,5791,5791\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,392,0.0072,1170200,5851,5851\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,396,0.0073,1182200,5911,5911\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,400,0.0074,1194200,5971,5971\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,404,0.0074,1206200,6031,6031\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,408,0.0076,1218200,6091,6091\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,412,0.0076,1230200,6151,6151\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,416,0.0077,1242200,6211,6211\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,420,0.0077,1254200,6271,6271\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,424,0.0078,1266200,6331,6331\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,428,0.0078,1278200,6391,6391\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,432,0.0080,1290200,6451,6451\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,436,0.0079,1302200,6511,6511\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,440,0.0081,1314200,6571,6571\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,444,0.0081,1326200,6631,6631\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,448,0.0082,1338200,6691,6691\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,452,0.0082,1350200,6751,6751\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,456,0.0084,1362200,6811,6811\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,460,0.0084,1374200,6871,6871\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,464,0.0084,1386200,6931,6931\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,468,0.0085,1398200,6991,6991\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,472,0.0085,1410200,7051,7051\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,476,0.0086,1422200,7111,7111\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,480,0.0087,1434200,7171,7171\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,484,0.0088,1446200,7231,7231\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,488,0.0088,1458200,7291,7291\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,492,0.0089,1470200,7351,7351\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,496,0.0089,1482200,7411,7411\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,500,0.0090,1494200,7471,7471\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,504,0.0092,1506200,7531,7531\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,508,0.0093,1518200,7591,7591\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,512,0.0092,1530200,7651,7651\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,516,0.0093,1542200,7711,7711\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,520,0.0094,1554200,7771,7771\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,524,0.0094,1566200,7831,7831\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,528,0.0094,1578200,7891,7891\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,532,0.0097,1590200,7951,7951\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,536,0.0096,1602200,8011,8011\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,540,0.0097,1614200,8071,8071\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,544,0.0097,1626200,8131,8131\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,548,0.0099,1638200,8191,8191\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,552,0.0099,1650200,8251,8251\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,556,0.0101,1662200,8311,8311\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,560,0.0100,1674200,8371,8371\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,564,0.0101,1686200,8431,8431\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,568,0.0102,1698200,8491,8491\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,572,0.0103,1710200,8551,8551\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,576,0.0103,1722200,8611,8611\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,580,0.0104,1734200,8671,8671\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,584,0.0104,1746200,8731,8731\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,588,0.0105,1758200,8791,8791\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,592,0.0107,1770200,8851,8851\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,596,0.0108,1782200,8911,8911\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,600,0.0107,1794200,8971,8971\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,604,0.0109,1806200,9031,9031\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,608,0.0109,1818200,9091,9091\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,612,0.0109,1830200,9151,9151\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,616,0.0110,1842200,9211,9211\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,620,0.0111,1854200,9271,9271\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,624,0.0112,1866200,9331,9331\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,628,0.0111,1878200,9391,9391\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,632,0.0112,1890200,9451,9451\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,636,0.0113,1902200,9511,9511\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,640,0.0116,1914200,9571,9571\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,644,0.0114,1926200,9631,9631\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,648,0.0115,1938200,9691,9691\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,652,0.0117,1950200,9751,9751\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,656,0.0117,1962200,9811,9811\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,660,0.0117,1974200,9871,9871\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,664,0.0118,1986200,9931,9931\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,668,0.0119,1998200,9991,9991\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,672,0.0120,2010200,10051,10051\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,676,0.0120,2022200,10111,10111\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,680,0.0120,2034200,10171,10171\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,684,0.0121,2046200,10231,10231\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,688,0.0122,2058200,10291,10291\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,692,0.0123,2070200,10351,10351\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,696,0.0124,2082200,10411,10411\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,700,0.0124,2094200,10471,10471\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,704,0.0125,2106200,10531,10531\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,708,0.0125,2118200,10591,10591\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,712,0.0125,2130200,10651,10651\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,716,0.0125,2142200,10711,10711\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,720,0.0126,2154200,10771,10771\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,724,0.0127,2166200,10831,10831\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,728,0.0128,2178200,10891,10891\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,732,0.0128,2190200,10951,10951\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,736,0.0130,2202200,11011,11011\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,740,0.0130,2214200,11071,11071\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,744,0.0130,2226200,11131,11131\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,748,0.0131,2238200,11191,11191\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,752,0.0133,2250200,11251,11251\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,756,0.0133,2262200,11311,11311\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,760,0.0133,2274200,11371,11371\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,764,0.0134,2286200,11431,11431\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,768,0.0135,2298200,11491,11491\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,772,0.0137,2310200,11551,11551\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,776,0.0136,2322200,11611,11611\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,780,0.0137,2334200,11671,11671\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,784,0.0137,2346200,11731,11731\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,788,0.0138,2358200,11791,11791\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,792,0.0139,2370200,11851,11851\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,796,0.0140,2382200,11911,11911\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,800,0.0140,2394200,11971,11971\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,804,0.0141,2406200,12031,12031\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,808,0.0143,2418200,12091,12091\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,812,0.0142,2430200,12151,12151\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,816,0.0143,2442200,12211,12211\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,820,0.0144,2454200,12271,12271\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,824,0.0144,2466200,12331,12331\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,828,0.0145,2478200,12391,12391\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,832,0.0146,2490200,12451,12451\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,836,0.0146,2502200,12511,12511\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,840,0.0147,2514200,12571,12571\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,844,0.0148,2526200,12631,12631\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,848,0.0149,2538200,12691,12691\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,852,0.0149,2550200,12751,12751\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,856,0.0150,2562200,12811,12811\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,860,0.0152,2574200,12871,12871\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,864,0.0151,2586200,12931,12931\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,868,0.0151,2598200,12991,12991\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,872,0.0151,2610200,13051,13051\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,876,0.0152,2622200,13111,13111\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,880,0.0155,2634200,13171,13171\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,884,0.0154,2646200,13231,13231\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,888,0.0155,2658200,13291,13291\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,892,0.0155,2670200,13351,13351\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,896,0.0156,2682200,13411,13411\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,900,0.0157,2694200,13471,13471\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,904,0.0159,2706200,13531,13531\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,908,0.0160,2718200,13591,13591\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,912,0.0161,2730200,13651,13651\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,916,0.0162,2742200,13711,13711\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,920,0.0161,2754200,13771,13771\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,924,0.0162,2766200,13831,13831\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,928,0.0163,2778200,13891,13891\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,932,0.0165,2790200,13951,13951\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,936,0.0165,2802200,14011,14011\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,940,0.0165,2814200,14071,14071\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,944,0.0166,2826200,14131,14131\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,948,0.0166,2838200,14191,14191\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,952,0.0168,2850200,14251,14251\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,956,0.0167,2862200,14311,14311\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,960,0.0168,2874200,14371,14371\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,964,0.0173,2886200,14431,14431\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,968,0.0172,2898200,14491,14491\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,972,0.0172,2910200,14551,14551\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,976,0.0173,2922200,14611,14611\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,980,0.0175,2934200,14671,14671\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,984,0.0176,2946200,14731,14731\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,988,0.0176,2958200,14791,14791\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,992,0.0177,2970200,14851,14851\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,996,0.0178,2982200,14911,14911\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,1000,0.0177,2994200,14971,14971\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,1004,0.0179,3006200,15031,15031\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,1008,0.0179,3018200,15091,15091\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,1012,0.0180,3030200,15151,15151\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,1016,0.0180,3042200,15211,15211\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,1020,0.0182,3054200,15271,15271\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n",
-      "200,32,1024,0.0178,3066200,15331,15331\n",
-      "mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vst.bin.csv .\n"
-     ]
-    }
-   ],
-   "source": [
-    "!make bench_task3"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Let's plot it again, as soon as the run finishes! Non-interactively, call `graph_task2b`.\n",
-    "\n",
-    "*We need to read in two CSV files now, which we combine to one common dataframe `df_vldvst`.*"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_vld = pd.read_csv(\"poisson2d.vld.bin.csv\", skiprows=range(2, 50000, 2))\n",
-    "df_vst = pd.read_csv(\"poisson2d.vst.bin.csv\", skiprows=range(2, 50000, 2))\n",
-    "df_vldvst = pd.concat([df_vld.set_index(\"nx\"), df_vst.set_index(\"nx\")[['PM_VECTOR_ST_CMPL (total)', 'PM_VECTOR_ST_CMPL (min)', ' PM_VECTOR_ST_CMPL (max)']]], axis=1).reset_index()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>nx</th>\n",
-       "      <th>iter</th>\n",
-       "      <th>ny</th>\n",
-       "      <th>Runtime</th>\n",
-       "      <th>PM_VECTOR_LD_CMPL (total)</th>\n",
-       "      <th>PM_VECTOR_LD_CMPL (min)</th>\n",
-       "      <th>PM_VECTOR_LD_CMPL (max)</th>\n",
-       "      <th>PM_VECTOR_ST_CMPL (total)</th>\n",
-       "      <th>PM_VECTOR_ST_CMPL (min)</th>\n",
-       "      <th>PM_VECTOR_ST_CMPL (max)</th>\n",
-       "      <th>Vector Loads / Loop Iteration</th>\n",
-       "      <th>Vector Stores / Loop Iteration</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>4</td>\n",
-       "      <td>200</td>\n",
-       "      <td>32</td>\n",
-       "      <td>0.0010</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>200</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.007812</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>8</td>\n",
-       "      <td>200</td>\n",
-       "      <td>32</td>\n",
-       "      <td>0.0011</td>\n",
-       "      <td>114000</td>\n",
-       "      <td>570</td>\n",
-       "      <td>570</td>\n",
-       "      <td>18200</td>\n",
-       "      <td>91</td>\n",
-       "      <td>91</td>\n",
-       "      <td>2.226562</td>\n",
-       "      <td>0.355469</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>12</td>\n",
-       "      <td>200</td>\n",
-       "      <td>32</td>\n",
-       "      <td>0.0012</td>\n",
-       "      <td>174000</td>\n",
-       "      <td>870</td>\n",
-       "      <td>870</td>\n",
-       "      <td>30200</td>\n",
-       "      <td>151</td>\n",
-       "      <td>151</td>\n",
-       "      <td>2.265625</td>\n",
-       "      <td>0.393229</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>16</td>\n",
-       "      <td>200</td>\n",
-       "      <td>32</td>\n",
-       "      <td>0.0013</td>\n",
-       "      <td>234000</td>\n",
-       "      <td>1170</td>\n",
-       "      <td>1170</td>\n",
-       "      <td>42200</td>\n",
-       "      <td>211</td>\n",
-       "      <td>211</td>\n",
-       "      <td>2.285156</td>\n",
-       "      <td>0.412109</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>20</td>\n",
-       "      <td>200</td>\n",
-       "      <td>32</td>\n",
-       "      <td>0.0014</td>\n",
-       "      <td>294000</td>\n",
-       "      <td>1470</td>\n",
-       "      <td>1470</td>\n",
-       "      <td>54200</td>\n",
-       "      <td>271</td>\n",
-       "      <td>271</td>\n",
-       "      <td>2.296875</td>\n",
-       "      <td>0.423438</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   nx  iter  ny  Runtime  PM_VECTOR_LD_CMPL (total)  PM_VECTOR_LD_CMPL (min)  \\\n",
-       "0   4   200  32   0.0010                          0                        0   \n",
-       "1   8   200  32   0.0011                     114000                      570   \n",
-       "2  12   200  32   0.0012                     174000                      870   \n",
-       "3  16   200  32   0.0013                     234000                     1170   \n",
-       "4  20   200  32   0.0014                     294000                     1470   \n",
-       "\n",
-       "    PM_VECTOR_LD_CMPL (max)  PM_VECTOR_ST_CMPL (total)  \\\n",
-       "0                         0                        200   \n",
-       "1                       570                      18200   \n",
-       "2                       870                      30200   \n",
-       "3                      1170                      42200   \n",
-       "4                      1470                      54200   \n",
-       "\n",
-       "   PM_VECTOR_ST_CMPL (min)   PM_VECTOR_ST_CMPL (max)  \\\n",
-       "0                        1                         1   \n",
-       "1                       91                        91   \n",
-       "2                      151                       151   \n",
-       "3                      211                       211   \n",
-       "4                      271                       271   \n",
-       "\n",
-       "   Vector Loads / Loop Iteration  Vector Stores / Loop Iteration  \n",
-       "0                       0.000000                        0.007812  \n",
-       "1                       2.226562                        0.355469  \n",
-       "2                       2.265625                        0.393229  \n",
-       "3                       2.285156                        0.412109  \n",
-       "4                       2.296875                        0.423438  "
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "common.normalize(df_vldvst, \"PM_VECTOR_LD_CMPL (min)\", \"Vector Loads / Loop Iteration\")\n",
-    "common.normalize(df_vldvst, \"PM_VECTOR_ST_CMPL (min)\", \"Vector Stores / Loop Iteration\")\n",
-    "df_vldvst.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/png": "\n",
-      "text/plain": [
-       "<Figure size 1008x432 with 2 Axes>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n",
-    "df_vldvst.set_index(\"nx\")[\"Vector Loads / Loop Iteration\"].plot(ax=ax1, legend=True);\n",
-    "df_vldvst.set_index(\"nx\")[\"Vector Stores / Loop Iteration\"].plot(ax=ax2, legend=True);"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Let's try to make sense of those numbers.\n",
-    "\n",
-    "Vector loads and vector stores use two 8 Byte values at a time. When we measured loads and stores with `LD_CMPL` and `ST_CMPL` in part A of this task, we measured the total number of stores and loads; that is: vector and scalar versions of the instructions. In order to convert the load and store instructions into **bytes** loaded and stored, we need to separate them. The difference of total instructions and vector instructions yields scalar instructions. We multiply the scalar instructions by 8 Byte (double precision) and the vector instructions by 16 Byte (two loads or stores of double precision). That yields the loaded or stored data (or, more precisely, the instruction-equivalent data).\n",
-    "\n",
-    "To formalize it, see the following equations, as an example for load ($ld$), with $b$ denoting data loaded in bytes and $n$ denoting the number of instructions.\n",
-    "\n",
-    "\\begin{align}\n",
-    "b_\\text{ld} &= b_\\text{ld}^\\text{scalar} + b_\\text{ld}^\\text{vector}\\\\\n",
-    "b_\\text{ld}^\\text{scalar} &= n_\\text{ld}^\\text{scalar} * 8\\,\\text{Byte} \\\\\n",
-    "b_\\text{ld}^\\text{vector} &= n_\\text{ld}^\\text{vector} * 16\\,\\text{Byte} \\\\\n",
-    "n_\\text{ld}^\\text{scalar} &= n_\\text{ld}^\\text{total} - n_\\text{ld}^\\text{vector}\\\\\n",
-    "\\Rightarrow b_\\text{ld} &= n_\\text{ld}^\\text{scalar}* 8 \\,\\text{Byte} + n_\\text{ld}^\\text{vector} * 16\\,\\text{Byte} \\\\\n",
-    "& = (n_\\text{ld}^\\text{scalar}+2 n_\\text{ld}^\\text{vector}) * 8\\,\\text{Byte} \\\\\n",
-    "& = (n_\\text{ld}^\\text{total} - n_\\text{ld}^\\text{vector} + 2 n_\\text{ld}^\\text{vector}) * 8\\,\\text{Byte} \\\\\n",
-    "& = (n_\\text{ld}^\\text{total} + n_\\text{ld}^\\text{vector}) * 8\\,\\text{Byte} \n",
-    "\\end{align}\n",
-    "\n",
-    "We are going to print this in the next cell. In case you look at this Notebook non-interactively, call `graph_task2b-2`."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 83,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA0oAAAF/CAYAAAB38jnaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAAIABJREFUeJzs3Xl8FdX9//H3zF2yk0AIEBCVXVwQZHFFrUtdQEGlSmldQMqvKBa0gBsG1KBfQKvUDXdsbbWoBQ0uuFAVKaIUVFpoEVDWQCCEJSHJXWZ+f9zkhgQIc+EuJL6ejwfl3lnO+czkYOedM3euYdu2LQAAAABAmJnoAgAAAADgaENQAgAAAIA6CEoAAAAAUAdBCQAAAADqICgBAAAAQB0EJQAAAACog6AEAAAAAHUQlAAAAACgDoISAAAAANRBUAIAAACAOghKAAAAAFAHQQkAAAAA6iAoAQAAAEAd7kQXEA0lJWWyLDshfWdnp6u4uDQhfaPhYbzAKcYKnGKswCnGCiLRmMaLaRpq2jQt4v0aRVCyLDthQam6f8ApxgucYqzAKcYKnGKsIBI/9fHCrXcAAAAAUEfcg9KTTz6pLl26aNWqVZKkb775RldeeaUuueQSDRs2TMXFxfEuCQAAAABqiWtQ+s9//qNvvvlGrVu3liTZtq1x48YpLy9P8+bNU69evfTII4/EsyQAAAAA2E/cgpLP59MDDzygiRMnyjAMSdLy5cuVlJSkXr16SZIGDx6sDz74IF4lAQAAAMABxS0oTZ8+XVdeeaXatm0bXlZYWBieXZKkZs2aybIs7dy5M15lAQAAAMB+4vLUu2XLlmn58uUaO3ZsTNrPzk6PSbtO5eRkJLR/NCyMFzjFWIFTjBU4xVhBJH7q4yUuQenrr7/W2rVrdeGFF0qStmzZoptvvlnXX3+9Nm/eHN5ux44dMgxDWVlZEbVfXFyasMcX5uRkaNu2PQnpGw0P4wVOMVbgFGMFTjFWEInGNF5M0zisiZW43Ho3YsQIffHFF5o/f77mz5+vVq1a6cUXX9Tw4cNVUVGhJUuWSJJef/11XXbZZfEoCQAAAAAOKqFfOGuapqZOnaqJEyeqsrJSbdq00bRp0xJZEgAAAAAkJijNnz8//Pq0005TQUFBIsoAAAAAgAOK+xfOAgAAAMDRjqAUA5/8a6PmfbU+0WUAAAAAOEwJ/YxSY/Xlii36YfMedeuQrdzstESXAwAAACBCzCjFgD9gybJtvfXZ2kSXAgAAAOAwEJRiwB+wZBqGlq7aptUbdyW6HAAAAAARIijFgD9gqUen5spM82rWp6tl24n5MlwAAAAAh4egFAP+gKX0VI8GnNNOqzfu0uKVWxNdEgAAAIAIEJRiwB+w5HGZ6ntqrtrlZujFuSv15YotiS4LAAAAgEMEpRjwBy153KZcpqnfX9dDHdpk6vl3VuiTf21MdGkAAAAAHCAoRZlt26EZJXfo1KYmu3XHtafq1I7N9ZePVunJvy/Xlh17E1wlAAAAgPoQlKIsELQkKRyUJMnrcenWq0/WVX3b6T8/7tB9LyzWnz/8nzZuK01UmQAAAADqwRfORpk/UB2UXLWWu0xTV5zdTud2b6O3v/hBn3+zWf9YuknHtkzX6V1b6sTjm6lty3SZhpGIsgEAAADsg6AUZTVB6cCTdZlpXt1wSRcN7NtOX63YqoX/3qI3Pl0jaY3SUzzq2CZTx7fK0HGtMpSbnapmTZLldjHxBwAAAMQTQSnKwkHpEOGmSapXF/Vqq4t6tVXJnkqtXLdDK34s0drNu/Xt6u2q/uYl0zCUnZmkFk1T1SIrRc2zkpWZ5lVmWpIy07xqkuZVeqqHmSgAAAAgighKUeY/wGeUDqVpRpLOOjlXZ52cK0kqrwxoQ1GptpbsVVFJubbtLFdRSbkWb96tvZWB/fY3DUOpyW6lJrmVUvV3arJbKUn7vPa65fWY8np
c8rpdSqp+7THldbvC65LcLnk8JsELAAAAP2kEpSg71K13TqQkudW5bZY6t83ab93eioB27/Vpd5lPu8qq/65UWXlAeysDKq8MaG9FQIXFe7W3wq/yyqAq/cGIazANQ26XIZfLlNtlyO0y5TJDf++73GUeZL1pyjRD7ZimUfN3+HXNOlfVMsOoem0aMg0dZL/a+xuGIcNQ1R9DhkJ/m1Xvq4/FMCQZNa8NGTX7GArvV2tZuL2avkLtVbVdq70D7VdTmxTq07Js2XZovrC6PQAAABx9CEpR5otCUKpPanJohqhVs1TH+wSClip8Qfn8QfkCVu2//ZZ8gVCYqn7t91sKWJYCQVuBoKVg9d9W7fcBy1YwaMnnt7S3IqBA0FbQskLrgrYs25Zt2bJsKWjVvA+/tg9d+0+Bsc+L6gAXXmTUbFEdyKq3q9lH++xj1HlfO5DVtGHU9G3U1FC9rbHPMh2gJmOfnarbM/ZZX6utOn1Ut7fPodUcT60TUutlrbbrrq2bOQ+yWe1+6umr9v619zEOvqr2dgdZWX+tNe+SvG75fIEDbFT7OOqtwUE99bVRd5+Dn9eDN3Y45+tg/ez/1vkvGyL5vUREv8KIqN3YFJGS7FFFhd9ZszE6uNid3whqiKjdmGxatX1sGo/G8aWmeLW33Fdn09j8nCMR2fg5es/vgTdtWOd3X6mpSdq7t1KR/ysIaZaRpHO7t27QdykRlKKsekbJG6OgdDjcLlPpKaaU4kl0KbXYdigwWZZkVYUnyw4FqZqAZcmyVStghbe1JFtVgctWVfgKvber2g+9rvq7+r1d3bekqnVWVWoLBzi75rVt23Xa27etutvUHJcUOq5QL6H/SU31qmyvLzyrVB0Wq7faNzxW1169gR3+u86+4U3smu322bb2+5od7PDLmvbq9lG9j71PI/vsEj4XtWuu3ce+NdU9btXdXwdXax/VelN7u4PsVLftgwV1+2Ar9qthn/d19rH3+9+D1Xrg47AlBYK2/IHgAeqsfb4P0nSd9pydL7ueYg+2qvb+zs/dwVY6PV+x+kVLZO063ziSZiM9NtM0wv+tqb/d2NQbycaRtXt0nN9IRHKOI2s3gm3rWWcaklXfP7I41HAkG9sxGmyxGz+xOb8RVXAk7RpHNqaz0pN09im5Mt0EJVSpDkruoygoHa0Mw5DLMPRTeqhfTk6Gtm3bk+gy0AAwVuAUYwVOMVYQCcYLXzgbdU6fegcAAADg6MXVfJT5g6EHJ8TqM0oAAAAAYo+r+SiLxlPvAAAAACQWV/NRVvMwB1eCKwEAAABwuAhKUcaMEgAAANDwcTUfZQQlAAAAoOHjaj7K/AFLhiSX2XCfGQ8AAAD81BGUoswftORxm/t9oz0AAACAhoOgFGX+gMVtdwAAAEADxxV9lPkDQYISAAAA0MBxRR9lzCgBAAAADR9X9FEWCkp8hxIAAADQkBGUoswfsORxcVoBAACAhowr+iirfuodAAAAgIaLK/oo8/EZJQAAAKDB44o+yniYAwAAANDwcUUfZQGCEgAAANDguePV0S233KKNGzfKNE2lpqbqvvvuU9euXXXBBRfI6/UqKSlJkjR27Fj17ds3XmVFHTNKAAAAQMMXt6A0ZcoUZWRkSJI+/vhj3XPPPZo9e7Yk6Y9//KM6d+4cr1Jiyh/kqXcAAABAQxe3K/rqkCRJpaWlMgwjXl3HFTNKAAAAQMMXtxklSbr33nu1cOFC2batF154Ibx87Nixsm1bPXv21B133KEmTZrEs6yo8gWC8vKFswAAAECDZti2bce70zlz5ujdd9/V888/r8LCQuXm5srn82ny5MkqKyvTI488Eu+SosK2bQ0c945+cWFn/fqyrokuBwAAAMBhiuuMUrWBAwcqLy9PJSUlys3NlSR5vV4NGTJEI0eOjLi94uJSWVbc854kKScnQ9u27ZEkBYKWLFvy+QLhZcC+9h0vQH0YK3CKsQKnGCuIRGMaL6ZpKDs7PfL9YlDLfsr
KylRYWBh+P3/+fGVmZiopKUl79oR+ALZt67333lPXrg13JsYfsCSJhzkAAAAADVxcZpTKy8s1evRolZeXyzRNZWZmasaMGSouLtZtt92mYDAoy7LUoUMHTZw4MR4lxYQ/WBWUeJgDAAAA0KDFJSg1b95cs2bNOuC6OXPmxKOEuAhUzSh5CUoAAABAg8YVfRT5AswoAQAAAI0BV/RR5CcoAQAAAI0CV/RRRFACAAAAGgeu6KPIHwhK4ql3AAAAQEPHFX0U1Tz1zpXgSgAAAAAcCYJSFHHrHQAAANA4cEUfRQQlAAAAoHHgij6KCEoAAABA48AVfRQRlAAAAIDGgSv6KCIoAQAAAI0DV/RRFH7qHY8HBwAAABo0ruijiBklAAAAoHHgij6KfIGg3C5ThmEkuhQAAAAAR4CgFEX+gMVsEgAAANAIcFUfRQGCEgAAANAocFUfRf6AxYMcAAAAgEaAq/oo8gcteT2cUgAAAKCh46o+inx+ZpQAAACAxoCr+ijyB/mMEgAAANAYcFUfRTz1DgAAAGgcuKqPIn/AkpugBAAAADR4XNVHEU+9AwAAABoHruqjKPTUO1eiywAAAABwhAhKUeQPBJlRAgAAABoBruqjiIc5AAAAAI0DV/VRRFACAAAAGgeu6qOIoAQAAAA0DlzVR4ll2QpaNkEJAAAAaAS4qo8Sf9CSJIISAAAA0AhwVR8l/kBVUOKpdwAAAECDx1V9lISDEjNKAAAAQIPHVX2U+ANBSQQlAAAAoDHgqj5KamaUXAmuBAAAAMCRIihFCQ9zAAAAABoPruqjhM8oAQAAAI0HV/VR4uOpdwAAAECjwVV9lDCjBAAAADQe7nh1dMstt2jjxo0yTVOpqam677771LVrV/3www+66667tHPnTmVlZWnKlCk6/vjj41VW1AQISgAAAECjEbegNGXKFGVkZEiSPv74Y91zzz2aPXu2Jk6cqCFDhmjAgAF6++23lZeXpz/96U/xKitqqmeUvAQlAAAAoMGL21V9dUiSpNLSUhmGoeLiYq1YsUL9+/eXJPXv318rVqzQjh074lVW1NQ89Y7HgwMAAAANXdxmlCTp3nvv1cKFC2Xbtl544QUVFhaqZcuWcrlC4cLlcqlFixYqLCxUs2bN4lnaEeMzSgAAAEDjEdegNHnyZEnSnDlzNHXqVI0ePToq7WZnp0elncOVk5Mhb1LoVOa2bKLkpLieVjQwOTkZh94IEGMFzjFW4BRjBZH4qY8Xx1f0a9eu1X//+1/t3bu31vJBgwZF3OnAgQOVl5enVq1aaevWrQoGg3K5XAoGgyoqKlJubm5E7RUXl8qy7IjriIacnAxt27ZHJbvKJUk7d5bJZTKrhAOrHi/AoTBW4BRjBU4xVhCJxjReTNM4rIkVR0FpxowZeuqpp3TCCScoOTk5vNwwDEdBqaysTLt37w4HoPnz5yszM1PZ2dnq2rWr5s6dqwEDBmju3Lnq2rVrg7vtTgrdeucyDUISAAAA0Ag4CkqvvPKK3njjDZ1wwgmH1Ul5eblGjx6t8vJymaapzMxMzZgxQ4ZhaNKkSbrrrrv09NNPq0mTJpoyZcph9ZFobVuk66R2DS/gAQAAANifo6CUnJys9u3bH3YnzZs316xZsw64rkOHDnrjjTcOu+2jRZ+uLdWna8tElwEAAAAgChzdJzZ69Gjl5+erqKhIlmXV+gMAAAAAjY2jGaW77rpLkmrN/Ni2LcMwtHLlythUBgAAAAAJ4igoffLJJ7GuAwAAAACOGo6CUps2bSRJlmVp+/btat68uUye7gYAAACgkXKUdkpLSzV+/Hh169ZN5557rrp166Y777xTe/Y0jmerAwAAAMC+HAWl/Px8lZeXq6CgQN99950KCgpUXl6u/Pz8WNcHAAAAAHHn6Na7BQsW6OOPP1ZKSookqV27dnr44Yd18cUXx7Q4AAAAAEgERzNKSUlJ2rFjR61lJSUl8nq9MSkKAAAAABLJ0YzSoEGDNGzYMN1
0001q3bq1Nm/erJkzZ+raa6+NdX0AAAAAEHeOgtLIkSPVokULzZ07V0VFRWrRooWGDx+uQYMGxbo+AAAAAIg7R0HJMAwNGjSIYAQAAADgJ+GgQWnOnDkaOHCgJOnNN988aAOEJwAAAACNzUGD0rvvvhsOSm+//fYBt6meaQIAAACAxuSgQen5558Pv/7zn/8cl2IAAAAA4Gjg6PHg1TNLdV199dVRLQYAAAAAjgaOgtK6dev2W2bbtjZu3Bj1ggAAAAAg0ep96t348eMlSX6/P/y62qZNm9SxY8fYVQYAAAAACVJvUDr22GMP+FqSTjvtNF166aWxqQoAAAAAEqjeoDRq1ChJ0qmnnqq+ffvGpSAAAAAASDRHXzjbt29f+Xw+/fDDDyopKZFt2+F1Z555ZsyKAwAAAIBEcBSUlixZojFjxsjn86m0tFTp6ekqKytTq1at9Mknn8S6RgAAAACIK0dPvXv44Yc1fPhwffXVV0pLS9NXX32lkSNHasiQIbGuDwAAAADizlFQ+vHHH3XDDTfUWjZixAjNnDkzFjUBAAAAQEI5CkoZGRkqLS2VJOXk5Gj16tXavXu39u7dG9PiAAAAACARHH1G6eKLL9Znn32mK664QoMGDdINN9wgt9vN48EBAAAANEqOgtK9994bfj1s2DB169ZNZWVlPDIcAAAAQKN0yFvvgsGgLrroIvl8vvCyXr166bzzzpNpOrpzDwAAAAAalEMmHZfLJZfLpcrKynjUAwAAAAAJ5+jWuxtuuEFjxozR//t//0+tWrWSYRjhdW3bto1ZcQAAAACQCI6C0oMPPihJWrhwYa3lhmFo5cqV0a8KAAAAABLIUVD673//G+s6AAAAAOCoEdHTGAoLC/XNN9/EqhYAAAAAOCo4CkqbN2/W4MGDddlll2no0KGSpA8++KDWY8MBAAAAoLFwFJTy8vJ0/vnna+nSpXK7Q3frnX322frnP/8Z0+IAAAAAIBEcBaXly5drxIgRMk0z/MS7jIwM7dmzJ6bFAQAAAEAiOApK2dnZWrduXa1lq1evVm5ubkyKAgAAAIBEcvTUu2HDhum3v/2tRowYoUAgoLlz5+rZZ5/Vb37zG0edlJSUaPz48Vq/fr28Xq+OO+44PfDAA2rWrJm6dOmizp07yzRDmW3q1Knq0qXL4R8RAAAAABwhR0Fp0KBBysrK0t/+9jfl5uZqzpw5Gj16tC666CJHnRiGoeHDh+v000+XJE2ZMkWPPPKIHnroIUnS66+/rrS0tMM8BAAAAACILkdB6dtvv9VFF120XzD67rvv1K1bt0Pun5WVFQ5JktS9e3e99tprEZYKAAAAAPHh6DNK1Y8Er2v48OERd2hZll577TVdcMEF4WXXX3+9BgwYoEcffVQ+ny/iNgEAAAAgmuqdUbIsS7Zt1/pTbf369XK5XBF3+OCDDyo1NVW//vWvJUmffvqpcnNzVVpaqnHjxumpp57S7bffHlGb2dnpEdcRTTk5GQntHw0L4wVOMVbgFGMFTjFWEImf+nipNyideOKJ4ceBn3jiibXWmaap3/72txF1NmXKFK1bt04zZswIP7yh+sl56enp+sUvfqGXX345ojYlqbi4VJZlH3rDGMjJydC2bTwmHc4wXuAUYwVOMVbgFGMFkWhM48U0jcOaWKk3KH3yySeybVvXX3+9Xn311fBywzDUrFkzJScnO+7oscce07///W8999xz8nq9kqRdu3YpKSlJycnJCgQCmjdvnrp27RrxQQAAACBxbNtWaekulZeXyrKCiS4HUVBUZMqyrESXERG326umTXPkcjl6DMOh26tvZZs2bSRJ//jHP46ok++//14zZszQ8ccfr8GDB0uSjjnmGA0fPlx5eXkyDEOBQEA9evTQ6NGjj6gvAAAAxFdJybaqX6S3lMvlDt+RhIbL7TYVCDScoGTbtsrKdqukZJuaN4/Od73WG5SmT59+yAacBJtOnTrpf//73wHXFRQUHHJ/AAAAHL1
8vgq1bHmMDMPRc8KAqDMMQ2lpTVRaujNqbdYblLZs2RK1jgAAANBY2YQkJFy0ZzLrDUoPP/xwVDsDAAAAgIaA6A8AAIBGZ9CgK7R27eqYtP3eewWaMGF8xPvNnPmC3npr1n7LR40aoYULF0SjtIhMnjxJb731N0nS0qVL9NVXX8akn1mz/qqSkh3h93PmvKm//e0vMekrmghKAAAAQBx88cXnOueccxNdxgEtW/avww5KwWD9TzqcNeu1WkFp4MBBuu66Xx1WX/EUnWfnAQAAAA3AypX/0eOPP6KKinIlJ6dozJix6tr1JAUCAY0fP0a7du1SZWWlTjzxJI0bd488Ho/8fr8ee2yqli37l3JyWujYY48Pt7d8+bd67LGpsixbgUBAN944TBdffOl+/RYVbZVt22rZspXjWjdu3KBp0x7Szp0lcrlcGjHiVp1xxlmSpC+//KeeffZJWZalrKymGjfuHh1zTFstXbpE06c/qi5dTtDq1avkcrl0zz2T1K5d+4P2s2bNar399t9lWZaWLPlKF174cw0dOkyLFn2hP/3pJVVW+uTxeHTbbXfo5JNP0dKlS/THP/5Bp57aXStXrtCNN96ssrIyvfHGawoE/JKkW28do169+uiVV17U9u3bNGHCnfJ6kzRxYr7mz/9I5eXlGjVqjILBoJ555gktXvxPSdLpp5+lkSNvk8vl0uTJk+T1erVhw3oVFW3VSSedogkT7o/bUxUJSgAAAIiqhcsL9cV3hTFp+5xuuTr7lMN7/LPf79e9947X3XfnqXfv07VkyVe6997x+tvf5sjtdmvixHxlZmbJtm3l50/Uu+++rYEDB+ntt99SYeFm/fnPsxQIBHTrrb9Rbm6ohr/85RVde+0QXXppv6rvkyo9YN8LFnwW8WzS/fdP0IABV6l//4H64Ye1GjXqN3r11Tcl2crPz9MTTzyndu3aa+7cObr//gl6/vlXJElr1nyvMWPGqkePnnr//bnKz5+oF1/880H76dChowYMuDocXqRQSJs580X94Q9PKC0tXWvXrtHYsb/T3//+riRp7drVGjv2Lt1+e+gWxF27duriiy+RYRhav/5HjR59i2bPfk833nizCgrmKD9/itq377hf3++8M1vff79KL70UuhVv7Njf6Z13ZuuqqwZV9bNGjz/+tEzT1NChv9KSJYvVu/cZEZ3Hw+U4KC1atEjvvvuuioqK1KJFC/Xr109nnnlmLGsDAAAAomb9+nXyeDzq3ft0SVKvXn3k8Xi0fv06HX98O7322qv68st/yrKC2rNnj5KTkyVJS5f+S5dd1l9ut1tut1uXXHKZvvvuG0nSaaf10quvztSWLYXq3fsMnXTSyQfs+4svPtMttzj/vtC9e8u0evUqXX75lZKkdu3aq2PHLvrPf5ZLkjp06ByeJbr88iv16KNTtHdvmSTpmGPaqkePnpKkSy65XFOnTlZZWanS0tId9//ll4u0adNG3XrriPCyYDCoHTuKw32cfHK38LpNmzZq0qR7tW3bNrndbu3YUazi4u3Kzm5ebz9LlizW5Zf3l8fjqTqWK/T55/8IB6W+fc9XUlKSJKlLly7atGmjevd2fBhHxFFQevnll/Xcc8/p6quvVteuXVVYWKjf//73Gj58uIYNGxbrGgEAANCAnH3K4c/6xJJt2we8bcswpI8++kDfffeNnn76eaWmpulPf3pJGzasD+93MNdeO0Rnn32uvv56sR5/fKp69z5DI0bcUmub0tJSFRYWqlOnzhHVeiCGYci2LcX+7jNbp59+pu6774H91vz44w9KSUmttWzSpHs1atTtOvfc82VZli666Bz5fL5D92Lv/1jvfd8nJXnDr03TdcjPQ0WTo4c5vPTSS3rllVc0btw4/epXv9LYsWP1yiuv6KWXXop1fQAAAEBUHHfc8fL5fFq6dImk0JPeAoGA2rY9TqWle5SZmaXU1DSVlpbqo48+CO/Xq1dvffDBewoEAqqsrKi1bv36dWr
T5hgNHHiNfvGLX2rlyv/s1++iRV+EP1vkVFpaujp27Kz3358rSVq37ketWbNKJ554sk46qZtWr16ldet+lCS9//5cderURampaZJCt819++0ySaEA2L59x0POJqWlpamsrOa2wT59ztTixYu0du2a8LIDHVu10tJS5ea2liTNnft2rZCUlpZ20FsSe/c+Xe+9V6BAIKBAIKD335+rXr361FtrvDi+9e64446r9b5t27Zx+yAVAAAAEKkxY26Vy+UKv3/lldc1efLUWg9zyM+fIo/Ho0sv7a8FCz7Xr399rXJycnTqqT1UWVkpSbryyqu1evVqXX/9tWrRoqW6d++pwsJNkqQ333xdS5f+Sx6PWx6PV7ffPm6/OhYs+EwDBlxdb60PPTRJXm9S+P20adM1cWK+pk17SLNm/VUul0sTJjygpk2bSpImTHhA999/r4LBoLKymiov78Hwvp06ddZHH83T9OmPyuUyNWHC/Yc8V+ee+zPde+843XTTkPDDHPLyHtT//d+DqqysVCDg1ymnnKquXU864P6/+90duueesWrePEfdu5+mzMzM8LpBgwbroYceUHJysiZOzK+135VXXqWNGzdo6NAhkkIB7YorrjpkvfFg2PXNJVaZNWuWFi9erNtuu02tWrVSYWGhnn76afXp00fXXHNNeDvTTMzTxouLS2VZhzyMmMjJydC2bXsS0jcaHsYLnGKswCnGCpyK5VjZsmWdWrU67tAb/gT5/X798pdX6/XXZ8vtjv1z1JYuXaKnnppe78MbnHC7TQUCVpSqip8DjUXTNJSd7fzzWdUc/bTy8vIkSe+++27VfZGhUFJQUKC8vLzw/Z4rV66MuAAAAACgsfJ4PHrzzYJEl4HD4CgoffLJJ7GuAwAAAMAROu20Xkc8m4QQR0GpTZs2kiTLsrR9+3Y1b948YbfZAQAAAECsOUo7paWlGj9+vLp166Zzzz1X3bp105133qk9e7gnGgAAAEDj4ygo5efnq7y8XAUFBfruu+9UUFCg8vJy5efnH3pnAAAAAGhgHN16t2DBAn388cdKSUmRJLVr104PP/ywLr744pgWBwAAAACJ4GhGKSkpSTt27Ki1rKSkRF6v9yB7AAAAAIkzf/7HGjp0iG66aYhZO0TlAAAgAElEQVSGDLlGkybdG1734ovPyu/3J7A6aebMF/TWW7P2Wz5q1AgtXLgg7vVMnjxJb731N0mhR4wvXrwoJv3MmvVXlZTU5Io5c97U3/72l5j0daQczSgNGjRIw4YN00033aTWrVtr8+bNmjlzpq699tpY1wcAAABEZPv27frDH/5PL774qlq2bCXbtrV69arw+pdffl6//OX18ng8EbUbCASi9l1IX3zxuSZPnhqVtqJt2bJ/qbKyQj17nh7xvsFgsNaX/NY1a9Zr6tWrj5o2bSZJGjhw0GHXGWuOftIjR45UixYtNHfuXBUVFalFixYaPny4Bg06eg8MAAAAP007dmyXy+VWZmaWJMkwDHXq1EWS9OijUyRJI0cOk2GYeuKJZ+X3+zRt2sPavHmjbNvWL395vS67rL8kadCgK9S//wD9619fq3XrNrr77jy9//5c/f3vbygYDCo9PV1jx96lY489XsuXf6vHHpsqy7IVCAR0443DdPHFl+5XX1HRVtm2rZYtWzk+po0bN2jatIe0c2eJXC6XRoy4VWeccZYk6csv/6lnn31SlmUpK6upxo27R8cc01ZLly7R9OmPqkuXE7R69Sq5XC7dc88ktWvX/qD9rFmzWm+//XfZtqWvvlqsCy/8ua6//iYtWvSF/vSnl1RZ6ZPH49Ftt92hk08+RUuXLtEf//gHnXpqd61cuUI33nizysrK9MYbrykQCM3a3XrrGPXq1UevvPKitm/fpgkT7pTXm6SJE/M1f/5HKi8v16hRYxQMBvXMM09o8eJ/SpJOP/0sjRx5m1wulyZPniSv16sNG9arqGirTjrpFE2YcL8Mw3B8DiPlKCgZhqF
BgwYRjAAAAHBI/lUL5f/f5zFp29PlXHk6n13vNh07dtaJJ56ka67ppx49eqpbt+665JLLlZmZpd///k7Nnv2GnnnmJaWmpkqS8vLuVvv2HfTww49o+/btuvnmX6lLlxPUvn1HSaEZqieeeFaS9O23yzR//kd66qnn5fV6tWjRQj388AN65pmX9Je/vKJrrx2iSy/tJ9u2VVpaesD6Fiz4TOecc25Ex33//RM0YMBV6t9/oH74Ya1GjfqNXn31TUm28vPz9MQTz6ldu/aaO3eO7r9/gp5//hVJ0po132vMmLHq0aOn3n9/rvLzJ9b7PUsdOnTUgAFXq7KyQrfcMlqStGnTRs2c+aL+8IcnlJaWrrVr12js2N/p739/V5K0du1qjR17l26/fbwkadeunbr44ktkGIbWr/9Ro0ffotmz39ONN96sgoI5ys+fEj63+3rnndn6/vtVeuml0K14Y8f+Tu+8M1tXXTWoqp81evzxp2WapoYO/ZWWLFms3r3PiOg8RsLx3OFbb72lt99+W1u3blXLli01YMAAXXPNNTErDAAAADgcpmnq4Ycf1dq1q7Vs2VItWPCp/vrXP+tPf3pdTZpk7rf9kiVfadSoMZKk5s2b68wzz9HSpUvCF/OXXtovvO3ChZ9r9ervNWLETZIk27a1Z89uSaEve3311ZnasqVQvXufoZNOOvmA9X3xxWfhEOLE3r1lWr16lS6//EpJUrt27dWxYxf95z/LJUkdOnQOzxJdfvmVevTRKdq7t0ySdMwxbdWjR09J0iWXXK6pUyerrKxUaWnpjvtfvHiRNm3aqFtvHRFeFgwGtWNHcbiPk0/uFl63adNGTZp0r7Zt2ya3260dO4pVXLxd2dnN6+1nyZLFuvzy/uFbIi+//Ap9/vk/wkGpb9/zlZSUJEnq0qWLNm3aqN69HR9GxBwFpWeeeUZz5szRsGHDwp9ReuGFF1RUVKSRI0fGrjoAAAA0OJ7OZx9y1ice2rfvqPbtO+qaa67Vr3/9Cy1b9i+dd94FB9y27i1c+75PTU0Jv7ZtqV+/KzV8+G/3a+Paa4fo7LPP1ddfL9bjj09V795naMSIW2ptU1paqsLCQnXq1Nnxcdi2fdCabdtSDO8+C/d/+uln6r77Hthv3Y8//qCUlNRayyZNulejRt2uc889X5Zl6aKLzpHP53PQT/0/h6SkmgfJmaZLwWAw0kOJiKOn3r3xxht66aWXdN1116lv37667rrr9MILL2jWrP2f1AEAAAAk0rZtRfr3v78Lvy8q2qqdO0uUm9takpSamqaysprb4nr16qN33pktSSou3q5FixaqR49eB2z77LP76oMP3lVR0VZJoZmV//53pSRp/fp1atPmGA0ceI1+8YtfauXK/+y3/6JFX4Q/W+RUWlq6OnbsrPffnytJWrfuR61Zs0onnniyTjqpm1avXqV1636UJL3//lx16tRFqalpkkKfbfr222WSpI8++kDt23c85GxSWlpardsG+/Q5Q4sXL9LatWvCyw50bNVKS0vD53ru3LdrhaS6be+rd+/T9d57BQoEAgoEAnr//bnq1atPvbXGkqMZpfLycjVr1qzWsqysLFVUVMSkKAAAAOBwBYNBvfjis9qypVBJScmybUvDh49U584nSJIGD/6Vfve73yopKVlPPPGsxowZq2nTHtKNNw6Wbdv67W9HqX37Dgdsu3v30zRixC266647FAxaCgT8+tnPLtIJJ3TVm2++rqVL/yWPxy2Px6vbbx+33/4LFnymAQOurrf+hx6aJK83Kfx+2rTpmjgxX9OmPaRZs/4ql8ulCRMeUNOmTSVJEyY8oPvvv1fBYFBZWU2Vl/dgeN9OnTrro4/mafr0R+VymZow4f5Dnr9zz/2ZJkwYr5tuGhJ+mENe3oP6v/97UJWVlQoE/DrllFPVtetJB9z/d7+7Q/fcM1bNm+eoe/fTlJlZc7vjoEGD9dBDDyg5OVkTJ+bX2u/KK6/Sxo0bNHToEEl
Snz5n6oorrjpkvbFi2Aeby9vH+PHjVVZWpt///vdq3bq1Nm3apMcff1zJycmaNm1aPOqsV3FxqSzrkIcREzk5Gdq2bU9C+kbDw3iBU4wVOMVYgVOxHCtbtqxTq1bHxaTtxsTv9+uXv7xar78+O2qPGa/P0qVL9NRT0+t9eMPBuN2mAgErBlXF1oHGomkays52/pms8H5ONsrLy1NaWpoGDBigHj16aODAgUpJSdF9990XcYcAAADAT5HH49GbbxbEJSThyDmaUapmWZZKSkrUtGlTmaapDRs2qG3btrGszxFmlNBQMF7gFGMFTjFW4BQzSogEM0oOZ5RqOjGVnZ0t0zTl8/n085//POIOAQAAAOBoF1FQqiuCySgAAAA0WqHHVAOJFO1sckRBqe5zzgEAAPDT4/Uma+fO7QoE/PwiHQlh27bKynbL7fYeemOH+CQZAAAAjkjTpjkqLd2lHTu2yrJi+yWgiA/TNGVZDWuW0O32qmnTnOi1V9/K884776CzRvy2AAAAAFLoLqOMjCxlZGQluhRECQ+KOURQOhq+IwkAAAAA4q3eoNSnT5941QEAAAAAR40jepgDAAAAADRGBCUAAAAAqCMuT70rKSnR+PHjtX79enm9Xh133HF64IEH1KxZM33zzTfKy8tTZWWl2rRpo2nTpik7OzseZQEAAADAAdU7ozR27FgVFBRo586dR9SJYRgaPny45s2bp4KCArVt21aPPPKIbNvWuHHjlJeXp3nz5qlXr1565JFHjqgvAAAAADhS9Qal888/X59//rn69eunwYMH65lnntGKFSsi7iQrK0unn356+H337t21efNmLV++XElJSerVq5ckafDgwfrggw8ibh8AAAAAoqneW+/69++v/v37y7Ztfffdd/r00081YcIEbd++XX379tV5552ns846S+np6Y47tCxLr732mi644AIVFhaqdevW4XXNmjWTZVnauXOnsrJ4Dj8AAACAxDDsw/jm2O3bt+uzzz7TZ599pq+//lqjR4/W4MGDHe17//33a+vWrXryySf10Ucf6a233tJzzz0XXn/qqafqs88+IygBAAAASJjDephD8+bNdc011+iaa65RMBjUrl27HO03ZcoUrVu3TjNmzJBpmsrNzdXmzZvD63fs2CHDMCIOScXFpbKsiPNeVPCtxYgE4wVOMVbgFGMFTjFWEInGNF5M01B2tvM74ML7HWnHLpdLzZo1O+R2jz32mP7973/rqaeektfrlSSdfPLJqqio0JIlSyRJr7/+ui677LIjLQkAAAAAjkhcHg/+/fffa8aMGTr++OPDt+gdc8wxeuqppzR16lRNnDix1uPBAQAAACCR4hKUOnXqpP/9738HXHfaaaepoKAgHmUAAAAAgCOHdevdhg0btGnTpmjXAgAAAABHBUdB6Y477tDSpUslSW+99Zb69eunfv366Y033ohpcQAAAACQCI6C0qJFi3TyySdLkmbOnKmXX35Zb7zxhp5//vmYFgcAAAAAieDoM0p+v19er1dbt27Vzp071bNnT0mh71MCAAAAgMbGUVDq2rWrnn32WW3atEnnn3++JGnr1q1KT4/8eeQAAAAAcLRzdOvd5MmTtWrVKlVWVmrMmDGSpGXLlumKK66IaXEAAAAAkAiGbdt2oos4UsXFpbKsxBxGY/rWYsQe4wVOMVbgFGMFTjFWEInGNF5M01B2duR3wjmaUbJtW7NmzdKNN94YnkX6+uuv9d5770XcIQAAAAAc7RwFpenTp+vNN9/Utddeq8LCQklSq1at9MILL8S0OAAAAABIBEdBafbs2ZoxY4b69esnwzAkScccc4w2bNgQ0+IAAAAAIBEcBaVgMKi0tDRJCgelsrIypaamxq4yAAAAAEgQR0HpvPPO08MPPyyfzycp9Jml6dOn62c/+1lMiwMAAACARHAUlO6++24VFRWpZ8+e2rNnj3r06KHNmzdr7Nixsa4PAAAAAOLO0RfOpqen6+mnn1Z
xcbE2bdqk3Nxc5eTkxLo2AAAAAEgIRzNKAwcOlCRlZ2erW7du4ZB09dVXx64yAAAAAEgQR0Fp3bp1+y2zbVsbN26MekEAAAAAkGj13no3fvx4SZLf7w+/rrZp0yZ17NgxdpUBAAAAQILUG5SOPfbYA76WpNNOO02XXnppbKoCAAAAgASqNyiNGjVKknTqqaeqb9++cSkIAAAAABLN0WeUHn30Uc2cOVPFxcWxrgcAAAAAEs5RULrlllu0ZMkSXXjhhRo+fLgKCgpUUVER69oAAAAAICEcBaWf//znevLJJ/Xpp5/qwgsv1F//+ledc845uvvuu7Vo0aJY1wgAAAAAceXoC2erZWVlaeDAgUpNTdULL7ygDz/8UEuWLJFpmpo4caLOOuusWNUJAAAAAHHjKChZlqWFCxfq7bff1qeffqru3btrxIgRuvjii5WcnKx58+Zp3LhxWrhwYazrBQAAAICYcxSU+vbtq6ZNm2rAgAEaN26cWrZsWWv9JZdcoldffTUmBQIAAABAvDkKSjNmzNApp5xS7zZ//vOfo1IQAAAAACSao4c57BuSiouL9eGHH2rNmjUxKwoAAAAAEqneGaWtW7fqwQcf1OrVq9WjRw8NGzZMv/71r2Wapvbs2aMpU6aoX79+8aoVAAAAAOKi3hmliRMnqkmTJrr77rtl27Zuvvlm5efna9GiRXr88cc1Y8aMeNUJAAAAAHFT74zSsmXLtGDBAnm9XvXp00e9evXSRRddJEm66KKLdOedd8alSAAAAACIp3pnlPx+v7xeryQpJSVFaWlpMgwjvN627dhWBwAAAAAJUO+MUjAY1JdffhkORIFAoNZ7y7JiXyEAAAAAxFm9QSk7O1v33HNP+H1WVlat982aNYtdZQAAAACQIPUGpfnz58erDgAAAAA4ajj6HiUAAAAA+CkhKAEAAABAHQQlAAAAAKij3s8oRdOUKVM0b948bdq0SQUFBercubMk6YILLpDX61VSUpIkaezYserbt2+8ygIAAACA/cQtKF144YW64YYb9Ktf/Wq/dX/84x/DwQkAAAAAEi1uQalXr17x6goAAAAAjkjcglJ9xo4dK9u21bNnT91xxx1q0qRJoksCAAAA8BNm2LZtx7PDCy64QDNmzAjfaldYWKjc3Fz5fD5NnjxZZWVleuSRR+JZEgAAAADUkvAZpdzcXEmS1+vVkCFDNHLkyIjbKC4ulWXFNe+F5eRkaNu2PQnpGw0P4wVOMVbgFGMFTjFWEInGNF5M01B2dnrk+8WgFsf27t2rPXtCPwDbtvXee++pa9euiSwJAAAAAOI3o5Sfn68PP/xQ27dv19ChQ5WVlaUZM2botttuUzAYlGVZ6tChgyZOnBivkgAAAADggOL+GaVY4NY7NBSMFzjFWIFTjBU4xVhBJBrTeGmQt94BAAAAwNGIoAQAAAAAdRCUAAAAAKAOghIAAAAA1EFQAgAAAIA6CEoAAAAAUAdBCQAAAADqICgBAAAAQB0EJQAAAACog6AEAAAAAHUQlAAAAACgDoISAAAAANRBUAIAAACAOghKAAAAAFAHQQkAAAAA6iAoAQAAAEAdBCUAAAAAqIOgBAAAAAB1EJQAAAAAoA6CEgAAAADUQVACAAAAgDoISgAAAABQB0EJAAAAAOogKAEAAABAHQQlAAAAAKiDoAQAAAAAdRCUAAAAAKAOghIAAAAA1EFQAgAAAIA6CEoAAAAAUAdBCQAAAADqICgBAAAAQB0EJQAAAACog6AEAAAAAHUQlAAAAACgDoISAAAAANQRl6A0ZcoUXXDBBerSpYtWrVoVXv7DDz/ouuuu0yWXXKLrrrtOP/74YzzKAQAAAIB6xSUoXXjhhfrLX/6iNm3a1Fo+ceJEDRkyRPPmzdOQIUOUl5cXj3IAAAAAoF5xCUq9evVSbm5urWXFxcVasWKF+vfvL0nq37+/VqxYoR07dsSjJAAAAAA4qIR9RqmwsFAtW7aUy+WSJLlcLrVo0UK
FhYWJKgkAAAAAJEnuRBcQDdnZ6QntPycnI6H9o2FhvMApxgqcYqzAKcYKIvFTHy8JC0q5ubnaunWrgsGgXC6XgsGgioqK9rtFz4ni4lJZlh2DKg8tJydD27btSUjfaHgYL3CKsQKnGCtwirGCSDSm8WKaxmFNrCTs1rvs7Gx17dpVc+fOlSTNnTtXXbt2VbNmzRJVEgAAAABIitOMUn5+vj788ENt375dQ4cOVVZWlt59911NmjRJd911l55++mk1adJEU6ZMiUc5AAAAAFAvw7btxNyzFkXceoeGgvECpxgrcIqxAqcYK4hEYxovDe7WOwAAAAA4WhGUAAAAAKAOghIAAAAA1EFQAgAAAIA6CEoAAAAAUAdBCQAAAADqICgBAAAAQB0EJQAAAACow53oAoBDsW1bhmHst0yyZRi1s75tBSXDrLW9bVuSFZTh8tTeNuCTTJcM07XP/pYUqJQ8SeG2bduWAr7QBm5vuG3bCkr+CsmdJMPlrtnWXyHZluRJkWFWtREMyPaXy/K5w8dj25bkr5AdDMjwJMtwe8N12f6KUP/eZBmmO1SXv1x2wCfDkyx5kiQZUtAv27dXkiHDmyy5vKG+/RWy/ZWhNj3JkumSApWy/RWSbcvwJEnu5JptA5WS6Q617fKEtg1Uhs6bO0lyeyXDkPxVbZiuUNturxQMhJYFA6Hz40mSDDO0v78iVKcnKbS9FZTtr5SCfsnlCS0z3aFtA5WSXXWO3aHjsAO+0Ll3uUPLTbfsoD+0zArWbFv98wz4JMOU3B4ZLk/oZ1R1HHJ5Qn0aZlUblaHa3N6abYM+KRiUXK7QtqY71G7QV1VbqA3ZthT0yQ76ZZju0DLTFToHQZ9s2wqNt+oxF/SHtjXMqm3doZqCPtnVY9PlCZ3jqm19ZrqsPVXHblmh5Vagqj93VX+hbWUrNAZdntDPNBgIjSvTrNrWHVpm+UM//+qaZYe3lWGE2qiqzbb8kmXV9GcY+2yrUM2mq6Y/Kxj6t2S6JdOs2Vb2/rVZAclwVfXnqllm79OfrVANwap/0y7XPrUFQufE5Qptbxihn4llhf5dhf80+O9Td6QymKpgyd5El+FAI/p5NNBDqQzUHSsN9EAO5Cfy7z2eKnypCu48/P+2GClNZGY0j2JF8UdQOgrZti2raI2MtGYy05uFllmWgptXyirZKFfLjjKbHy/JVrBwlQIblstwe+Vq2UFm9rGySjYrWPhfWSWbZWa2ktn8OBneFAW3/SCraK1s25KraRuZTVvLLt+tYPF6WTu3yEjNkpnVSmZKE1k7tyhYsknyV8pskiOjSQsp6Je1a4us3UUyvCkyM3JkpGTKKtshe/c22ZVlMtKbyUxrJhmGrD3bZZcWSy63jLSmMlMyZVeWySotll2xR0ZKExmpWTLcXtllJbLKSkIXcalZMlOayA5Uyt67U3bFHsmbKjOlieRJll2+W3b5rtBFfHKGjOQM2VZQdvluybc3dMGfnC7Dmyq7siy0v22FLoiTM0Lns2JPTfjxpMhISg1d7FfuVeiq05DhTZNcbtmVpaELYCnUdlKq7IBf8pfX/NDc3tCFoK9CsoM1yz1VYaSqrzKp6kLeK/krVev/pMyqf45WoPaAcLlr+t+XYYbarrXM4P8sGomGcNmLowNjBU4xVhCJIx4vhkvpQ58J/0KzITJsu+FfVRUXl8qyEnMYOTkZ2rZtzyG3s21LwfXfytWyk4zkdEmSVVaiis9flrVjo9wd+sjT6WxZuwrlW1Ygq3iDJMnMPk6unHYKbPhOdtmOmgY9yaGLYl956DeylqVaF92GISMjR/ae4loX7mZmK8nllrWzMPQbbUlGerbMzFay9+6StXuLFAzISM6Q2ewYGZ5kWbuLZO0uklxumZm5MjNbyPaVy96zTdbeXTLTmsls0kJGUlooNJUWy7ZtmRnNZaY3l20FZJeVyN67S0Zymoz07FC4Kd8dCkIBn8y0pjLSmkoyZJfvkr13l+RJkpnaVEZ
yeqi/8l2hmZaUJjJSMmW43LLL98iu2B2a4UhpEgpCAZ/sij2yfeUyklJlJDeR3N6q0FQqya4KWOmh30BXlsqu3CvDkxRa7kkK9VdZVnUu0qWktNAJrCyTXbk3NGORlC7DkxwKdL69UsAnw5sqIyktNKPi2xtabrpCy70pSks2VVqyMzTb402W4U0Nzaj4K0IhT5K8KTI8KaEZFX95KFC5vaFl7qrZHl9F6OfnTQnNAkmhNvwVoWDqSZbcSVUzTuWh8OVODs06yaiZXTLMmhmqYKBqtscXmkXyJFXNRIVmuCRVzXwlVc2GhWadDLenalbNUzWrUxmaUXEnhdqRQuco4AvNOLiTZLg9odmGQKXsYFBG9UyUVDPbY5g17VqBUNgMBkLn3l01a1W9ra3QrFXVLMm+M1GGO3QcoW39NaHZ7ZVsu2aGqnqWrCqY2kF/eCYqPGtV3YZp1swCWcHQ8mAg1J/Ls8+Mij903qq3ta39tzVMyaqafdlnJqpJk2TtLtkT2t501czgVPe370yUVNOGYVRt6w792993tic8a1VVm2HsM4NjV81a7bNt1XEoGJCtA8xEWYGqWTJ3zX+Hgv7Qz796W0Ph/mq2ddeaXaq1rRWs+cVAeFs7NIaDgVA/1cdhBUN/bCt0Hg0j9Ldp1rxvNA5+LJlNUrRrd/lB1x9NjHqOo8FpgIfSpEmKdu83VhrggRxMozmUo+NAMjNTtGvX4f+3xUjNkqv5cVGs6PCZpqHs7PSI92NGKU58S2bLt6xAcifJc+LP5Gp2jCoWvSYF/HK17iL/8o/k/+4DSaEwk3TuUNkVZQquWyb/qi/kanOiPGcMlqtlBwW3rlFw80rJtuQ6tpvcbU6SbFvB7T/K2r5eZlYruVp1luFNkR3wySrZLNu3V67mx4Uu4qVQeNm9rSYwVKm+xat6u/Dyqjxd9xY4RCYrJ0N+B8EaSM/JUDljBQ6k5WRoL2MFDjBWEInUnAyV/cTHC0EpBgKF/5Ndvlvudj1lGKb8PyyRb1mB3O37SKYp//J58tu2zJx2SvnZCJlZubLKdyvwwxIZyRlyH98z/NkWdb98v/bN9Gx5OvTZb7m7dVepdddaywy3V66c4/fb1jDdMrJyD7DcrJk92Xc5AQkAAAA/IQSlGKj8/GVZu7bIbNFB3pMvVsXnL8ts0V7J5w+X4fbK6nmVgsXr5T6+R+iWE0lmShN5T7wgwZUDAAAAkAhKUWdV7JG1a4tcbU+RtX2dKubPkJGSqZSLbwt/zsHMbCkzs2WCKwUAAABwMASlKLO2rpEkebv3lyv7WPlW/EPuY06SmdY0wZUBAAAAcIqgFGXBraslwyVXzvEy3ElKOsBnjAAAAAAc3cxDb4JIBLd+L7P5seHHIgMAAABoeAhKUWRbAQWLfpCrZcdElwIAAADgCBCUosjavl4K+uT6/+3cS0jU/R7H8c+MVlbPE2oXs4wieAojKJhIOptyiiTKihYJlQRpLaKLQYsuVIsSmlpUoKVdttUqIroRZAQFlZJFgVhplpSXHPWESQMz8z2LA3L6PzWnY3L+zcz7tdL5IX7VD37nw2806y+3RwEAAADwCyhKQyjS8UaSuFECAAAA4hxFaQhFOt7I88dYef/IdHsUAAAAAL+AojSEIh1vuE0CAAAAEgBFaYhE+4KyL90UJQAAACABUJSGSKT9tSTxjxwAAACABEBRGiLW/0950v6Ud2yO26MAAAAA+EWpbg+QKIbNXqzUGf+Qx8u3FAAAAIh3PKsfIh5vqjxpf7o9BgAAAIAhwEvvAAAAAMCBogQAAAAADhQlAAAAAHCgKAEAAACAA0UJAAAAABwoSgAAAADgQFECAAAAAAeKEgAAAAA4UJQAAAAAwIGiBAAAAAAOFCUAAAAAcEh1e4Ch4PV6kvrzI76QF/wssoKfRVbws8gK/heJkpfBfh0eM7MhngUAAAAA4hovvQMAAAAAB4oSAAAAADhQlAAAAADAgaIEAAAAAA4UJQAAAAB
woCgBAAAAgANFCQAAAAAcKEoAAAAA4EBRAgAAAAAHitIvePv2rYqKilRQUKCioiK1tLS4PRJc0tPTo5SkYNkAAAYdSURBVM2bN6ugoECFhYXatm2buru7JUnPnj3TypUrVVBQoE2bNikYDA58XKwzJL6KigrNnDlTr169kkRW8HehUEiHDh3S0qVLVVhYqAMHDkiKvX/YTcnr3r17Wr16tVatWqXCwkLduXNHEnmBFAgE5Pf7v9k50uCzkTS5MQxacXGxXb161czMrl69asXFxS5PBLf09PTYo0ePBt4/evSo7d2716LRqC1ZssRqa2vNzKyystL27NljZhbzDInv5cuXVlJSYosWLbLGxkaygu86fPiwlZeXWzQaNTOzT58+mVns/cNuSk7RaNTmzZtnjY2NZmbW0NBgc+fOtUgkQl5gtbW19vHjR8vPzx/IiNngf5ckS24oSoPU1dVlPp/PwuGwmZmFw2Hz+XwWDAZdngy/g9u3b9vGjRvt+fPntnz58oHHg8GgzZ0718ws5hkSWygUsrVr19r79+8HlhZZgVNfX5/5fD7r6+v75vFY+4fdlLyi0ajNnz/f6urqzMzsyZMntnTpUvKCb/xnURpsNpIpN6lu32jFq7a2NmVlZSklJUWSlJKSogkTJqitrU2ZmZkuTwc3RaNRXbp0SX6/X21tbZo0adLAWWZmpqLRqHp7e2OepaenuzE6/k9OnTqllStXasqUKQOPkRU4tba2Kj09XRUVFXr8+LFGjx6tnTt3Ki0t7Yf7x8zYTUnK4/Ho5MmT2rp1q0aNGqUvX76ouro65vMV8pLcBpuNZMoNf6MEDLHDhw9r1KhR2rBhg9uj4DdUX1+vFy9eaN26dW6Pgt9cOBxWa2urZs2apStXrmj37t3avn27+vv73R4Nv6FwOKzq6mqdPn1a9+7d05kzZ7Rr1y7yAvwCbpQGKTs7Wx0dHYpEIkpJSVEkElFnZ6eys7PdHg0uCgQCevfunaqqquT1epWdna2PHz8OnHd3d8vj8Sg9PT3mGRJXbW2tmpubtXjxYklSe3u7SkpKVFxcTFbwjUmTJik1NVUrVqyQJM2ZM0cZGRlKS0v74f4xM3ZTkmpoaFBnZ6d8Pp8kyefzaeTIkRoxYgR5wXfFei4bKxvJlBtulAZp7Nixys3N1fXr1yVJ169fV25ubsJdOeLnnThxQi9fvlRlZaWGDx8uSZo9e7a+fv2quro6SdLly5e1bNmy/3qGxLVlyxY9ePBANTU1qqmp0cSJE3XhwgWVlpaSFXwjMzNTeXl5evjwoaR//5epYDCoadOm/XD/sJuS18SJE9Xe3q7m5mZJUlNTk7q6ujR16lTygu+K9fMf7Fmi8ZiZuT1EvGpqatKePXv0+fNnjRkzRoFAQNOnT3d7LLjg9evXWrFihaZNm6a0tDRJUk5OjiorK/X06VMdOnRIoVBIkydP1vHjxzVu3DhJinmG5OD3+1VVVaUZM2aQFfxNa2ur9u3bp97eXqWmpqqsrEwLFy6MuX/YTcnr2rVrOnfunDwejyRpx44dWrJkCXmBjhw5ojt37qirq0sZGRlKT0/XjRs3Bp2NZMkNRQkAAAAAHHjpHQAAAAA4UJQAAAAAwIGiBAAAAAAOFCUAAAAAcKAoAQAAAIADRQkAAAAAHChKAAAAAOBAUQIAAAAAB4oSACAu+f1+XbhwQYWFhfL5fCorK1MoFNLZs2e1du1ahcNhSdLFixe1fPlyhUIhlycGAMQTihIAIG7dunVL58+f1927d9XY2KgrV66otLRUw4YN05kzZ9TS0qITJ07o+PHjGjFihNvjAgDiSKrbAwAAMFjFxcXKysqSJOXn56uhoUFer1eBQEBr1qzRzZs3VVpaqlmzZrk8KQAg3nCjBACIW+PHjx94e+TIkerv75ck5eTkKC8vTx8+fND69evdGg8AEMcoSgCAhHP//n3V19drwYIFOnbsmNvjAADiEEUJAJBQuru7tX//fpWXl+v
o0aOqqanR/fv33R4LABBnKEoAgIRy8OBB+f1+LVy4UBkZGSovL9f+/fvV09Pj9mgAgDjiMTNzewgAAAAA+J1wowQAAAAADhQlAAAAAHCgKAEAAACAA0UJAAAAABwoSgAAAADgQFECAAAAAAeKEgAAAAA4UJQAAAAAwIGiBAAAAAAO/wJINFT8ouENxQAAAABJRU5ErkJggg==\n",
-      "text/plain": [
-       "<Figure size 1008x432 with 1 Axes>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "df_byte = pd.DataFrame()\n",
-    "df_byte[\"Loads / Loop Iteration\"] = (df_vldvst.set_index(\"nx\")[\"Vector Loads / Loop Iteration\"] + df_ldst.set_index(\"nx\")[\"Loads / Loop Iteration\"])*8\n",
-    "df_byte[\"Stores / Loop Iteration\"] = (df_vldvst.set_index(\"nx\")[\"Vector Stores / Loop Iteration\"] + df_ldst.set_index(\"nx\")[\"Stores / Loop Iteration\"])*8\n",
-    "ax = df_byte.plot()\n",
-    "ax.set_ylabel(\"Bytes / Loop Iteration\");"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Mean byte loaded: 37.52662546714877\tMean byte stored: 8.428951320998907\n"
-     ]
-    }
-   ],
-   "source": [
-    "import numpy as np\n",
-    "mean_byte_ld = np.polyfit(df_byte[df_byte.index > 200].index, df_byte[df_byte.index > 200][\"Loads / Loop Iteration\"], 0)[0]\n",
-    "mean_byte_st = np.polyfit(df_byte[df_byte.index > 200].index, df_byte[df_byte.index > 200][\"Stores / Loop Iteration\"], 0)[0]\n",
-    "print(\"Mean byte loaded: {}\\tMean byte stored: {}\".format(mean_byte_ld, mean_byte_st))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "*Not really a* <a name=\"task2-c\"></a>**TASK C**: We can combine this information with the cycles measured in Task 1 to compute the bandwidth in bytes exchanged per cycle."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_bandwidth = pd.DataFrame()\n",
-    "df_bandwidth[\"Bandwidth / Byte/Cycle\"] = (df_byte[\"Loads / Loop Iteration\"] + df_byte[\"Stores / Loop Iteration\"]) / df.set_index(\"nx\")[\"Cycles / Loop Iteration\"]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Let's display it as a function of `nx` and also compare it to the available L1 cache bandwidth in a second (sub-)plot. Non-interactive users: call `make graph_task2c`."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/png": "\n",
-      "text/plain": [
-       "<Figure size 1008x432 with 2 Axes>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n",
-    "for ax in [ax1, ax2]:\n",
-    "    df_bandwidth[\"Bandwidth / Byte/Cycle\"].plot(ax=ax, legend=True, label=\"Jacobi Bandwidth\")\n",
-    "    ax.set_ylabel(\"Byte/Cycle\")\n",
-    "ax2.axhline(2*16, color=sns.color_palette()[1], label=\"L1 Bandwidth\");\n",
-    "ax2.legend();"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "As you can see, we are quite a bit away from the available L1 cache bandwidth. Can you think of reasons why?"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Task E1: Measuring FlOps\n",
-    "<a name=\"taske1\"></a>\n",
-    "\n",
-    "If you still have time, feel free to work on the following extended task.\n",
-    "\n",
-    "\n",
-    "**TASK**: Please measure counters for _vectorized_ floating point operations and _scalar_ floating point operations. These two counters cannot be measured during the same run either, so please see the TODOs in [`poisson2d.sflops.c`](/edit/Tasks/poisson2d.sflops.c) and [`poisson2d.vflops.c`](/edit/Tasks/poisson2d.vflops.c). By now you should be able to find the names of the counters yourself (*Hint: they include the words scalar and vector…*).\n",
-    "\n",
-    "As usual, compile, test, and bench-run your program.\n",
-    "\n",
-    "[Back to top](#toc)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 42,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "bsub -W 60 -nnodes 1 -Is -P GEN110 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.sflop.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.sflop.bin.csv\n",
-      "Job <4299> is submitted to default queue <batch>.\n",
-      "<<Waiting for dispatch ...>>\n",
-      "<<Starting on login1>>\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,4,0.0010,96000,480,480\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,8,0.0011,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,12,0.0012,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,16,0.0012,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,20,0.0013,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,24,0.0014,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,28,0.0014,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,32,0.0015,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,36,0.0015,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,40,0.0016,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,44,0.0017,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,48,0.0017,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,52,0.0018,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,56,0.0019,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,60,0.0020,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,64,0.0021,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,68,0.0022,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,72,0.0022,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,76,0.0022,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,80,0.0023,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,84,0.0024,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,88,0.0024,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,92,0.0025,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,96,0.0025,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,100,0.0028,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,104,0.0027,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,108,0.0027,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,112,0.0029,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,116,0.0028,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,120,0.0029,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,124,0.0030,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,128,0.0031,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,132,0.0031,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,136,0.0032,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,140,0.0033,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,144,0.0034,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,148,0.0034,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,152,0.0034,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,156,0.0035,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,160,0.0036,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,164,0.0037,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,168,0.0037,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,172,0.0038,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,176,0.0038,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,180,0.0039,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,184,0.0039,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,188,0.0040,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,192,0.0041,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,196,0.0041,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,200,0.0042,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,204,0.0043,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,208,0.0043,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,212,0.0044,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,216,0.0045,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,220,0.0046,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,224,0.0047,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,228,0.0047,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,232,0.0047,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,236,0.0048,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,240,0.0049,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,244,0.0049,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,248,0.0050,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,252,0.0050,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,256,0.0051,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,260,0.0052,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,264,0.0053,0,0,0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,268,0.0054,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,272,0.0055,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,276,0.0055,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,280,0.0055,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,284,0.0056,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,288,0.0057,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,292,0.0057,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,296,0.0058,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,300,0.0059,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,304,0.0059,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,308,0.0059,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,312,0.0060,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,316,0.0061,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,320,0.0061,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,324,0.0062,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,328,0.0063,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,332,0.0065,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,336,0.0064,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,340,0.0065,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,344,0.0065,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,348,0.0066,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,352,0.0067,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,356,0.0067,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,360,0.0068,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,364,0.0069,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,368,0.0070,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,372,0.0070,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,376,0.0071,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,380,0.0072,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,384,0.0072,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,388,0.0072,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,392,0.0075,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,396,0.0074,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,400,0.0075,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,404,0.0075,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,408,0.0076,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,412,0.0077,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,416,0.0077,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,420,0.0078,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,424,0.0079,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,428,0.0079,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,432,0.0080,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,436,0.0080,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,440,0.0081,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,444,0.0083,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,448,0.0084,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,452,0.0084,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,456,0.0084,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,460,0.0085,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,464,0.0086,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,468,0.0086,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,472,0.0088,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,476,0.0087,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,480,0.0088,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,484,0.0089,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,488,0.0089,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,492,0.0090,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,496,0.0090,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,500,0.0092,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,504,0.0092,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,508,0.0093,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,512,0.0092,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,516,0.0093,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,520,0.0094,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,524,0.0094,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,528,0.0094,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,532,0.0095,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,536,0.0096,0,0,0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,540,0.0098,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,544,0.0097,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,548,0.0098,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,552,0.0099,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,556,0.0099,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,560,0.0100,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,564,0.0102,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,568,0.0102,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,572,0.0103,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,576,0.0103,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,580,0.0105,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,584,0.0104,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,588,0.0106,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,592,0.0107,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,596,0.0106,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,600,0.0107,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,604,0.0109,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,608,0.0109,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,612,0.0109,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,616,0.0110,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,620,0.0117,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,624,0.0112,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,628,0.0111,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,632,0.0112,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,636,0.0113,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,640,0.0115,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,644,0.0114,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,648,0.0115,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,652,0.0116,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,656,0.0117,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,660,0.0117,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,664,0.0118,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,668,0.0119,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,672,0.0119,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,676,0.0119,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,680,0.0120,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,684,0.0121,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,688,0.0122,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,692,0.0122,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,696,0.0123,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,700,0.0124,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,704,0.0124,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,708,0.0125,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,712,0.0125,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,716,0.0126,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,720,0.0126,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,724,0.0127,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,728,0.0128,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,732,0.0128,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,736,0.0129,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,740,0.0130,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,744,0.0130,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,748,0.0131,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,752,0.0131,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,756,0.0132,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,760,0.0133,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,764,0.0134,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,768,0.0134,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,772,0.0136,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,776,0.0136,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,780,0.0136,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,784,0.0137,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,788,0.0138,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,792,0.0139,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,796,0.0139,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,800,0.0140,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,804,0.0141,0,0,0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,808,0.0142,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,812,0.0142,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,816,0.0143,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,820,0.0143,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,824,0.0144,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,828,0.0145,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,832,0.0145,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,836,0.0146,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,840,0.0147,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,844,0.0147,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,848,0.0148,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,852,0.0149,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,856,0.0149,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,860,0.0150,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,864,0.0150,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,868,0.0152,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,872,0.0151,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,876,0.0153,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,880,0.0153,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,884,0.0153,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,888,0.0155,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,892,0.0156,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,896,0.0156,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,900,0.0158,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,904,0.0158,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,908,0.0159,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,912,0.0159,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,916,0.0162,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,920,0.0162,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,924,0.0162,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,928,0.0162,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,932,0.0163,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,936,0.0164,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,940,0.0165,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,944,0.0165,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,948,0.0166,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,952,0.0167,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,956,0.0168,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,960,0.0168,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,964,0.0172,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,968,0.0173,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,972,0.0173,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,976,0.0173,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,980,0.0175,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,984,0.0176,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,988,0.0175,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,992,0.0176,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,996,0.0178,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,1000,0.0177,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,1004,0.0178,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,1008,0.0178,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,1012,0.0181,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,1016,0.0180,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,1020,0.0182,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n",
-      "200,32,1024,0.0179,0,0,0\n",
-      "mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.sflop.bin.csv .\n",
-      "bsub -W 60 -nnodes 1 -Is -P GEN110 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vflop.bin /gpfs/wolf/gen110/scratch/aherten//poisson2d.vflop.bin.csv\n",
-      "Job <4300> is submitted to default queue <batch>.\n",
-      "<<Waiting for dispatch ...>>\n",
-      "<<Starting on login1>>\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,4,0.0010,0,0,0\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,8,0.0011,150000,750,750\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,12,0.0012,246000,1230,1230\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,16,0.0012,342000,1710,1710\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,20,0.0013,438000,2190,2190\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,24,0.0014,534000,2670,2670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,28,0.0014,630000,3150,3150\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,32,0.0015,726000,3630,3630\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,36,0.0016,822000,4110,4110\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,40,0.0016,918000,4590,4590\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,44,0.0017,1014000,5070,5070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,48,0.0018,1110000,5550,5550\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,52,0.0018,1206000,6030,6030\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,56,0.0020,1302000,6510,6510\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,60,0.0020,1398000,6990,6990\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,64,0.0021,1494000,7470,7470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,68,0.0022,1590000,7950,7950\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,72,0.0022,1686000,8430,8430\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,76,0.0022,1782000,8910,8910\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,80,0.0023,1878000,9390,9390\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,84,0.0024,1974000,9870,9870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,88,0.0024,2070000,10350,10350\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,92,0.0025,2166000,10830,10830\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,96,0.0025,2262000,11310,11310\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,100,0.0026,2358000,11790,11790\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,104,0.0027,2454000,12270,12270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,108,0.0028,2550000,12750,12750\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,112,0.0028,2646000,13230,13230\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,116,0.0029,2742000,13710,13710\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,120,0.0032,2838000,14190,14190\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,124,0.0030,2934000,14670,14670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,128,0.0031,3030000,15150,15150\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,132,0.0031,3126000,15630,15630\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,136,0.0032,3222000,16110,16110\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,140,0.0033,3318000,16590,16590\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,144,0.0033,3414000,17070,17070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,148,0.0034,3510000,17550,17550\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,152,0.0034,3606000,18030,18030\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,156,0.0036,3702000,18510,18510\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,160,0.0036,3798000,18990,18990\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,164,0.0036,3894000,19470,19470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,168,0.0037,3990000,19950,19950\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,172,0.0038,4086000,20430,20430\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,176,0.0039,4182000,20910,20910\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,180,0.0039,4278000,21390,21390\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,184,0.0040,4374000,21870,21870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,188,0.0040,4470000,22350,22350\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,192,0.0041,4566000,22830,22830\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,196,0.0042,4662000,23310,23310\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,200,0.0042,4758000,23790,23790\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,204,0.0043,4854000,24270,24270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,208,0.0043,4950000,24750,24750\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,212,0.0044,5046000,25230,25230\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,216,0.0045,5142000,25710,25710\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,220,0.0047,5238000,26190,26190\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,224,0.0046,5334000,26670,26670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,228,0.0047,5430000,27150,27150\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,232,0.0047,5526000,27630,27630\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,236,0.0048,5622000,28110,28110\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,240,0.0049,5718000,28590,28590\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,244,0.0050,5814000,29070,29070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,248,0.0050,5910000,29550,29550\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,252,0.0051,6006000,30030,30030\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,256,0.0051,6102000,30510,30510\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,260,0.0052,6198000,30990,30990\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,264,0.0052,6294000,31470,31470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,268,0.0053,6390000,31950,31950\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,272,0.0054,6486000,32430,32430\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,276,0.0058,6582000,32910,32910\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,280,0.0055,6678000,33390,33390\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,284,0.0056,6774000,33870,33870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,288,0.0056,6870000,34350,34350\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,292,0.0057,6966000,34830,34830\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,296,0.0058,7062000,35310,35310\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,300,0.0059,7158000,35790,35790\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,304,0.0060,7254000,36270,36270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,308,0.0060,7350000,36750,36750\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,312,0.0061,7446000,37230,37230\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,316,0.0061,7542000,37710,37710\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,320,0.0062,7638000,38190,38190\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,324,0.0063,7734000,38670,38670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,328,0.0064,7830000,39150,39150\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,332,0.0064,7926000,39630,39630\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,336,0.0064,8022000,40110,40110\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,340,0.0065,8118000,40590,40590\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,344,0.0066,8214000,41070,41070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,348,0.0066,8310000,41550,41550\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,352,0.0068,8406000,42030,42030\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,356,0.0069,8502000,42510,42510\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,360,0.0068,8598000,42990,42990\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,364,0.0069,8694000,43470,43470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,368,0.0069,8790000,43950,43950\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,372,0.0070,8886000,44430,44430\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,376,0.0071,8982000,44910,44910\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,380,0.0071,9078000,45390,45390\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,384,0.0072,9174000,45870,45870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,388,0.0073,9270000,46350,46350\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,392,0.0074,9366000,46830,46830\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,396,0.0074,9462000,47310,47310\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,400,0.0075,9558000,47790,47790\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,404,0.0075,9654000,48270,48270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,408,0.0076,9750000,48750,48750\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,412,0.0077,9846000,49230,49230\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,416,0.0077,9942000,49710,49710\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,420,0.0078,10038000,50190,50190\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,424,0.0079,10134000,50670,50670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,428,0.0079,10230000,51150,51150\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,432,0.0080,10326000,51630,51630\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,436,0.0080,10422000,52110,52110\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,440,0.0081,10518000,52590,52590\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,444,0.0082,10614000,53070,53070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,448,0.0082,10710000,53550,53550\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,452,0.0083,10806000,54030,54030\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,456,0.0084,10902000,54510,54510\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,460,0.0085,10998000,54990,54990\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,464,0.0085,11094000,55470,55470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,468,0.0086,11190000,55950,55950\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,472,0.0088,11286000,56430,56430\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,476,0.0089,11382000,56910,56910\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,480,0.0088,11478000,57390,57390\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,484,0.0088,11574000,57870,57870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,488,0.0089,11670000,58350,58350\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,492,0.0090,11766000,58830,58830\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,496,0.0090,11862000,59310,59310\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,500,0.0091,11958000,59790,59790\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,504,0.0092,12054000,60270,60270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,508,0.0094,12150000,60750,60750\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,512,0.0092,12246000,61230,61230\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,516,0.0093,12342000,61710,61710\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,520,0.0093,12438000,62190,62190\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,524,0.0094,12534000,62670,62670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,528,0.0094,12630000,63150,63150\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,532,0.0095,12726000,63630,63630\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,536,0.0096,12822000,64110,64110\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,540,0.0100,12918000,64590,64590\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,544,0.0097,13014000,65070,65070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,548,0.0098,13110000,65550,65550\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,552,0.0099,13206000,66030,66030\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,556,0.0100,13302000,66510,66510\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,560,0.0101,13398000,66990,66990\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,564,0.0102,13494000,67470,67470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,568,0.0103,13590000,67950,67950\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,572,0.0103,13686000,68430,68430\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,576,0.0103,13782000,68910,68910\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,580,0.0105,13878000,69390,69390\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,584,0.0105,13974000,69870,69870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,588,0.0106,14070000,70350,70350\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,592,0.0106,14166000,70830,70830\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,596,0.0106,14262000,71310,71310\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,600,0.0108,14358000,71790,71790\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,604,0.0109,14454000,72270,72270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,608,0.0109,14550000,72750,72750\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,612,0.0109,14646000,73230,73230\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,616,0.0111,14742000,73710,73710\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,620,0.0111,14838000,74190,74190\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,624,0.0112,14934000,74670,74670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,628,0.0112,15030000,75150,75150\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,632,0.0112,15126000,75630,75630\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,636,0.0114,15222000,76110,76110\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,640,0.0114,15318000,76590,76590\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,644,0.0114,15414000,77070,77070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,648,0.0115,15510000,77550,77550\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,652,0.0117,15606000,78030,78030\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,656,0.0117,15702000,78510,78510\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,660,0.0117,15798000,78990,78990\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,664,0.0118,15894000,79470,79470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,668,0.0120,15990000,79950,79950\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,672,0.0120,16086000,80430,80430\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,676,0.0121,16182000,80910,80910\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,680,0.0120,16278000,81390,81390\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,684,0.0121,16374000,81870,81870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,688,0.0122,16470000,82350,82350\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,692,0.0122,16566000,82830,82830\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,696,0.0124,16662000,83310,83310\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,700,0.0124,16758000,83790,83790\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,704,0.0124,16854000,84270,84270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,708,0.0125,16950000,84750,84750\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,712,0.0125,17046000,85230,85230\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,716,0.0126,17142000,85710,85710\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,720,0.0126,17238000,86190,86190\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,724,0.0127,17334000,86670,86670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,728,0.0128,17430000,87150,87150\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,732,0.0130,17526000,87630,87630\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,736,0.0129,17622000,88110,88110\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,740,0.0129,17718000,88590,88590\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,744,0.0130,17814000,89070,89070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,748,0.0131,17910000,89550,89550\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,752,0.0132,18006000,90030,90030\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,756,0.0132,18102000,90510,90510\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,760,0.0133,18198000,90990,90990\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,764,0.0134,18294000,91470,91470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,768,0.0135,18390000,91950,91950\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,772,0.0136,18486000,92430,92430\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,776,0.0136,18582000,92910,92910\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,780,0.0137,18678000,93390,93390\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,784,0.0137,18774000,93870,93870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,788,0.0138,18870000,94350,94350\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,792,0.0138,18966000,94830,94830\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,796,0.0140,19062000,95310,95310\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,800,0.0140,19158000,95790,95790\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,804,0.0140,19254000,96270,96270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,808,0.0141,19350000,96750,96750\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,812,0.0142,19446000,97230,97230\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,816,0.0143,19542000,97710,97710\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,820,0.0143,19638000,98190,98190\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,824,0.0144,19734000,98670,98670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,828,0.0146,19830000,99150,99150\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,832,0.0146,19926000,99630,99630\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,836,0.0146,20022000,100110,100110\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,840,0.0147,20118000,100590,100590\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,844,0.0147,20214000,101070,101070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,848,0.0148,20310000,101550,101550\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,852,0.0148,20406000,102030,102030\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,856,0.0150,20502000,102510,102510\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,860,0.0150,20598000,102990,102990\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,864,0.0151,20694000,103470,103470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,868,0.0151,20790000,103950,103950\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,872,0.0152,20886000,104430,104430\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,876,0.0153,20982000,104910,104910\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,880,0.0154,21078000,105390,105390\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,884,0.0154,21174000,105870,105870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,888,0.0154,21270000,106350,106350\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,892,0.0155,21366000,106830,106830\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,896,0.0157,21462000,107310,107310\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,900,0.0156,21558000,107790,107790\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,904,0.0158,21654000,108270,108270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,908,0.0159,21750000,108750,108750\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,912,0.0159,21846000,109230,109230\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,916,0.0161,21942000,109710,109710\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,920,0.0161,22038000,110190,110190\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,924,0.0162,22134000,110670,110670\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,928,0.0164,22230000,111150,111150\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,932,0.0164,22326000,111630,111630\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,936,0.0164,22422000,112110,112110\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,940,0.0164,22518000,112590,112590\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,944,0.0165,22614000,113070,113070\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,948,0.0167,22710000,113550,113550\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,952,0.0168,22806000,114030,114030\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,956,0.0168,22902000,114510,114510\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,960,0.0168,22998000,114990,114990\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,964,0.0174,23094000,115470,115470\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,968,0.0172,23190000,115950,115950\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,972,0.0173,23286000,116430,116430\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,976,0.0172,23382000,116910,116910\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,980,0.0174,23478000,117390,117390\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,984,0.0174,23574000,117870,117870\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,988,0.0176,23670000,118350,118350\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,992,0.0176,23766000,118830,118830\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,996,0.0179,23862000,119310,119310\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,1000,0.0177,23958000,119790,119790\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,1004,0.0178,24054000,120270,120270\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,1008,0.0178,24150000,120750,120750\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,1012,0.0180,24246000,121230,121230\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,1016,0.0180,24342000,121710,121710\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,1020,0.0181,24438000,122190,122190\n",
-      "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n",
-      "200,32,1024,0.0178,24534000,122670,122670\n",
-      "mv /gpfs/wolf/gen110/scratch/aherten//poisson2d.vflop.bin.csv .\n"
-     ]
-    }
-   ],
-   "source": [
-    "!make bench_task4"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 47,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_sflop = pd.read_csv(\"poisson2d.sflop.bin.csv\", skiprows=range(2, 50000, 2))\n",
-    "df_vflop = pd.read_csv(\"poisson2d.vflop.bin.csv\", skiprows=range(2, 50000, 2))\n",
-    "df_flop = pd.concat([df_sflop.set_index(\"nx\"), df_vflop.set_index(\"nx\")[['PM_VECTOR_FLOP_CMPL (total)', 'PM_VECTOR_FLOP_CMPL (min)', ' PM_VECTOR_FLOP_CMPL (max)']]], axis=1).reset_index()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The name of the vector counter is a bit misleading: it measures floating point *instructions*, not floating point operations. To get *real* floating point operations, each value needs to be multiplied by the vector width (2). We can plot the values afterwards (non-interactive: `make graph_task4`)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 49,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "common.normalize(df_flop, \"PM_SCALAR_FLOP_CMPL (min)\", \"Scalar FlOps / Loop Iteration\")\n",
-    "common.normalize(df_flop, \"PM_VECTOR_FLOP_CMPL (min)\", \"Vector Instructions / Loop Iteration\")\n",
-    "df_flop[\"Vector FlOps / Loop Iteration\"] = df_flop[\"Vector Instructions / Loop Iteration\"] * 2"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 50,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/png": "\n",
-      "text/plain": [
-       "<Figure size 1008x432 with 1 Axes>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "df_flop.set_index(\"nx\")[[\"Scalar FlOps / Loop Iteration\", \"Vector FlOps / Loop Iteration\"]].plot();"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "With that measured, we can determine the Arithmetic Intensity; the balance of floating point operations to bytes transmitted:\n",
-    "\n",
-    "\\begin{align}\n",
-    "\\text{AI}^\\text{emp} = I_\\text{flop} / I_\\text{mem} \\text{,}\n",
-    "\\end{align}\n",
-    "\n",
-    "with $I$ denoting the respective amount. This is the empirically determined Arithmetic Intensity.\n",
-    "\n",
-    "In the non-interactive version of the Notebook, please plot the graph by calling `make graph_task4-2` in the terminal."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 66,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "I_flop_scalar = df_flop.set_index(\"nx\")[\"Scalar FlOps / Loop Iteration\"]\n",
-    "I_flop_vector = df_flop.set_index(\"nx\")[\"Vector FlOps / Loop Iteration\"]\n",
-    "I_mem_load    = df_byte[\"Loads / Loop Iteration\"]\n",
-    "I_mem_store   = df_byte[\"Stores / Loop Iteration\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 75,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/png": "\n",
-      "text/plain": [
-       "<Figure size 1008x432 with 1 Axes>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "df_ai = pd.DataFrame()\n",
-    "df_ai[\"Arithmetic Intensity\"] = (I_flop_scalar + I_flop_vector) / (I_mem_load + I_mem_store)\n",
-    "ax = df_ai.plot();\n",
-    "ax.set_ylabel(\"Byte/FlOp\");"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Thinking back to the first lecture of the tutorial, what Arithmetic Intensity did you expect?"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Task E2: Measuring a Larger Range\n",
-    "<a name=\"taske2\"></a>\n",
-    "\n",
-    "If you still have time, you might venture into your own benchmarking adventure.\n",
-    "\n",
-    "\n",
-    "**TASK**: Revisit the counters measured above for a larger range of `nx`. Right now, we only studied `nx` until 1000. New effects appear above that value – partly only well above, though ($nx > 15000$).\n",
-    "\n",
-    "You're on your own here. Edit the `bench.sh` script to change the range and the stepping increments.\n",
-    "\n",
-    "**Good luck!**\n",
-    "\n",
-    "[Back to top](#toc)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.1"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
+{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Hands-On: Performance Counters\n", "\n", "This Notebook is part of the exercises for the SC19 Tutorial \u00bbApplication Porting and Optimization on GPU-accelerated POWER Architectures\u00ab. It is to be run on a POWER9 machine; in the tutorial: on Ascent, the POWER9 training cluster of Oak Ridge National Lab.\n", "\n", "This Notebook can be run interactively on Ascent. If this capability is unavailable to you, use it as a description for executing the tasks on Ascent via a shell access. During data evaluation, the Notebook mentions the corresponding commands to execute in case you are not able to run the Notebook interactively directly on Ascent.\n", "\n", "## Table of Contents\n", "<a name=\"toc\"></a>\n", "\n", "* [Task 1: Measuring Cycles and Instructions](#task1)\n", "* [Task 2: Measuring Loads and Stores](#task2)\n", "  - [A: Loads and Stores](#task2-a)\n", "  - [B: More Loads and Stores](#task2-b)\n", "  - [C: Bandwidth](#task2-c)\n", "* [Task E1: Measuring FLOP](#taske1)\n", "* [Task E2: Measuring a Greater Range](#taske2)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Task 1: Measuring Cycles and Instructions\n", "<a name=\"task1\"></a>\n", "\n", "Throughout this exercise, the core loop of the Jacobi algorithm is instrumented and analyzed. The part in question is\n", "\n", "```c\n", "for (int iy = iy_start; iy < iy_end; iy++)\n", "{\n", "    for( int ix = ix_start; ix < ix_end; ix++ )\n", "    {\n", "        Anew[iy*nx+ix] = -0.25 * (rhs[iy*nx+ix] - (A[ iy   *nx+ix+1] + A[ iy   *nx+ix-1]\n", "                                                +  A[(iy-1)*nx+ix  ] + A[(iy+1)*nx+ix  ]));\n", "        error = fmaxr( error, fabsr(Anew[iy*nx+ix]-A[iy*nx+ix]));\n", "    }\n", "}\n", "```\n", "\n", "The code is instrumented using PAPI. The API routine `PAPI_add_named_event()` is used to add *named* PMU events outside of the relaxation iteration. 
After that, calls to `PAPI_start()`\n", "and `PAPI_stop()` can be used to count how often a PMU event is incremented.\n", "\n", "For the first task, we will measure quantities often used to characterize an application: cycles and instructions.\n", "\n", "**TASK**: Please measure counters for completed instructions and run cycles. See the TODOs in file [`poisson2d.ins_cyc.c`](poisson2d.ins_cyc.c). You can either edit the files with Jupyter capabilities by clicking on the link of the file or selecting it in the file drawer on the left; or use a dedicated editor on the system(`vim` is available). The names of the counters to be implemented are `PM_INST_CMPL` and `PM_RUN_CYC`.\n", "\n", "After changing the source code, compile it with `make task1` or by executing the following cell (we need to change directories first, though).  \n", "*(Using the `Makefile` we have hidden quite a few intricacies from you in order to focus on the relevant content at hand. Don't worry too much about it right now\u00a0\u2013 we'll un-hide it gradually during the course of the tutorial.)*\n", "\n", "[Back to top](#toc)"]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["/autofs/nccsopen-svm1_home/aherten/OpenPOWER-SC19/Prototyping/2-Performance_Counters/Handson/Solutions\n"]}], "source": ["!pwd"]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["/autofs/nccsopen-svm1_home/aherten/OpenPOWER-SC18/2-PAPI/Compiling/Solutions\n"]}], "source": ["%cd Tasks/\n", "# Use `%cd Solutions` to look at the solutions for each task"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["gcc -DUSE_DOUBLE -Ofast -std=c99 -lm -lpapi  poisson2d.ins_cyc.c -o poisson2d.ins_cyc.bin\n"]}], "source": ["!make task1"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Before we launch 
our measurement campaign we should make sure that the program is measuring correctly. Let's invoke it, for instance, with these arguments: `./poisson2d.ins_cyc.bin 100 64 32` \u2013 see the next cell. The `100` specifies the number of iterations to perform, `64` and `32` are the size of the grid in y and x direction, respectively."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "100,64,32,0.0011,3324225,33235,33960,1859440,18357,25033\n"]}], "source": ["!./poisson2d.ins_cyc.bin 100 64 32\n", "# alternatively call !make run_task1"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Alright! That should return a comma-separated list of measurements.\n", "\n", "For the following runs, we are going to use Ascent's compute backend nodes which are not shared amongst users and also have six GPUs available (each!). We use the available batch scheduler *IBM Spectrum LSF* for this. For convenience, a call to the batch submission system is stored in the environment variable `$SC19_SUBMIT_CMD`. You are welcome to adapt it once you get more familiar with the system.\n", "\n", "For now, we want to run our first benchmarking run and measure cycles and instructions for different data sizes, as a function of `nx`. 
The Makefile holds a target for this, call it with `make bench_task1`:"]}, {"cell_type": "code", "execution_count": 2, "metadata": {"scrolled": true}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.ins_cyc.bin /gpfs/wolf/trn003/scratch/aherten//poisson2d.ins_cyc.bin.csv\n", "Job <24059> is submitted to default queue <batch>.\n", "<<Waiting for dispatch ...>>\n", "<<Starting on login1>>\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,4,0.0012,572978,2861,3639,261330,1235,4684\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,8,0.0014,1082978,5411,6189,601962,2914,5099\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,12,0.0014,1442978,7211,7989,811603,3992,5761\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,16,0.0014,1802978,9011,9789,1017305,4988,7017\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,20,0.0015,2162978,10811,11589,1221559,6002,7999\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,24,0.0016,2522978,12611,13389,1435167,7037,9259\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,28,0.0016,2882978,14411,15189,1633061,8054,9789\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC 
(max)\n", "200,32,32,0.0017,3242978,16211,16989,1842895,9092,10889\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,36,0.0018,3602978,18011,18789,2042894,10108,12457\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,40,0.0019,3962978,19811,20589,2261332,11191,14233\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,44,0.0020,4322978,21611,22389,2458267,12112,14375\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,48,0.0020,4682978,23411,24189,2658621,13164,15613\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,52,0.0020,5042978,25211,25989,2866175,14190,16864\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,56,0.0021,5402978,27011,27789,3080357,15237,21565\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,60,0.0022,5762978,28811,29589,3283103,16278,18799\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,64,0.0022,6122978,30611,31389,3587582,17820,19681\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,68,0.0025,6482978,32411,33189,3893368,19284,20847\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC 
(max)\n", "200,32,72,0.0025,6842978,34211,34989,4289441,21278,22715\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,76,0.0024,7202978,36011,36789,4208700,20936,22677\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,80,0.0025,7562978,37811,38589,4409613,21897,23855\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,84,0.0026,7922978,39611,40389,4611755,22921,24910\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,88,0.0026,8282978,41411,42189,4821904,23974,26087\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,92,0.0028,8642978,43211,43989,5104722,25036,38488\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,96,0.0028,9002978,45011,45789,5238952,26060,27927\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,100,0.0028,9362978,46811,47589,5441545,27049,29275\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,104,0.0030,9722978,48611,49389,5920763,28136,72679\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,108,0.0030,10082978,50411,51189,5853554,29106,31403\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), 
PM_RUN_CYC (max)\n", "200,32,112,0.0030,10442978,52211,52989,6053498,30123,32279\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,116,0.0031,10802978,54011,54789,6296056,31338,33377\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,120,0.0033,11162978,55811,56589,6468115,32146,33869\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,124,0.0032,11522978,57611,58389,6675248,33233,35075\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,128,0.0033,11882978,59411,60189,6894325,34338,36207\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,132,0.0034,12242978,61211,61989,7093543,35299,37463\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,136,0.0034,12602978,63011,63789,7312105,36353,48105\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,140,0.0035,12962978,64811,65589,7503757,37375,39247\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,144,0.0036,13322978,66611,67389,7692611,38277,40419\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,148,0.0037,13682978,68411,69189,7968094,39656,42113\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC 
(total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,152,0.0037,14042978,70211,70989,8122466,40468,42706\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,156,0.0038,14402978,72011,72789,8328043,41484,45104\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,160,0.0040,14762978,73811,74589,8547674,42493,54216\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,164,0.0039,15122978,75611,76389,8738805,43542,45427\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,168,0.0040,15482978,77411,78189,8948025,44560,46819\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,172,0.0040,15842978,79211,79989,9186567,45735,47659\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,176,0.0041,16202978,81011,81789,9391949,46573,70131\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,180,0.0042,16562978,82811,83589,9549568,47559,54271\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,184,0.0042,16922978,84611,85389,9766306,48609,58645\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,188,0.0043,17282978,86411,87189,9974165,49613,56721\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), 
PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,192,0.0044,17642978,88211,88989,10187263,50734,52953\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,196,0.0044,18002978,90011,90789,10386920,51763,53773\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,200,0.0045,18362978,91811,92589,10593326,52744,54962\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,204,0.0045,18722978,93611,94389,10791966,53796,55775\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,208,0.0046,19082978,95411,96189,10993938,54691,56692\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,212,0.0047,19442978,97211,97989,11183564,55716,57663\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,216,0.0047,19802978,99011,99789,11413409,56842,65317\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,220,0.0049,20162978,100811,101589,11747337,57952,85917\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,224,0.0049,20522978,102611,103389,11967444,58993,147575\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,228,0.0050,20882978,104411,105189,12176974,59986,107137\n", 
"iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,232,0.0051,21242978,106211,106989,12243039,61011,62843\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,236,0.0051,21602978,108011,108789,12454738,61985,74677\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,240,0.0051,21962978,109811,110589,12632612,62912,64911\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,244,0.0052,22322978,111611,112389,12844679,63954,74316\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,248,0.0053,22682978,113411,114189,13049050,65048,67067\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,252,0.0054,23042978,115211,115989,13274577,66113,68093\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,256,0.0054,23402978,117011,117789,13479975,67191,69232\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,260,0.0055,23762978,118811,119589,13702476,68321,70257\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,264,0.0055,24122978,120611,121389,13885554,69178,71473\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", 
"200,32,268,0.0056,24482978,122411,123189,14091173,70236,72538\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,272,0.0057,24842978,124211,124989,14277355,71142,73153\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,276,0.0057,25202978,126011,126789,14477479,72149,74585\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,280,0.0058,25562978,127811,128589,14807542,73365,106386\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,284,0.0059,25922978,129611,130389,14919273,74349,83988\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,288,0.0060,26282978,131411,132189,15262342,75369,108903\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,292,0.0061,26642978,133211,133989,15457489,76550,112579\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,296,0.0061,27002978,135011,135789,15587890,77470,113796\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,300,0.0063,27362978,136811,137589,15736737,78474,80976\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,304,0.0062,27722978,138611,139389,15931699,79424,85309\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL 
(max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,308,0.0064,28082978,140411,141189,16127895,80426,82181\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,312,0.0063,28442978,142211,142989,16353667,81487,91316\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,316,0.0064,28802978,144011,144789,16544730,82526,84583\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,320,0.0064,29162978,145811,146589,16778054,83692,85621\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,324,0.0065,29522978,147611,148389,16975790,84670,86933\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,328,0.0066,29882978,149411,150189,17193806,85651,95908\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,332,0.0067,30242978,151211,151989,17391042,86658,92746\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,336,0.0067,30602978,153011,153789,17579650,87566,101073\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,340,0.0068,30962978,154811,155589,17823659,88601,131503\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,344,0.0069,31322978,156611,157389,18045749,89720,131352\n", 
"iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,348,0.0069,31682978,158411,159189,18233228,90790,129666\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,352,0.0070,32042978,160211,160989,18429938,91908,93827\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,356,0.0071,32402978,162011,162789,18723870,92891,169000\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,360,0.0071,32762978,163811,164589,18839189,93872,104313\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,364,0.0072,33122978,165611,166389,19052230,94828,108456\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,368,0.0072,33482978,167411,168189,19224348,95828,106832\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,372,0.0073,33842978,169211,169989,19409746,96825,98825\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,376,0.0074,34202978,171011,171789,19635914,97934,100015\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,380,0.0075,34562978,172811,173589,19901265,99194,108856\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", 
"200,32,384,0.0075,34922978,174611,175389,20087150,100132,113306\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,388,0.0076,35282978,176411,177189,20289560,101187,111225\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,392,0.0076,35642978,178211,178989,20478069,102158,104431\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,396,0.0077,36002978,180011,180789,20703541,103136,118462\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,400,0.0078,36362978,181811,182589,20889687,104097,116051\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,404,0.0078,36722978,183611,184389,21103371,105019,150497\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,408,0.0079,37082978,185411,186189,21343392,106235,146574\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,412,0.0080,37442978,187211,187989,21499750,107213,116228\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,416,0.0081,37802978,189011,189789,21769516,108354,153304\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,420,0.0082,38162978,190811,191589,22016040,109333,166344\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), 
PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,424,0.0082,38522978,192611,193389,22124948,110298,112586\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,428,0.0083,38882978,194411,195189,22375892,111391,164691\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,432,0.0083,39242978,196211,196989,22605417,112244,161120\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,436,0.0084,39602978,198011,198789,22698406,113231,115888\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,440,0.0084,39962978,199811,200589,22946025,114347,124840\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,444,0.0085,40322978,201611,202389,23138571,115404,122324\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,448,0.0086,40682978,203411,204189,23382319,116666,118990\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,452,0.0086,41042978,205211,205989,23582320,117634,123005\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,456,0.0087,41402978,207011,207789,23777586,118606,121054\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", 
"200,32,460,0.0088,41762978,208811,209589,24021078,119638,157473\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,464,0.0089,42122978,210611,211389,24177273,120536,137152\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,468,0.0089,42482978,212411,213189,24354431,121510,124378\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,472,0.0090,42842978,214211,214989,24680874,122798,163001\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,476,0.0092,43202978,216011,216789,24806941,123695,126112\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,480,0.0091,43562978,217811,218589,25036974,124855,131240\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,484,0.0092,43922978,219611,220389,25277560,125834,159926\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,488,0.0093,44282978,221411,222189,25492002,126931,169890\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,492,0.0094,44642978,223211,223989,25799993,127811,292316\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,496,0.0094,45002978,225011,225789,25879076,128748,186367\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), 
PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,500,0.0094,45362978,226811,227589,26021482,129705,143377\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,504,0.0095,45722978,228611,229389,26309697,130875,185497\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,508,0.0096,46082978,230411,231189,26445482,131853,134810\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,512,0.0097,46442978,232211,232989,26722882,133313,135480\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,516,0.0097,46802978,234011,234789,26902984,134116,143429\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,520,0.0098,47162978,235811,236589,27143327,135173,182663\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,524,0.0101,47522978,237611,238389,27899728,139067,143412\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,528,0.0099,47882978,239411,240189,27539695,137281,153792\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,532,0.0100,48242978,241211,241989,27665652,137957,156345\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", 
"200,32,536,0.0102,48602978,243011,243789,27888664,139123,142069\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,540,0.0102,48962978,244811,245589,28116288,140162,167093\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,544,0.0102,49322978,246611,247389,28395864,141365,191687\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,548,0.0105,49682978,248411,249189,28539300,142352,144923\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,552,0.0104,50042978,250211,250989,28772000,143499,153080\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,556,0.0104,50402978,252011,252789,28943938,144344,160802\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,560,0.0105,50762978,253811,254589,29192011,145318,205574\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,564,0.0106,51122978,255611,256389,29371768,146296,173660\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,568,0.0107,51482978,257411,258189,29607085,147402,185216\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,572,0.0109,51842978,259211,259989,29760468,148529,150992\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), 
PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,576,0.0108,52202978,261011,261789,30001693,149671,152448\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,580,0.0109,52562978,262811,263589,30194219,150474,161954\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,584,0.0110,52922978,264611,265389,30465237,151575,196784\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,588,0.0112,53282978,266411,267189,30866027,152658,345805\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,592,0.0112,53642978,268211,268989,30806266,153631,162459\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,596,0.0112,54002978,270011,270789,31013348,154624,161083\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,600,0.0113,54362978,271811,272589,31227644,155782,158034\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,604,0.0115,54722978,273611,274389,31534633,156837,219588\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,608,0.0114,55082978,275411,276189,31675474,157869,168332\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", 
"200,32,612,0.0115,55442978,277211,277989,31953436,158989,218652\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,616,0.0116,55802978,279011,279789,32108644,160138,180416\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,620,0.0116,56162978,280811,281589,32277424,160849,182393\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,624,0.0118,56522978,282611,283389,32423394,161797,164245\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,628,0.0117,56882978,284411,285189,32609412,162678,167394\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,632,0.0118,57242978,286211,286989,32869379,163975,168634\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,636,0.0119,57602978,288011,288789,33151217,165037,223167\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,640,0.0119,57962978,289811,290589,33341299,166215,181218\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,644,0.0121,58322978,291611,292389,33649260,167751,199967\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,648,0.0121,58682978,293411,294189,33719599,168221,178799\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), 
PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,652,0.0122,59042978,295211,295989,34067206,169536,235514\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,656,0.0122,59402978,297011,297789,34164102,170144,235618\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,660,0.0123,59762978,298811,299589,34456636,171594,235316\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,664,0.0124,60122978,300611,301389,34541178,172177,211827\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,668,0.0124,60482978,302411,303189,34905159,173832,222673\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,672,0.0126,60842978,304211,304989,34988298,174422,188003\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,676,0.0126,61202978,306011,306789,35263092,175911,185984\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,680,0.0127,61562978,307811,308589,35503073,176323,305860\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,684,0.0128,61922978,309611,310389,35672483,178036,180851\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", 
"200,32,688,0.0128,62282978,311411,312189,35790039,178289,217803\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,692,0.0128,62642978,313211,313989,36045752,179866,188983\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,696,0.0130,63002978,315011,315789,36175144,180438,195986\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,700,0.0131,63362978,316811,317589,36529049,182248,184897\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,704,0.0130,63722978,318611,319389,36611747,182765,185703\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,708,0.0130,64082978,320411,321189,36811496,183626,191140\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,712,0.0131,64442978,322211,322989,37060383,184588,255521\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,716,0.0132,64802978,324011,324789,37267356,185684,240236\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,720,0.0132,65162978,325811,326589,37393434,186562,204926\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,724,0.0133,65522978,327611,328389,37611724,187635,203956\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), 
PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,728,0.0135,65882978,329411,330189,37844476,188685,217329\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,732,0.0136,66242978,331211,331989,38097715,189879,238003\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,736,0.0136,66602978,333011,333789,38249665,190960,193797\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,740,0.0137,66962978,334811,335589,38496135,191882,202980\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,744,0.0136,67322978,336611,337389,38643004,192776,211409\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,748,0.0138,67682978,338411,339189,38834497,193752,204307\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,752,0.0139,68042978,340211,340989,39026422,194674,207102\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,756,0.0139,68402978,342011,342789,39292510,195755,242534\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,760,0.0140,68762978,343811,344589,39445808,196904,199749\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", 
"200,32,764,0.0140,69122978,345611,346389,39707448,198140,208159\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,768,0.0141,69482978,347411,348189,39961335,199314,213386\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,772,0.0142,69842978,349211,349989,40195551,200268,262442\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,776,0.0143,70202978,351011,351789,40369481,201262,243178\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,780,0.0143,70562978,352811,353589,40454251,201889,204769\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,784,0.0143,70922978,354611,355389,40804167,203132,292206\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,788,0.0144,71282978,356411,357189,40880258,203888,220805\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,792,0.0145,71642978,358211,358989,41141375,205195,222680\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,796,0.0145,72002978,360011,360789,41346667,205890,276619\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,800,0.0146,72362978,361811,362589,41586665,207290,248916\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), 
PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,804,0.0147,72722978,363611,364389,41696398,208106,211465\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,808,0.0148,73082978,365411,366189,41978951,209272,255137\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,812,0.0148,73442978,367211,367989,42187366,209918,283393\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,816,0.0149,73802978,369011,369789,42482639,211214,322437\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,820,0.0149,74162978,370811,371589,42512865,212010,227823\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,824,0.0151,74522978,372611,373389,42861251,213412,278868\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,828,0.0151,74882978,374411,375189,42979335,214191,262439\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,832,0.0152,75242978,376211,376989,43402619,215543,296991\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,836,0.0152,75602978,378011,378789,43382253,216450,232179\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", 
"200,32,840,0.0154,75962978,379811,380589,43665001,217538,261020\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,844,0.0154,76322978,381611,382389,43762162,218196,232967\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,848,0.0156,76682978,383411,384189,44077885,219619,233562\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,852,0.0155,77042978,385211,385989,44269902,220266,357562\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,856,0.0156,77402978,387011,387789,44458368,221658,275183\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,860,0.0156,77762978,388811,389589,44599845,222530,244104\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,864,0.0158,78122978,390611,391389,44856987,223898,229495\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,868,0.0157,78482978,392411,393189,45070339,224667,268426\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,872,0.0158,78842978,394211,394989,45243346,225686,238504\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,876,0.0160,79202978,396011,396789,45425044,226467,285843\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), 
PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,880,0.0160,79562978,397811,398589,45637897,227585,255503\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,884,0.0163,79922978,399611,400389,45922301,228540,294854\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,888,0.0161,80282978,401411,402189,46210377,229936,317062\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,892,0.0161,80642978,403211,403989,46224897,230736,244030\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,896,0.0163,81002978,405011,405789,46706945,232252,393574\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,900,0.0163,81362978,406811,407589,46846573,233803,243774\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,904,0.0165,81722978,408611,409389,47211102,235424,247115\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,908,0.0165,82082978,410411,411189,47420647,236067,308146\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,912,0.0167,82442978,412211,412989,47664515,237299,252663\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", 
"200,32,916,0.0166,82802978,414011,414789,47825500,238210,307878\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,920,0.0168,83162978,415811,416589,48024315,239591,249230\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,924,0.0168,83522978,417611,418389,48204506,240348,286103\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,928,0.0168,83882978,419411,420189,48474452,241766,272232\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,932,0.0169,84242978,421211,421989,48643328,242408,310910\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,936,0.0170,84602978,423011,423789,49041567,243670,350571\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,940,0.0171,84962978,424811,425589,49009612,244295,313509\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,944,0.0171,85322978,426611,427389,49257311,245620,259650\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,948,0.0172,85682978,428411,429189,49415667,246533,254714\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,952,0.0172,86042978,430211,430989,49711139,247671,319628\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), 
PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,956,0.0174,86402978,432011,432789,49856592,248552,271876\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,960,0.0174,86762978,433811,434589,50136102,249978,265617\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,964,0.0176,87122978,435611,436389,50925446,253713,295499\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,968,0.0178,87482978,437411,438189,51035835,253858,318894\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,972,0.0177,87842978,439211,439989,51188317,255334,306288\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,976,0.0178,88202978,441011,441789,51436023,256205,289239\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,980,0.0179,88562978,442811,443589,51703656,257814,300077\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,984,0.0179,88922978,444611,445389,51801305,257947,349721\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,988,0.0181,89282978,446411,447189,52056854,259676,262216\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", 
"200,32,992,0.0182,89642978,448211,448989,52237864,260535,269494\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,996,0.0183,90002978,450011,450789,52526126,262024,274178\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,1000,0.0182,90362978,451811,452589,52578843,262284,265526\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,1004,0.0183,90722978,453611,454389,52896370,263840,273834\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,1008,0.0183,91082978,455411,456189,53074476,264385,308471\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,1012,0.0184,91442978,457211,457989,53382079,266422,284446\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,1016,0.0186,91802978,459011,459789,53434221,266486,275700\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,1020,0.0186,92162978,460811,461589,53712164,268036,277528\n", "iter,ny,nx,Runtime,PM_INST_CMPL (total),PM_INST_CMPL (min), PM_INST_CMPL (max),PM_RUN_CYC (total),PM_RUN_CYC (min), PM_RUN_CYC (max)\n", "200,32,1024,0.0187,92522978,462611,463389,53754294,268076,276795\n", "mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.ins_cyc.bin.csv .\n"]}], "source": ["!make bench_task1"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Once the run is completed, let's study the data!\n", "\n", "This can be done best in the interactive 
version of the Jupyter Notebook. In case this version of the description is unavailable to you, call the Makefile target `make graph_task1` (either with X forwarding, or download the resulting PDF)."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": ["import numpy as np\n", "import seaborn as sns\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import common\n", "%matplotlib inline\n", "sns.set()\n", "plt.rcParams['figure.figsize'] = [14, 6]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Execute the following cell if you want to switch to color-blind-safer colors"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["sns.set_palette(\"colorblind\")"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<style scoped>\n", "    .dataframe tbody tr th:only-of-type {\n", "        vertical-align: middle;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: right;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>iter</th>\n", "      <th>ny</th>\n", "      <th>nx</th>\n", "      <th>Runtime</th>\n", "      <th>PM_INST_CMPL (total)</th>\n", "      <th>PM_INST_CMPL (min)</th>\n", "      <th>PM_INST_CMPL (max)</th>\n", "      <th>PM_RUN_CYC (total)</th>\n", "      <th>PM_RUN_CYC (min)</th>\n", "      <th>PM_RUN_CYC (max)</th>\n", "      <th>Grid Points</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>4</td>\n", "      <td>0.0012</td>\n", "      <td>572978</td>\n", "      <td>2861</td>\n", "      <td>3639</td>\n", "      <td>261330</td>\n", "      <td>1235</td>\n", "      <td>4684</td>\n", "      
<td>128</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>8</td>\n", "      <td>0.0014</td>\n", "      <td>1082978</td>\n", "      <td>5411</td>\n", "      <td>6189</td>\n", "      <td>601962</td>\n", "      <td>2914</td>\n", "      <td>5099</td>\n", "      <td>256</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>12</td>\n", "      <td>0.0014</td>\n", "      <td>1442978</td>\n", "      <td>7211</td>\n", "      <td>7989</td>\n", "      <td>811603</td>\n", "      <td>3992</td>\n", "      <td>5761</td>\n", "      <td>384</td>\n", "    </tr>\n", "    <tr>\n", "      <th>3</th>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>16</td>\n", "      <td>0.0014</td>\n", "      <td>1802978</td>\n", "      <td>9011</td>\n", "      <td>9789</td>\n", "      <td>1017305</td>\n", "      <td>4988</td>\n", "      <td>7017</td>\n", "      <td>512</td>\n", "    </tr>\n", "    <tr>\n", "      <th>4</th>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>20</td>\n", "      <td>0.0015</td>\n", "      <td>2162978</td>\n", "      <td>10811</td>\n", "      <td>11589</td>\n", "      <td>1221559</td>\n", "      <td>6002</td>\n", "      <td>7999</td>\n", "      <td>640</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["   iter  ny  nx  Runtime  PM_INST_CMPL (total)  PM_INST_CMPL (min)  \\\n", "0   200  32   4   0.0012                572978                2861   \n", "1   200  32   8   0.0014               1082978                5411   \n", "2   200  32  12   0.0014               1442978                7211   \n", "3   200  32  16   0.0014               1802978                9011   \n", "4   200  32  20   0.0015               2162978               10811   \n", "\n", "    PM_INST_CMPL (max)  PM_RUN_CYC (total)  PM_RUN_CYC (min)  \\\n", "0                 3639              261330              1235   \n", "1   
              6189              601962              2914   \n", "2                 7989              811603              3992   \n", "3                 9789             1017305              4988   \n", "4                11589             1221559              6002   \n", "\n", "    PM_RUN_CYC (max)  Grid Points  \n", "0               4684          128  \n", "1               5099          256  \n", "2               5761          384  \n", "3               7017          512  \n", "4               7999          640  "]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["plt.rcParams['figure.figsize'] = [14, 6]\n", "df = pd.read_csv(\"poisson2d.ins_cyc.bin.csv\", skiprows=range(2, 50000, 2))  # Read in the CSV file from the bench run; parse with Pandas\n", "df[\"Grid Points\"] = df[\"nx\"] * df[\"ny\"]  # Add a new column of the number of grid points (the product of nx and ny)\n", "df.head()  # Display the head of the Pandas dataframe"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Let's have a look at the counters we've just measured and see how they scale with an increasing number of grid points.\n", "\n", "*In the following, we are always using the minimal value of the counter (indicated by \u00bb(min)\u00ab) as this should give us an estimate of the best achievable result of the architecture.*"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [{"data": {"image/png": "\n", "text/plain": ["<Figure size 1008x432 with 2 Axes>"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n", "df.set_index(\"Grid Points\")[\"PM_RUN_CYC (min)\"].plot(ax=ax1, legend=True);\n", "df.set_index(\"Grid Points\")[\"PM_INST_CMPL (min)\"].plot(ax=ax2, legend=True);"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Although some slight variations can be seen for run cycles for many grid points, the correlation looks quite 
linear (as one would naively expect). Let's test that by fitting a linear function!\n", "\n", "*The details of the fitting have been extracted into a dedicated function, `print_and_return_fit()`, of the `common.py` helper file. If you're interested, [go have a look at it](common.py).* "]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": ["def linear_function(x, a, b):\n", "    return a*x+b"]}, {"cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Counter   PM_RUN_CYC (min) is proportional to the grid points (nx*ny) by a factor of  8.1021 (\u00b1 0.0057)\n", "Counter PM_INST_CMPL (min) is proportional to the grid points (nx*ny) by a factor of 14.0630 (\u00b1 0.0003)\n"]}], "source": ["fit_parameters, fit_covariance = common.print_and_return_fit(\n", "    [\"PM_RUN_CYC (min)\", \"PM_INST_CMPL (min)\"], \n", "    df.set_index(\"Grid Points\"), \n", "    linear_function,\n", "    format_uncertainty=\".4f\"\n", ")"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Let's overlay our fits on the graphs from before."]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"data": {"image/png": "\n", "text/plain": ["<Figure size 1008x432 with 2 Axes>"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n", "for ax, pmu_counter in zip([ax1, ax2], [\"PM_RUN_CYC (min)\", \"PM_INST_CMPL (min)\"]):\n", "    df.set_index(\"Grid Points\")[pmu_counter].plot(ax=ax, legend=True);\n", "    ax.plot(\n", "        df[\"Grid Points\"], \n", "        linear_function(df[\"Grid Points\"], *fit_parameters[pmu_counter]), \n", "        linestyle=\"--\", \n", "        label=\"Fit: {:.2f} * x + {:.2f}\".format(*fit_parameters[pmu_counter])\n", "    )\n", "    ax.legend();"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Please execute the next cell to summarize the 
first task."]}, {"cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["The algorithm under investigation runs about 8 cycles and executes about 14 instructions per grid point\n"]}], "source": ["print(\"The algorithm under investigation runs about {:.0f} cycles and executes about {:.0f} instructions per grid point\".format(\n", "    *[fit_parameters[pmu_counter][0] for pmu_counter in [\"PM_RUN_CYC (min)\", \"PM_INST_CMPL (min)\"]]\n", "))"]}, {"cell_type": "markdown", "metadata": {}, "source": ["**Bonus:**\n", "\n", "The linear fits also calculate a y-intercept (\u00bb`b`\u00ab). How do you interpret this value?"]}, {"cell_type": "markdown", "metadata": {"exercise": "solution"}, "source": ["The y-intercept, that is, `b` of the linear fit, is the inherent overhead of the program execution. Even if our program did not compute any stencil operation at all for any grid point, it would still complete this many (~1800) instructions and run this many (~680) cycles. Interestingly, it is also the unparallelizable overhead of this (toy) example."]}, {"cell_type": "markdown", "metadata": {}, "source": ["We will revisit the graph in a little while.\n", "\n", "[Back to top](#toc)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Task 2: Measuring Loads and Stores\n", "<a name=\"task2\"></a>\n", "\n", "Looking at the source code, how many loads and stores from / to memory do you expect? Have a look at the loop which we instrumented.\n", "\n", "Let's compare your estimate to what the system actually does!\n", "\n", "### Task A\n", "<a name=\"task2-a\"></a>\n", "\n", "Please measure counters for loads and stores. See the TODOs in [`poisson2d.ld_st.c`](/edit/Tasks/poisson2d.ld_st.c). 
This time, implement `PM_LD_CMPL` and `PM_ST_CMPL`.\n", "\n", "Compile with `make task2`, test your program with a single run with `make run_task2`, and then finally submit a benchmarking run to the batch system with `make bench_task2`. The following cell will take care of all this.\n", "\n", "[Back to top](#toc)"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.ld_st.bin /gpfs/wolf/trn003/scratch/aherten//poisson2d.ld_st.bin.csv\n", "Job <24416> is submitted to default queue <batch>.\n", "<<Waiting for dispatch ...>>\n", "<<Starting on login1>>\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,4,0.0012,119819,598,817,32902,164,266\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,8,0.0013,161819,808,1027,56902,284,386\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,12,0.0014,221819,1108,1327,71902,359,461\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,16,0.0015,281819,1408,1627,86902,434,536\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,20,0.0015,341819,1708,1927,101902,509,611\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,24,0.0016,401819,2008,2227,116902,584,686\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", 
"200,32,28,0.0016,461819,2308,2527,131902,659,761\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,32,0.0018,521819,2608,2827,146902,734,836\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,36,0.0018,581819,2908,3127,161902,809,911\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,40,0.0018,641819,3208,3427,176902,884,986\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,44,0.0019,701819,3508,3727,191902,959,1061\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,48,0.0020,761819,3808,4027,206902,1034,1136\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,52,0.0020,821819,4108,4327,221902,1109,1211\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,56,0.0021,881819,4408,4627,236902,1184,1286\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,60,0.0022,941819,4708,4927,251902,1259,1361\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,64,0.0023,1001819,5008,5227,266902,1334,1436\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,68,0.0023,1061819,5308,5527,281902,1409,1511\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL 
(max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,72,0.0025,1121819,5608,5827,296902,1484,1586\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,76,0.0028,1181819,5908,6127,311902,1559,1661\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,80,0.0025,1241819,6208,6427,326902,1634,1736\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,84,0.0026,1301819,6508,6727,341902,1709,1811\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,88,0.0026,1361819,6808,7027,356902,1784,1886\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,92,0.0027,1421819,7108,7327,371902,1859,1961\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,96,0.0028,1481819,7408,7627,386902,1934,2036\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,100,0.0029,1541819,7708,7927,401902,2009,2111\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,104,0.0029,1601819,8008,8227,416902,2084,2186\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,108,0.0031,1661819,8308,8527,431902,2159,2261\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", 
"200,32,112,0.0030,1721819,8608,8827,446902,2234,2336\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,116,0.0031,1781819,8908,9127,461902,2309,2411\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,120,0.0032,1841819,9208,9427,476902,2384,2486\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,124,0.0033,1901819,9508,9727,491902,2459,2561\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,128,0.0033,1961819,9808,10027,506902,2534,2636\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,132,0.0034,2021819,10108,10327,521902,2609,2711\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,136,0.0035,2081819,10408,10627,536902,2684,2786\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,140,0.0036,2141819,10708,10927,551902,2759,2861\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,144,0.0036,2201819,11008,11227,566902,2834,2936\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,148,0.0036,2261819,11308,11527,581902,2909,3011\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,152,0.0037,2321819,11608,11827,596902,2984,3086\n", 
"iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,156,0.0038,2381819,11908,12127,611902,3059,3161\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,160,0.0040,2441819,12208,12427,626902,3134,3236\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,164,0.0039,2501819,12508,12727,641902,3209,3311\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,168,0.0040,2561819,12808,13027,656902,3284,3386\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,172,0.0040,2621819,13108,13327,671902,3359,3461\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,176,0.0041,2681819,13408,13627,686902,3434,3536\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,180,0.0041,2741819,13708,13927,701902,3509,3611\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,184,0.0042,2801819,14008,14227,716902,3584,3686\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,188,0.0044,2861819,14308,14527,731902,3659,3761\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,192,0.0044,2921819,14608,14827,746902,3734,3836\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL 
(max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,196,0.0045,2981819,14908,15127,761902,3809,3911\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,200,0.0045,3041819,15208,15427,776902,3884,3986\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,204,0.0045,3101819,15508,15727,791902,3959,4061\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,208,0.0046,3161819,15808,16027,806902,4034,4136\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,212,0.0047,3221819,16108,16327,821902,4109,4211\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,216,0.0047,3281819,16408,16627,836902,4184,4286\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,220,0.0048,3341819,16708,16927,851902,4259,4361\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,224,0.0049,3401819,17008,17227,866902,4334,4436\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,228,0.0050,3461819,17308,17527,881902,4409,4511\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,232,0.0050,3521819,17608,17827,896902,4484,4586\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", 
"200,32,236,0.0051,3581819,17908,18127,911902,4559,4661\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,240,0.0051,3641819,18208,18427,926902,4634,4736\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,244,0.0052,3701819,18508,18727,941902,4709,4811\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,248,0.0053,3761819,18808,19027,956902,4784,4886\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,252,0.0053,3821819,19108,19327,971902,4859,4961\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,256,0.0054,3881819,19408,19627,986902,4934,5036\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,260,0.0055,3941819,19708,19927,1001902,5009,5111\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,264,0.0055,4001819,20008,20227,1016902,5084,5186\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,268,0.0056,4061819,20308,20527,1031902,5159,5261\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,272,0.0057,4121819,20608,20827,1046902,5234,5336\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,276,0.0057,4181819,20908,21127,1061902,5309,5411\n", 
"iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,280,0.0058,4241819,21208,21427,1076902,5384,5486\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,284,0.0059,4301819,21508,21727,1091902,5459,5561\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,288,0.0059,4361819,21808,22027,1106902,5534,5636\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,292,0.0060,4421819,22108,22327,1121902,5609,5711\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,296,0.0061,4481819,22408,22627,1136902,5684,5786\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,300,0.0061,4541819,22708,22927,1151902,5759,5861\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,304,0.0062,4601819,23008,23227,1166902,5834,5936\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,308,0.0063,4661819,23308,23527,1181902,5909,6011\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,312,0.0064,4721819,23608,23827,1196902,5984,6086\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,316,0.0066,4781819,23908,24127,1211902,6059,6161\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL 
(max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,320,0.0065,4841819,24208,24427,1226902,6134,6236\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,324,0.0065,4901819,24508,24727,1241902,6209,6311\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,328,0.0069,4961819,24808,25027,1256902,6284,6386\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,332,0.0066,5021819,25108,25327,1271902,6359,6461\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,336,0.0067,5081819,25408,25627,1286902,6434,6536\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,340,0.0068,5141819,25708,25927,1301902,6509,6611\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,344,0.0069,5201819,26008,26227,1316902,6584,6686\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,348,0.0069,5261819,26308,26527,1331902,6659,6761\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,352,0.0070,5321819,26608,26827,1346902,6734,6836\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,356,0.0070,5381819,26908,27127,1361902,6809,6911\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", 
"200,32,360,0.0071,5441819,27208,27427,1376902,6884,6986\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,364,0.0072,5501819,27508,27727,1391902,6959,7061\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,368,0.0072,5561819,27808,28027,1406902,7034,7136\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,372,0.0073,5621819,28108,28327,1421902,7109,7211\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,376,0.0074,5681819,28408,28627,1436902,7184,7286\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,380,0.0074,5741819,28708,28927,1451902,7259,7361\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,384,0.0075,5801819,29008,29227,1466902,7334,7436\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,388,0.0076,5861819,29308,29527,1481902,7409,7511\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,392,0.0076,5921819,29608,29827,1496902,7484,7586\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,396,0.0077,5981819,29908,30127,1511902,7559,7661\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,400,0.0078,6041819,30208,30427,1526902,7634,7736\n", 
"iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,404,0.0079,6101819,30508,30727,1541902,7709,7811\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,408,0.0079,6161819,30808,31027,1556902,7784,7886\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,412,0.0080,6221819,31108,31327,1571902,7859,7961\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,416,0.0081,6281819,31408,31627,1586902,7934,8036\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,420,0.0081,6341819,31708,31927,1601902,8009,8111\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,424,0.0082,6401819,32008,32227,1616902,8084,8186\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,428,0.0082,6461819,32308,32527,1631902,8159,8261\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,432,0.0085,6521819,32608,32827,1646902,8234,8336\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,436,0.0084,6581819,32908,33127,1661902,8309,8411\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,440,0.0084,6641819,33208,33427,1676902,8384,8486\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL 
(max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,444,0.0085,6701819,33508,33727,1691902,8459,8561\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,448,0.0087,6761819,33808,34027,1706902,8534,8636\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,452,0.0087,6821819,34108,34327,1721902,8609,8711\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,456,0.0087,6881819,34408,34627,1736902,8684,8786\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,460,0.0088,6941819,34708,34927,1751902,8759,8861\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,464,0.0088,7001819,35008,35227,1766902,8834,8936\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,468,0.0089,7061819,35308,35527,1781902,8909,9011\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,472,0.0090,7121819,35608,35827,1796902,8984,9086\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,476,0.0091,7181819,35908,36127,1811902,9059,9161\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,480,0.0091,7241819,36208,36427,1826902,9134,9236\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", 
"200,32,484,0.0092,7301819,36508,36727,1841902,9209,9311\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,488,0.0093,7361819,36808,37027,1856902,9284,9386\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,492,0.0094,7421819,37108,37327,1871902,9359,9461\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,496,0.0095,7481819,37408,37627,1886902,9434,9536\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,500,0.0094,7541819,37708,37927,1901902,9509,9611\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,504,0.0095,7601819,38008,38227,1916902,9584,9686\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,508,0.0096,7661819,38308,38527,1931902,9659,9761\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,512,0.0097,7721819,38608,38827,1946902,9734,9836\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,516,0.0098,7781819,38908,39127,1961902,9809,9911\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,520,0.0098,7841819,39208,39427,1976902,9884,9986\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,524,0.0099,7901819,39508,39727,1991902,9959,10061\n", 
"iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,528,0.0099,7961819,39808,40027,2006902,10034,10136\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,532,0.0100,8021819,40108,40327,2021902,10109,10211\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,536,0.0101,8081819,40408,40627,2036902,10184,10286\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,540,0.0101,8141819,40708,40927,2051902,10259,10361\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,544,0.0103,8201819,41008,41227,2066902,10334,10436\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,548,0.0103,8261819,41308,41527,2081902,10409,10511\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,552,0.0104,8321819,41608,41827,2096902,10484,10586\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,556,0.0106,8381819,41908,42127,2111902,10559,10661\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,560,0.0106,8441819,42208,42427,2126902,10634,10736\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,564,0.0106,8501819,42508,42727,2141902,10709,10811\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL 
(min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,568,0.0107,8561819,42808,43027,2156902,10784,10886\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,572,0.0108,8621819,43108,43327,2171902,10859,10961\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,576,0.0109,8681819,43408,43627,2186902,10934,11036\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,580,0.0110,8741819,43708,43927,2201902,11009,11111\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,584,0.0110,8801819,44008,44227,2216902,11084,11186\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,588,0.0110,8861819,44308,44527,2231902,11159,11261\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,592,0.0111,8921819,44608,44827,2246902,11234,11336\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,596,0.0113,8981819,44908,45127,2261902,11309,11411\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,600,0.0113,9041819,45208,45427,2276902,11384,11486\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,604,0.0114,9101819,45508,45727,2291902,11459,11561\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL 
(total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,608,0.0115,9161819,45808,46027,2306902,11534,11636\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,612,0.0115,9221819,46108,46327,2321902,11609,11711\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,616,0.0115,9281819,46408,46627,2336902,11684,11786\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,620,0.0116,9341819,46708,46927,2351902,11759,11861\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,624,0.0117,9401819,47008,47227,2366902,11834,11936\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,628,0.0117,9461819,47308,47527,2381902,11909,12011\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,632,0.0118,9521819,47608,47827,2396902,11984,12086\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,636,0.0119,9581819,47908,48127,2411902,12059,12161\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,640,0.0119,9641819,48208,48427,2426902,12134,12236\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,644,0.0121,9701819,48508,48727,2441902,12209,12311\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", 
"200,32,648,0.0121,9761819,48808,49027,2456902,12284,12386\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,652,0.0121,9821819,49108,49327,2471902,12359,12461\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,656,0.0122,9881819,49408,49627,2486902,12434,12536\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,660,0.0123,9941819,49708,49927,2501902,12509,12611\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,664,0.0123,10001819,50008,50227,2516902,12584,12686\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,668,0.0124,10061819,50308,50527,2531902,12659,12761\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,672,0.0124,10121819,50608,50827,2546902,12734,12836\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,676,0.0126,10181819,50908,51127,2561902,12809,12911\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,680,0.0126,10241819,51208,51427,2576902,12884,12986\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,684,0.0127,10301819,51508,51727,2591902,12959,13061\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", 
"200,32,688,0.0128,10361819,51808,52027,2606902,13034,13136\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,692,0.0128,10421819,52108,52327,2621902,13109,13211\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,696,0.0129,10481819,52408,52627,2636902,13184,13286\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,700,0.0131,10541819,52708,52927,2651902,13259,13361\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,704,0.0131,10601819,53008,53227,2666902,13334,13436\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,708,0.0130,10661819,53308,53527,2681902,13409,13511\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,712,0.0131,10721819,53608,53827,2696902,13484,13586\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,716,0.0132,10781819,53908,54127,2711902,13559,13661\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,720,0.0132,10841819,54208,54427,2726902,13634,13736\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,724,0.0134,10901819,54508,54727,2741902,13709,13811\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", 
"200,32,728,0.0134,10961819,54808,55027,2756902,13784,13886\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,732,0.0134,11021819,55108,55327,2771902,13859,13961\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,736,0.0135,11081819,55408,55627,2786902,13934,14036\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,740,0.0137,11141819,55708,55927,2801902,14009,14111\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,744,0.0138,11201819,56008,56227,2816902,14084,14186\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,748,0.0137,11261819,56308,56527,2831902,14159,14261\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,752,0.0138,11321819,56608,56827,2846902,14234,14336\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,756,0.0139,11381819,56908,57127,2861902,14309,14411\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,760,0.0140,11441819,57208,57427,2876902,14384,14486\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,764,0.0140,11501819,57508,57727,2891902,14459,14561\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", 
"200,32,768,0.0141,11561819,57808,58027,2906902,14534,14636\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,772,0.0141,11621819,58108,58327,2921902,14609,14711\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,776,0.0142,11681819,58408,58627,2936902,14684,14786\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,780,0.0143,11741819,58708,58927,2951902,14759,14861\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,784,0.0144,11801819,59008,59227,2966902,14834,14936\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,788,0.0144,11861819,59308,59527,2981902,14909,15011\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,792,0.0145,11921819,59608,59827,2996902,14984,15086\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,796,0.0145,11981819,59908,60127,3011902,15059,15161\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,800,0.0147,12041819,60208,60427,3026902,15134,15236\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,804,0.0147,12101819,60508,60727,3041902,15209,15311\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", 
"200,32,808,0.0148,12161819,60808,61027,3056902,15284,15386\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,812,0.0148,12221819,61108,61327,3071902,15359,15461\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,816,0.0150,12281819,61408,61627,3086902,15434,15536\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,820,0.0149,12341819,61708,61927,3101902,15509,15611\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,824,0.0150,12401819,62008,62227,3116902,15584,15686\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,828,0.0151,12461819,62308,62527,3131902,15659,15761\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,832,0.0152,12521819,62608,62827,3146902,15734,15836\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,836,0.0152,12581819,62908,63127,3161902,15809,15911\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,840,0.0153,12641819,63208,63427,3176902,15884,15986\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,844,0.0153,12701819,63508,63727,3191902,15959,16061\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", 
"200,32,848,0.0154,12761819,63808,64027,3206902,16034,16136\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,852,0.0155,12821819,64108,64327,3221902,16109,16211\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,856,0.0156,12881819,64408,64627,3236902,16184,16286\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,860,0.0156,12941819,64708,64927,3251902,16259,16361\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,864,0.0157,13001819,65008,65227,3266902,16334,16436\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,868,0.0158,13061819,65308,65527,3281902,16409,16511\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,872,0.0159,13121819,65608,65827,3296902,16484,16586\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,876,0.0159,13181819,65908,66127,3311902,16559,16661\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,880,0.0160,13241819,66208,66427,3326902,16634,16736\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,884,0.0160,13301819,66508,66727,3341902,16709,16811\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", 
"200,32,888,0.0161,13361819,66808,67027,3356902,16784,16886\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,892,0.0162,13421819,67108,67327,3371902,16859,16961\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,896,0.0163,13481819,67408,67627,3386902,16934,17036\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,900,0.0164,13541819,67708,67927,3401902,17009,17111\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,904,0.0165,13601819,68008,68227,3416902,17084,17186\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,908,0.0165,13661819,68308,68527,3431902,17159,17261\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,912,0.0166,13721819,68608,68827,3446902,17234,17336\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,916,0.0166,13781819,68908,69127,3461902,17309,17411\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,920,0.0167,13841819,69208,69427,3476902,17384,17486\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,924,0.0168,13901819,69508,69727,3491902,17459,17561\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", 
"200,32,928,0.0169,13961819,69808,70027,3506902,17534,17636\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,932,0.0175,14021819,70108,70327,3521902,17609,17711\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,936,0.0170,14081819,70408,70627,3536902,17684,17786\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,940,0.0171,14141819,70708,70927,3551902,17759,17861\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,944,0.0171,14201819,71008,71227,3566902,17834,17936\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,948,0.0172,14261819,71308,71527,3581902,17909,18011\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,952,0.0172,14321819,71608,71827,3596902,17984,18086\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,956,0.0173,14381819,71908,72127,3611902,18059,18161\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,960,0.0174,14441819,72208,72427,3626902,18134,18236\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,964,0.0176,14501819,72508,72727,3641902,18209,18311\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", 
"200,32,968,0.0178,14561819,72808,73027,3656902,18284,18386\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,972,0.0177,14621819,73108,73327,3671902,18359,18461\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,976,0.0178,14681819,73408,73627,3686902,18434,18536\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,980,0.0179,14741819,73708,73927,3701902,18509,18611\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,984,0.0179,14801819,74008,74227,3716902,18584,18686\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,988,0.0180,14861819,74308,74527,3731902,18659,18761\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,992,0.0181,14921819,74608,74827,3746902,18734,18836\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,996,0.0182,14981819,74908,75127,3761902,18809,18911\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,1000,0.0182,15041819,75208,75427,3776902,18884,18986\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,1004,0.0183,15101819,75508,75727,3791902,18959,19061\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", 
"200,32,1008,0.0183,15161819,75808,76027,3806902,19034,19136\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,1012,0.0184,15221819,76108,76327,3821902,19109,19211\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,1016,0.0185,15281819,76408,76627,3836902,19184,19286\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,1020,0.0185,15341819,76708,76927,3851902,19259,19361\n", "iter,ny,nx,Runtime,PM_LD_CMPL (total),PM_LD_CMPL (min), PM_LD_CMPL (max),PM_ST_CMPL (total),PM_ST_CMPL (min), PM_ST_CMPL (max)\n", "200,32,1024,0.0186,15401819,77008,77227,3866902,19334,19436\n", "mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.ld_st.bin.csv .\n"]}], "source": ["!make bench_task2"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Once the run finished, let's plot it again in the course of the following cells (non-interactive: `make graph_task2a`)."]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<style scoped>\n", "    .dataframe tbody tr th:only-of-type {\n", "        vertical-align: middle;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: right;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>iter</th>\n", "      <th>ny</th>\n", "      <th>nx</th>\n", "      <th>Runtime</th>\n", "      <th>PM_LD_CMPL (total)</th>\n", "      <th>PM_LD_CMPL (min)</th>\n", "      <th>PM_LD_CMPL (max)</th>\n", "      <th>PM_ST_CMPL (total)</th>\n", "      <th>PM_ST_CMPL (min)</th>\n", "      <th>PM_ST_CMPL (max)</th>\n", "      
<th>Grid Points</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>4</td>\n", "      <td>0.0012</td>\n", "      <td>119819</td>\n", "      <td>598</td>\n", "      <td>817</td>\n", "      <td>32902</td>\n", "      <td>164</td>\n", "      <td>266</td>\n", "      <td>128</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>8</td>\n", "      <td>0.0013</td>\n", "      <td>161819</td>\n", "      <td>808</td>\n", "      <td>1027</td>\n", "      <td>56902</td>\n", "      <td>284</td>\n", "      <td>386</td>\n", "      <td>256</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>12</td>\n", "      <td>0.0014</td>\n", "      <td>221819</td>\n", "      <td>1108</td>\n", "      <td>1327</td>\n", "      <td>71902</td>\n", "      <td>359</td>\n", "      <td>461</td>\n", "      <td>384</td>\n", "    </tr>\n", "    <tr>\n", "      <th>3</th>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>16</td>\n", "      <td>0.0015</td>\n", "      <td>281819</td>\n", "      <td>1408</td>\n", "      <td>1627</td>\n", "      <td>86902</td>\n", "      <td>434</td>\n", "      <td>536</td>\n", "      <td>512</td>\n", "    </tr>\n", "    <tr>\n", "      <th>4</th>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>20</td>\n", "      <td>0.0015</td>\n", "      <td>341819</td>\n", "      <td>1708</td>\n", "      <td>1927</td>\n", "      <td>101902</td>\n", "      <td>509</td>\n", "      <td>611</td>\n", "      <td>640</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["   iter  ny  nx  Runtime  PM_LD_CMPL (total)  PM_LD_CMPL (min)  \\\n", "0   200  32   4   0.0012              119819               598   \n", "1   200  32   8   0.0013              161819               808   \n", "2   200  32  12   0.0014              221819   
           1108   \n", "3   200  32  16   0.0015              281819              1408   \n", "4   200  32  20   0.0015              341819              1708   \n", "\n", "    PM_LD_CMPL (max)  PM_ST_CMPL (total)  PM_ST_CMPL (min)   PM_ST_CMPL (max)  \\\n", "0                817               32902               164                266   \n", "1               1027               56902               284                386   \n", "2               1327               71902               359                461   \n", "3               1627               86902               434                536   \n", "4               1927              101902               509                611   \n", "\n", "   Grid Points  \n", "0          128  \n", "1          256  \n", "2          384  \n", "3          512  \n", "4          640  "]}, "execution_count": 8, "metadata": {}, "output_type": "execute_result"}], "source": ["df_ldst = pd.read_csv(\"poisson2d.ld_st.bin.csv\", skiprows=range(2, 50000, 2))\n", "df_ldst[\"Grid Points\"] = df_ldst[\"nx\"] * df_ldst[\"ny\"] \n", "df_ldst.head()"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"data": {"image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA04AAAF/CAYAAAB+GZmgAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzs3Xt0XGd97//3XDQjzeyt61x0m5Fv8U6cOHESO7FJQkhIuAQCpFB+TUkgJEDNIacF1oL20JTScllcF7cASdu0J4SWFvrjV25JG05aFglQICSBcoCdFBJLlmWNJMvSXCSNZmb//pjRlhTb0UiWPBrp81ory9HsRzPP6JFtffzs7/fxOI6DiIiIiIiInJq31hMQERERERFZ7xScRERERERElqDgJCIiIiIisgQFJxERERERkSUoOImIiIiIiCxBwUlERERERGQJCk4iIiIiIiJLUHASERERERFZgoKTiIiIiIjIEhScREREREREluCvZpBlWS8H3g94KIet99m2/TXLsnYC9wIdwBjwetu2n6p8zqpfExERERERqYUlg5NlWR7gPuAK27Z/YVnW+cD3Lcv6F+Au4HO2bX/JsqybgLuBqyufuhbXlhIE9gFDQLHKzxERERERkc3DB3QBPwFmqv2kqnacgBLQUvn/VsrBJAJcBFxbefzLwJ2WZUUp70yt6jXbtkeqmOc+4OEq35OIiIiIiGxeVwCPVDt4yeBk27ZjWdZrga9blpUFTOBlQAIYtG27WBlXtCzrSOVxzxpcqyY4DQGMj2cplZxqvwarqqPDYGwsU5PXlpXRmtUfrVn90ZrVH61ZfdF61R+tWe14vR7a2sJQyQ7VquZWPT/wv4BX2rb9fcuyLgP+Cbh5JRNdY0Vg7gtRMx0dRk1fX5ZPa1Z/tGb1R2tWf7Rm9UXrVX+0ZjW3rNKeam7V2wN027b9fYBKeMoC00CPZVm+ys6QD+gGBijvHK32taqNjWVqtuMUjZqMjKRr8tqyMlqz+qM1qz9as/qjNasvWq/6ozWrHa/Xs6LQWk078sNAr2VZFoBlWecAncBTwBPAjZVxNwKP27Y9Ytt2arWvLfudiYiIiIiIrJJqapyOWpb1VuCfLcsqVR5+o23bxyzLOgjca1nWe4Fx4PULPnUtromIiIiIiJxxHsepzS1ta2QL8PSzb9UrFguMj49QKOTXfAJer5dSqbT0QFk3nmvN/P4AbW1RfL5qG1DKmaDbG+qP1qz+aM3qi9ar/mzkNSs5DqnxKfqH0xwaTjMwnCFfKPGuG/fg81Zzw9vaWnCr3lbgmWo/b1P8NDg+PkJjY4hwuBOPx7Omr+X3eykUFJzqyanWzHEcstlJxsdHiES6ajAzERERkfVttlBkcDRL/3DGDUkDqQwzs+W+Cz6vh+5ImHP62mo809O3KYJToZA/I6FJNhaPx0M43Ewmc7zWUxERERGpudz0LP3DGfqH0/Snyr8OjeUoVu70CgZ8JGMGl5/fRTJmkIybdEfCNPhrv8u0GjZFcAIUmmRF9H0jIiIim43jOIynZ04ISaMT0+6YlnCAZNzkgh0RknGTZNwg2tqEdwP/7LRpgpOIiIiIiCxWKjkcPZZbFJD6hzNkpmbdMfG2JrZ0NXPlnu5ySIoZtBjBGs66NhScauA1r7meQCBAQ0OAUqnIG95wG9dc82Iee+xR/vAPD3LjjTfztrf9kTv+9tvfwhNPPMaDD36PUCh00ud87LFH+dznPs0999y36PGhoSP83u/dwNat2ymVihQKBS644ELe+MY3E4vFn3OeY2OjfP7zn+HnP/8ZjY1B/H4/N9zwu7ziFTdwzz1383d/99d89KOf4nnPuxyAXC7HK1/5YpLJLe48Lr98L9u37wA8eL0e3va2t7N37yXcf/83+cEPHuYDH/jokl+vz3zmE+zefQFXXXXNkmMX+vCH389LX/pyLrjgwucc99nPfoodO3Zy7bUvWdbzi4iIiNS
T/GyRwyPZRSHpcKrcuAHA7/PQEzHYc1aEvsouUm/UoCmoyAAKTjXzgQ98hG3bdvDkk7/m4MHb2Lv3UgCSyT4efvi7HDx4Oz6fjyNHBpmZmV7i2Z6bYRj87//9DwDMzs5y7733cPDgrXzxi/+EYZz88K/p6Wluv/0tXHfd9fzpn74Pr9dLOp3moYcedMfs3GnxwAPfcoPTf/zH/yGZ3HLCc33hC39LKBTi4Ye/y5//+f/im9/8TtVzT6WGefTRH/M//+c7l/GOy/7kT/6sqnE33fQG3vKWN/LCF74I7zro9CIiIiJyujJTs+7uUX+q/OvQWJa5htpNQR+JmMnz93TTFzdJxAy6I2H8Pv0sdCqbMjh9/7+GeOTnQ2vy3Fde2M3+XZ1Vj9+582xCoRBDQ4MANDWF2LJlKz/+8Q85cOByHnjgW7zkJS/jV7/65arMr6GhgTe96SA/+cmP+Ld/u59Xv/q1Jx33ne/8K83NLdx88xvdx0zT5FWverX78YUX7uWHP3yEyclJmpubeeCBb3HddS/n/vu/ddLn3LdvPxMTE0xMTFQ93/vv/yYveMEL3Vqje+65m/7+Z8hmswwM9GNZ53DTTW/gzjs/xdGjQ1x55dXubt3tt7+FG2+8mcsuu4IPfvB9BAIBBgb6SaWGOffc3dxxx1/g8Xhoa2uju7uXn/70x+zbt7/quYmIiIjUmuM4jE1Oz9cjDWcYSKUZm5xxx7SZQRIxg4t2RumLGyTiJtGWRtVyL9OmDE7ryWOPPUo+n6e3N8lTT9kAXHfd9Xz9619j//7LeOihB/nCF+7hk5/82Kq+7jnnnMvTT//2lNdt+9fs2nXecz6Hx+Ph6quv5aGHHuTSSw8wMzPN1q3bTzn+3//9O8Ricdraqm9H+fjjP+XGG28+YW5/8zf30dTUxK233sRdd93Jxz/+GYrFIr/7u6/gFa+4gUQiecJz/fa3v+FTn/o8Xq+XN77xdTz66I/coHTeebt59NGfKDiJiIjIulUslRgayzFQaf3dP5xmIJUhO10AwAPE20Ns72nh6otMEnGDZMykORyo7cQ3iE0ZnC7b3cVlu9fmXJ5qz3G6444/JhAIEg6H+eAHP4Jpmu61iy7ayyc+8WG+973vsm3bdlpaWtdgpksdfFzdwcgvfenL+cu//DOOHRvjJS952UnHvPWtt+LxeGlvb+dDH/r4smaZSg3T3t6+6LFLLtnv3mK4Y8cOtm/fSSBQ/gMhmexjcPDwSYPTFVe8gGCwXMhoWRaDg4fZt698rb29g5/97PFlzU1ERERkrczkixweyVQOkS3vIh0eyTLr1iN56Y2GudiKubtIiahBMOCr8cw3rk0ZnNaDuRqnk5nbyfnoRz/Ae97zvjV5/V/96pe8+MXXnfK6ZZ3Dt7/9jSWfp6enl4aGBr7xjf+PL37xH/nNb/77hDFzNU4rEQw2ks/nFz0WCMx3cfF6fQSDgQUfeykWi6d4roXjfIvG5fN5N1SJiIiInEmTuTwDw3MhqbyLdPRYzq1HCgX9JOMGV13YQzJePh+pqyOET7XZZ5SC0zr1ylf+Dk1NTVx66YFVfd7Z2Vnuu+/vGBlJ8aIXvfSU46655sX8/d/fyz/8wxe58cab8Xg8pNNpHnjgW7z2tTcuGnvw4O0MDh5ek52x7du3099/iPPOO3/Vn3uhQ4eeZseOs9b0NURERGRzcxyH0Ynp8plIjx7m10+P0Z/KMJ6er0dqbw6SjJnsOzvmno/U0ax6pPVAwWmdikZjvO51b1jW5/zmN09xww3zu0h7917Crbe+hUwmwy23/D7FYsFtR/6FL9xzyo56AE1NTdx551/x+c9/hte+9pU0NTXh9/v5nd/53RPGnnfe+SsKNj/84fcXzfe6667nzW9+66Ixz3/+1fz7v3+H6667ftnPXy3HcXj00Z8saoQhIiIicjoKxXI9ktvZrtICfGqmXI/k9UBnRxgr2UoyZro7SUZTQ41nLqficZzqalnqxBbg6bG
xDKXS/Ps6evQQnZ19Z2QC1dY4SXWKxSJvfvPr+ehHP00kElmT13j00f/kgQe+zZ/92ftPev1Mfv9IdaJRk5GRdK2nIcugNas/WrP6ovWqrel8gYFUZlFnu8HRDIVi+efRgN9Lb8wgGTMqu0gmF5wTJz0xVeOZb05er4eODgNgK/BMtZ+nHSdZ13w+H+9613sYGhpcs+CUzWZ561v/cE2eW0RERDaWiWy+Eo7md5JS41NuWy2jqYFk3OCaixPuLlK8vemEeqTGgB9F3fqi4FRnbrvt5hOaH5x77nm8613vWdHz/fEfv4Ph4eFFj8XjcT7ykU+ueI6r7Zxzzl3T53/hC6/VLqGIiIgsUnIcRo5PLdpF6k+lmcjMN62KtDSSiBkcOLfTrUdqM4OqR9qgFJzqzD333Leqz7eeApKIiIhILRSKJQZHsvSn5neRBlIZpvPlf6z2ejx0R0Ls6mt3W38n4wbhRtUjbSabJjg5jqP0L8u2wWoARURENr2pmXI9knuA7HCGwdEsxUp9fKDBW95FOq+TvrhJImbQGw3T4Nf5SJvdpghOfn+AbHaScLhZ4Umq5jgO2ewkfr9O2xYREak3juNwPJNnIFU+QHYuJKWOzzdkMEMNJOMmL9rW7na2i7eF8Hr186KcaFMEp7a2KOPjI2Qyx9f8tbxeL6WS6mXqyXOtmd8foK0teoZnJCIiIstRchxS41PzB8hWgtJkbtYdE21tJBk3uez8Lre7XasR0D+qS9U2RXDy+fxEIl1n5LXUDrT+aM1ERETqx2yhyOBolv7hjBuSBlIZZmbL9Ug+r4fuSJjd2zvcXaREzCTUuCl+7JU1pO8gEREREVmXctOziw6P7R9OMzSWc+uRggEfyZjB5Qt2kbojYRr83iWeWWT5FJxEREREpKYcx2E8PXNCSBqdmHbHtIQD5YNjd0Tc1t/R1ia8utVOzhAFJxERERE5Y0olh6PHcosCUv9whszUfD1SvK2JLV3NXLmnuxySYgYtRrCGsxZRcBIRERGRNZKfLXJ4JLsoJB1OZchXDp73+zz0RAz2nBWhr7KL1Bs1aArqR1RZf/RdKSIiIiKnLTM16+4ezR0kOzSWZe5IxKagj0TM5Pl7ut3zkbojYfw+1SNJfVBwEhEREZGqOY7D2OQ0A8Nzh8hmGEilGZuccce0mUESMYOLdkbpixsk4ibRlka1/pa6VlVwsiyrEfgkcA0wDfzQtu23WJa1E7gX6ADGgNfbtv1U5XNW/ZqIiIiInDnFUomhsdyCkJRmIJUhO10AwAPE20Ns72nh6otMEnGDZMykOazD42XjqXbH6aOUA9NO27Ydy7LilcfvAj5n2/aXLMu6CbgbuHoNr4mIiIjIGpjJFzk8kqkcIlveRTo8kmXWrUfy0hsNc7EVc3eRElGDYMBX45mLnBlLBifLsgzg9UCvbdsOgG3bw5ZlxYCLgGsrQ78M3GlZVpTyP0Cs6jXbtkdO652KiIiICACTuTwDz2r9ffRYzq1HCgX9JOMGV13YQzJePh+pqyOEz6t6JNm8qtlx2k75lrk/tyzrKiAD3AFMAYO2bRcBbNsuWpZ1BEhQDkCrfU3BSURERGQZHMdhdGJ6fhepEpTG0/P1SO3NQZIxk31nx9zzkTqaVY8k8mzVBCc/sA143Lbtd1mWdSnwTeB313Rmp6Gjw6jp60ejZk1fX5ZPa1Z/tGb1R2tWf7Rm9aWtPczAcJrfDk7w2yMT/HZwgqcHJ9x6JK8HemImF5wVZWt3C9t7Wtja06J6pBrS77H64nHm9mRPwbKsCDAEBOZu1bMs65fALcCDQEdlZ8hHeWfqLMo7R0+u5rUqb9XbAjw9NpahVHru97VWolGTkZF0TV5bVkZrVn+0ZvVHa1Z/tGbr23S+wEAqU279PZzmyLEch4YmKRTLP/8E/F56YwbJmFHZRTLpiYYJNqgeab3Q77Ha8Xo9cxstW4Fnqv28JXecbNsetSzrPyjXHT1Y6XoXoxxwngBuBL5
U+fXxuYBjWdaqXxMRERHZbCay+cr5SGk3KKXGp5j7J2KjqYEdva1cc3GCZKVpQ2d7k+qRRFZZtV31DgJ/a1nWJ4BZ4Gbbto9blnUQuNeyrPcC45SbSCz8nNW+JiIiIrIhlRyHkeNTbjiaO0h2IpN3x0RaGknEDA6c2+nWI7WZQWKxZu1eiKyxJW/VqzNb0K16skxas/qjNas/WrP6ozVbW4ViicGRLP2p+V2kgVSG6XwRAK/HQ3ckRCJmuq2/k3GDcGPDSZ9P61V/tGa1s2a36omIiIjIyk3NlOuR3ANkhzMMjmYpVv6RN9DgLe8inddJX9wkETPojYZp8KseSWQ9UXASERERWQWO43A8k2cgVW79PReSUsen3DFmqIFk3ORF29pJxsq7SPG2EF6vWn+LrHcKTiIiIiLLVHIcUuNTlfOR0u5hspO5WXdMtLWRZNzksvO73O52rUZA5yOJ1CkFJxEREZHnMFsoMjiaXdS0YSCVYWa2XI/k83rojoTZvb3D3UVKxExCjfoxS2Qj0e9oERERkYrc9Ox8QEqVfx0ay7n1SMGAj2TM4PLdXSTj5V2k7kiYBr9af4tsdApOIiIisuk4jsN4euaEkDQ6Me2OaQkHSMZNLtgRcVt/R1ub8OpWO5FNScFJRERENrRSyeHosdyigNQ/nCEzNV+PFG9rYktXM1fu6S6HpJhBixGs4axFZL1RcBIREZENIz9b5PCzzkc6nMqQL5QA8Ps89EQM9pwVcVt/J2IGTUH9SCQiz01/SoiIiEhdykzNurtHc0FpaCyLUy5HoinoIxEzef6ebjckdUfC+H2qRxKR5VNwEhERkXXNcRzGJqcZGJ47RDbDQCrN2OSMO6bNDJKIGVy0M0pf3CARN4m2NKr1t4isGgUnERERWTeKpRJDYzk3JA1UapKy0wUAPEC8PcT2nhauvsgkETdIxkyaw4HaTlxENjwFJxEREamJmXyRwyOZyiGy5V2kwyNZZt16JC+90TAXWzF3FykRNQgGfDWeuYhsRgpOIiIisuYmc3kGntX6++ixnFuPFAr6ScYNrrqwxz0fqasjhM+reiQRWR8UnERERGTVOI7D6MT0/C5SJSiNp+frkdqbgyRjJvvOjrnnI3U0qx5JRNY3BScRERFZkUKxXI/kdrarhKSpmUo9kge6OsJYyVaSMdPdSTKaGmo8cxGR5VNwEhERkSVN5wuVRg3zB8gOjmYoFMv32gX8XnpjBpeeM7eLZNITDRNsUD2SiGwMCk4iIiKyyEQ2T//YMP/1ZMoNSqnxKSrlSBhNDSTjBtdcnCBZadrQ2d6keiQR2dAUnERERDapkuMwcnxq0flI/ak0E5m8OybS0kgiZnDg3E63HqnNDKoeSUQ2HQUnERGRTaBQLDE4kqU/NV+PNJDKMJ0vAuD1eOiKhNjV105f3GC3FaM56CPcqHokERFQcBIREdlwpmbK9UjlXaQ0A8MZBkezFEuVeqQGb3kX6bxO+uImiZhBbzRMg3++HikaNRkZSdfqLYiIrDsKTiIiInXKcRyOZ/IMpBa0/h7OkDo+5Y4xQw0k4yYv2tbudraLt4XwenWrnYjIcig4iYiI1IGS45Aan6qcj5R2D5OdzM26Y6KtjSTjJped30UyVm793WoEVI8kIrIKFJxERETWmdlCkcHR7KLW3wOpDDOz5Xokn9dDdyTM7u0d7i5SImYSatRf6yIia0V/woqIiNRQbnp20eGx/cNphsZybj1SMOAjGTO4fHeXe4BsdyRMg1+tv0VEziQFJxERkTPAcRzG0zNuy++5sDQ6Me2OaQkHSMZNLtgRcVt/R1ub8OpWOxGRmlNwEhERWWWlksPRY7lFu0j9wxkyU/P1SPG2JrZ0NXPlnu5ySIoZtBjBGs5aRESey7KCk2VZfw68D9ht2/YvLMvaD9wNNAHPADfZtp2qjF31ayIiIutNfrbI4Wedj3Q4lSFfKAHg93noiRjsOSvitv5OxAyagvq3SxGRelL1n9q
WZV0E7Af6Kx97gC8Bt9i2/YhlWXcAHwZuXYtrq/WGRUREViozNevuHs0FpaGxLE65HImmoI9EzOT5e7rdkNQdCeP3qR5JRKTeVRWcLMsKAp8Dfh/4j8rDe4Fp27YfqXx8F+UdolvX6JqIiMgZ4TgOY5PTDAzPHSKbYSCVZmxyxh3TZgZJxAwu2hmlL26QiJtEWxrV+ltEZIOqdsfpL4Ev2bb9tGVZc48lgUNzH9i2PWpZlteyrPa1uGbb9rEVvkcREZFTKpZKDI3l3JA0UKlJyk4XAPAA8fYQ23tauPoik0TcIBkzaQ4HajtxERE5o5YMTpZlHQD2AX+y9tNZHR0dRk1fPxo1a/r6snxas/qjNas/62HNpmcKPHN0kt8OTrj/HRqadOuRGvxe+rqauXxPD1u7W9je08KWrmYaN2k90npYM6me1qv+aM3qSzV/E1wJnA3M7Tb1Av8GfAbomxtkWVYEcGzbPmZZVv9qX1vOmxoby1CqnH9xpkWjJiMj6Zq8tqyM1qz+aM3qTy3WLJ3Ln3A+0tFjObceKRT0k4wbvODCHvd8pK6OED7v4nqk9OQUm/G7Tb/P6ovWq/5ozWrH6/WsaKNlyeBk2/aHKTdoAMCyrGeAlwO/BN5iWdbllZqkg8BXKsN+CjSt8jUREZETOI7D6MQ0/cNpDg1nGKgEpfH0fD1Se3OQZMxk39kx93ykjmbVI4mISPVWfO+Bbdsly7JuBu62LKuRSuvwtbomIiJSKJbrkdzOdpWQNDVTqUfyQFdHGCvZSjJWDkiJmIEZUj2SiIicHo/j1OaWtjWyBXhat+rJcmjN6o/WrP6sZM2m84VKo4b5A2QHRzMUiuU/3wN+L70xg2TMqOwimfREwwQbfGvxFjYd/T6rL1qv+qM1q50Ft+ptpbxRU5XNWe0qIiLrykQ2XwlHc2ckZUgdyzH3T2BGUwPJuME1FyfKu0hxk872phPqkURERNaKgpOIiJwxJcdh5PjUovOR+lNpJjJ5d0ykpZFEzODArrhbj9RmBlWPJCIiNaXgJCIia6JQLDE4kqU/lWZkcoYnnzlGfyrDdL4IgNfjoSsSYldfu3uAbDJuEG5sqPHMRURETqTgJCIip21qplyPdKhyu93AcIbB0SzFSr1pMOCjNxrmwHmd9MVNEjGD3miYBr/qkUREpD4oOImISNUcx+F4Js9AakHr7+EMqeNT7hgz1EAybvKibe1uZ7tzd8Y5Npap4cxFREROj4KTiIicVMlxSI1PVc5HKu8i9Q+nmczNumOirY0k4yaX7e50O9u1GoET6pF8XtUniYhIfVNwEhERZgtFBkezi1p/D6QyzMyW65F8Xg/dkTC7t3csOB/JJNSov0ZERGRz0N94IiKbTG56dtHhsf3DaYbGcovqkZIxg8t3d5GMl89I6o6EafCr9beIiGxeCk4iIhuU4ziMp2fclt9zYWl0Ytod0xIOkIybXLAjUr7VLmYQbWvCq9bfIiIiiyg4iYhsAKWSw9FjuUW7SP3DGTJT8/VI8bYmtnQ1c+WebjcktRjBGs5aRESkfig4iYjUmfxskcOV85HmdpEOpzLkCyUA/D4PPRGDPWdF3NbfiZhBU1B/5IuIiKyU/hYVEVnHMlOz7u5Rf6rc2W5oLEfJKdcjNQV9JGImz9/T7Yak7kgYv0/1SCIiIqtJwUlEZB1wHIexyWkGhucOkc0wkEozNjnjjmkzgyRiBhfujNIXN0jETaItjSe0/hYREZHVp+AkInKGFUsljo7l6K+EpIFKTVJ2ugCAB4i3h9je08LVF5kk4gbJmElzOFDbiYuIiGxiCk4iImtoJl/k8Mji1t+HR7LMuvVIXnqjYS62Ym7r70TUIBjw1XjmIiIispCCk4jIKknn8iecj3T0WI5KORKhoJ9k3OCqC3vckNTVEcLnVT2SiIjIeqfgJCKyTI7jMDoxPd+0oRKUxtPz9UjtzUGSMZN9Z8fKrb/jBh3NqkcSERGpVwpOIiLPoVAsMTS
WOyEkTc1U6pE80NURxkq2koyVA1IiZmCGVI8kIiKykSg4iYhUTOcLlUYN8wfIDo5mKBTL99oF/F56YwaXnjO3i2TSEw0TbFA9koiIyEan4CQim9JENl8JR3NnJGVIHctRKUci3OgnGTe55uJEeRcpbtLZ3qR6JBERkU1KwUlENrSS4zByfGrR+Uj9qTQTmbw7JtLSSCJmcGBX3K1HajODqkcSERERl4KTiGwYhWKJwZEs/anKAbKVeqTpfBEAr8dDVyTErr529wDZZNwg3NhQ45mLiIjIeqfgJCJ1aWqmXI90qHK73cBwhsHRLMVSpR6pwVveRTqvk2Ss3Pq7Nxqmwa96JBEREVk+BScRWdccx+F4Js9AKs2hyi7S4GiOobGsO8YMNZCMm7xoW7vb2S7eFsLr1a12IiIisjoUnERk3Sg5DqnxKfqH0xyq7CL1D6eZzM26Y6KtjZyVbGP/rvnOdq1GQPVIIiIisqYUnESkJmYLRQZHs4tafw+kMszMluuRfF4P3ZEwu7d3LDgfySTU6CcaNRkZSdf4HYiIiMhmsmRwsiyrA7gP2A7MAP8N/IFt2yOWZe0H7gaagGeAm2zbTlU+b9WviUh9yk3PLjo8tn84zdBYzq1HCgZ8JGMGl+/uIhkv1yN1R8I0+NX6W0RERNaHanacHOCjtm1/F8CyrI8BH7Ys603Al4BbbNt+xLKsO4APA7daluVZ7Wur+aZFZG04jsN4esZt+T0XlkYnpt0xLeEAybjJBTsi5VvtYgbRtia8utVORERE1rElg5Nt28eA7y546D+BtwJ7gWnbth+pPH4X5R2iW9fomoisI6WSw9FjuUUBqX84Q2Zqvh4p3tbElq5mrtzT7YakFiNYw1mLiIiIrMyyapwsy/JSDk3fAJLAoblrtm2PWpbltSyrfS2uVQKciNRAfrbI4QXnI/UPpzmcypAvlADw+zz0RAz2nBWhL26SiBkkYgZNQZVRioiIyMaw3J9qPgtkgDuBG1Z/Oqujo8Oo6etHo2ZNX1+WT2s2L53L89vBifn/jkxwOJWhVKlHCjX62drdwksObGFbTwvbelrojZlnvB5Ja1Z/tGb1R2tWX7Re9UdrVl+qDk6WZX0cOAtpU9euAAAgAElEQVS43rbtkmVZ/UDfgusRwLFt+9haXFvOmxobm/8h70xTt6/6s1nXzHEcxianGRguHyI7UGnaMDY5445pM4MkYgbn7+8rHyLbaRJtaTyh9ffx8eyzn35NbdY1q2das/qjNasvWq/6ozWrHa/Xs6KNlqqCk2VZHwQuBl5m2/bcT1U/BZosy7q8UpN0EPjKGl4TkRUqlkocHcvR/6yQlJ0uAOAB4u0htve0cPVFJom4QTJm0hwO1HbiIiIiIutENe3IzwXeAzwJ/MCyLICnbdu+wbKsm4G7LctqpNI6HKCyI7Wq10SkOjP5IodHFrf+PjySZdatR/LSGw1zsRVzW38nogbBgK/GMxcRERFZvzyOU5tb2tbIFuBp3aony1HPa5bO5U84H+nosRxzv61DQb8bjuZ+7WwP4ffV9/lI9bxmm5XWrP5ozeqL1qv+aM1qZ8Gtelspb9RURS2vROqA4ziMTky7Lb/ngtJ4er4eqb05SDJmsu/smBuUOppPrEcSERERkeVTcBJZZwrFEkNjuRNC0tRMpR7JA10dYaxkK8lYOSAlYgZmSPVIIiIiImtFwUmkhqbzhUqjhvkDZAdHMxSK5XvtAn4vvTGDS8+Z20Uy6YmGCTaoHklERETkTFJwEjlDJrL5Sjiq7CSlMqSO5Zirxgs3+knGTa65OFHeRYqbdLY34fPWdz2SiIiIyEag4CSyykqOw8jxKfd8pHJISjORybtjIi2NJGIGB3bF3XqkNjOoeiQRERGRdUrBSeQ0FIolBkey9KfKAWmgUo80nS8C4PV46IqE2NXXTl9lFykZNwg3NtR45iIiIiKyHApOIlWaminXIx2q3G43MJxhcDRLsdL6PtDgLe8indd
JMlZu/d0bDdPgVz2SiIiISL1TcBJ5Fsdx3HqkQ3O7SMMZUsen3DFmqIFk3ORF29rdznbxthBer261ExEREdmIFJxkUys5DoMjGZ741TCHKrtI/cNpJnOz7phoayPJuMlluzvdznatRkD1SCIiIiKbiIKTbBqzhRKDo4tbfw+kMszMluuRfF4P3ZEwu7d3LDgfySTUqN8mIiIiIpudfiKUDSk3Peu2/J5rAT40lnPrkYIBH8mYweW7uzh3R4S2UAPdkTANfrX+FhEREZETKThJXXMch/H0jNvye243aXRi2h3TEg6QjJtcsCNSvtUuZhBta8JbudUuGjUZGUnX6i2IiIiISB1QcJK6USo5HD2WWxSQ+oczZKbm65HibU1s6Wrmyj3dbkhqMYI1nLWIiIiIbAQKTrIu5WeLHF5wPlL/cJrDqQz5QgkAv89DT8Rgz1kR+uImiZhBImbQFNS3tIiIiIisPv2UKTWXmZploNL6uz9V7mw3NJaj5JTrkZqCPhIxk+fv6XZDUnckjN+neiQREREROTMUnOSMcRyHsclpBobLh8gOVBo3jE3OuGPazCCJmMGFO6PlQ2Q7TaItjWr9LSIiIiI1peAka6JYKnF0LEf/s0JSdroAgAeIt4fY3tPC1ReZJOIGyZhJczhQ24mLiIiIiJyEgpOctpnZIofn2n5Xfj08kmXWrUfy0hsNc7EVIxk3SMZNElGDYMBX45mLiIiIiFRHwUmWJZ3Lz3e0q4Sko8dyVMqRCAX9JOMGV13Y44akzvaQ6pFEREREpK4pOMlJOY7D6MS02/J7LiiNp+frkdqbgyRjJvvOjpVbf8cNOppVjyQiIiIiG4+Ck1Aolhgay50QkqZmKvVIHujqCGMlWt2AlIgZmCHVI4mIiIjI5qDgtMlM5wuVRg3zB8gOjmYoFMv32gX8XnpjBpeeU95FSsQNeqMGwQbVI4mIiIjI5qXgtIFNZPOVcFTZSUplSB3LUSlHItzoJxk3uebiRHkXKW7S2d6Ez6t6JBERERGRhRScNoCS4zByfMo9H6m/cpDsRCbvjulobiQZNziwK04ibtAXN2kzg6pHEhERERGpgoJTnSkUSxwZzboBaaBSjzSdLwLg9XjoioTY1ddOX2UXKRk3CDc21HjmIiIiIiL1S8FpHZuaKdcjHarcbjcwnGFwNEuxVKlHavCSiBkcOK+TZKzc+rs3GqbBr3okEREREZHVpOC0DjiO49YjHZrbRRrOkDo+5Y4xQw0k4yYv2tZOMlbeRYq3hfB6daudiIiIiMhaW5fBybKsncC9QAcwBrzetu2najur1VFyHFLjU5WQVN5F6h9OM5mbdcdEWxtJxk0u291Zaf9t0moEVI8kIiIiIlIj6zI4AXcBn7Nt+0uWZd0E3A1cXeM5LYtTadiQOj7FyPgUg6PZck1SKsPMbLkeyef10B0Js3t7h7uLlIiZhBrX67KIiIiIiGxO6+4ndMuyYsBFwLWVh74M3GlZVtS27ZHazWx5fvCLo9zz7V+5HwcDPpIxg8t3d5GMl+uRuiNhGvxq/S0iIiIist6tu+AEJIBB27aLALZtFy3LOlJ5vG6C0/nbO3jLK3bRbjYSbW2ixQjg1a12IiIiIiJ1aT0Gp9PW0WHU9PWjUZMosK2vo6bzkOpFo2atpyDLpDWrP1qz+qM1qy9ar/qjNasv6zE4DQA9lmX5KrtNPqC78nhVxsYylCotu8+0aNRkZCRdk9eWldGa1R+tWf3RmtUfrVl90XrVH61Z7Xi9nhVttKy7AhvbtlPAE8CNlYduBB6vp/omERERERHZWNbjjhPAQeBey7LeC4wDr6/xfEREREREZBNbl8HJtu1fA5eu4FN9QM0Pha3168vyac3qj9as/mjN6o/WrL5oveqP1qw2Fnzdfcv5PI/j1KYWaI1cDjxc60mIiIiIiMi6dwXwSLWDN1pwCgL7gCGgWOO5iIiIiIjI+uMDuoCfADPVftJGC04iIiIiIiKrbt111RM
REREREVlvFJxERERERESWoOAkIiIiIiKyBAUnERERERGRJSg4iYiIiIiILEHBSUREREREZAkKTiIiIiIiIktQcBIREREREVmCgpOIiIiIiMgSFJxERERERESWoOAkIiIiIiKyBAUnERERERGRJSg4iYiIiIiILEHBSUREREREZAkKTiIiIiIiIktQcBIREREREVmCgpOIiIiIiMgSFJxERERERESWoOAkIiIiIiKyBAUnERERERGRJSg4iYiIiIiILEHBSUREREREZAkKTiIiIiIiIktQcBIREREREVmCgpOIiIiIiMgS/EsNsCyrA7gP2A7MAP8N/IFt2yOWZe0H7gaagGeAm2zbTlU+b9WviYiIiIiI1ILHcZznHGBZVjtwvm3b3618/DGgHXgT8BRwi23bj1iWdQewzbbtWy3L8qz2tSrfTxDYBwwBxWV9JUREREREZDPwAV3ATyhvDFVlyR0n27aPAd9d8NB/Am8F9gLTtm0/Unn8Lso7RLeu0bVq7AMernKsiIiIiIhsXlcAjyw5qmLJ4LSQZVleyqHpG0ASODR3zbbtUcuyvJUdqlW/VglwSxkCGB/PUio9907aWunoMBgby9TktWVltGb1R2tWf7Rm9UdrVl+0XvVHa1Y7Xq+HtrYwVLJDtZYVnIDPAhngTuCGZX7umVAE5r4QNdPRYdT09WX5tGb1R2tWf7Rm9UdrVl+0XvVHa1ZzyyrtqTo4WZb1ceAs4HrbtkuWZfUDfQuuRwDHtu1ja3FtOW9qbCxTsx2naNRkZCRdk9eWldGa1R+tWf3RmtUfrVl90XrVH61Z7Xi9nhWF1qrakVuW9UHgYuBVtm3PFVD9FGiyLOvyyscHga+s4TUREREREZGaqKYd+bnAe4AngR9YlgXwtG3bN1iWdTNwt2VZjVRahwNUdqRW9ZqIiIiIiEitLNmOvM5sAZ5+9q16xWKB8fERCoX8mk/A6/VSKpXW/HVk9ZxqzbxeH01NBobRgsfjqcHM5FR0e0P90ZrVH61ZfdF61Z+NvGa56VkGUhn6hzP0D6fpT2XIF0q8/7ZL8PuquuFtTS24VW8r5Y2aqiy3OURdGh8fobExRDjcueY/APv9XgoFBad6crI1cxyHYrFAOn2c8fER2ttjNZqdiIiIyPrkOA7j6ZlyQEql3aA0OjHtjmkJB0jEDXb1teP11vc/RG+K4FQo5M9IaJKNw+Px4Pc30NrawfDw4VpPR0RERKSmSiWHoWM5BobTi4JSZmoWAA8Qaw+xtauZK/d0k4ybJGMGLUawthNfRZsiOAEKTbIiHo8X2FC3s4qIiIg8p5nZIodTGfpTGQaG0xwazjA4Ur7dDsDv89ATNbhoZ4REzKQvbtITDdMU3NjRYmO/OxEREREROaXJXJ6BBbVI/cNpjh7LMdcGIRT0k4wbvODCHpJxg2TMpLMjtC5qlc40BacaeM1rricQCNDQEKBUKvKGN9zGNde8mMcee5Q//MOD3HjjzbztbX/kjr/99rfwxBOP8eCD3yMUCp30OWdnZ/n0pz/BE088hs9XbnZw88230tLSwhe+8FkAjh0bo1QqEYlEAXjjG9/MlVdeddLny+Wy3H335/jP//wBgUAAgGuvfQmvf/2t3H//N/nQh/6Cd7zj3bz61a8Fyve4vva1ryKXy/Dtbz+05Pv83Oc+zT333Lfk1+orX/kyhUKB3//9m6v86pb9zd/cxdat23jhC1/0nOO+9rWvMjWV5XWvu2VZzy8iIiJSTxzHYWRimv6j8wFpIJVhPD3jjuloDpKMm+w7O0Zf3CQRN+hobtSdWxUKTjXygQ98hG3bdvDkk7/m4MHb2Lv3UgCSyT4efvi7HDx4Oz6fjyNHBpmZmV7i2eCrX/0yk5MT3Hvvl/H5fORyOcbGRkkkklx66QEA7rnnbqamprj99rc/53M5jsO73vV2tm/fwZe+9FUaGhqYmZnmm9/8F3fMzp0W//qv33aD0+OP/5Tm5mZyuUxV77Ma09P
T/PM//yP33fdPVX/OnDe96WBV417xiht43etew6te9RrCYZ3eLSIiIvWvUCxxZDTLoeF0eTcplWEglWZqpgiA1+OhKxLi7GSrW4uUiJsYTQ01nvn6timD0/f/a4hHfj60Js995YXd7N/VWfX4nTvPJhQKMTQ0CEBTU4gtW7by4x//kAMHLueBB77FS17yMn71q18+5/OkUina2zvw+XwAhEIhQqHkit7Do4/+mKGhI3z601/A7y9/iwSDjbzmNb/njunu7uH48eM8/fRv2bp1G/ff/02uu+7l/O3f/lVV77Ma3/3uQ+zZcxHBYCMA99//Tb7znX/FMEx+85uniEZjvP3t7+Lzn/80AwMDnHPOLt773vfj8Xj44Affx9lnn8OrX/3/cM89d9Pff4hsNsORI4P09PTy/vd/hMbGRvx+P5dcsp+HHvoOr3jFDSv6eomIiIjUSm66wEBqwS7ScIbB0SzFytE8gQYviZjB/nM7ScYMknGTnkiYQIOvxjOvP5syOK0njz32KPl8nt7eJE89ZQNw3XXX8/Wvf439+y/joYce5AtfuIdPfvJjz/k811//Kt75ztt57LGfsHv3BVx66fN4/vNfsKI5Pfnkr7Gss93QdCovecnLeOCBb3HLLW/iv/7rZ9xyy5tOGZxO9j6X8vjjP2XXrnMXPfarX/2SL37xH4nF4rz73W/nL/7iDu68869obGzktttu4tFHf8y+fSfuatn2r/jrv/4ihmHwznfezoMPPuAGpd27z+f7339YwUlERETWLcdxOJ7Jl2uRFtQjjRyfvzOpOdRAMm5y7rb28q12MYN4W6ju24CvF5syOF22u4vLdnetyXNXe47THXf8MYFAkHA4zAc/+BFM03SvXXTRXj7xiQ/zve99l23bttPS0rrk823fvoOvfOXr/Oxnj/Pznz/Bpz71MX70ox/wrne9Z9nvodpDka+++lpuvfUmEokkV1zxAne3a6Hnep9LGRlJ8bznXbHosfPPv4BYLA7AWWdZdHZ2YRjlW+x27DiLwcGBkwanSy7Z7772rl3nMTg432K8o6ODVCpV9bxERERE1lKp5DA8npu/1a4SlNK5WXdMrK2Jvs5mrji/0vo7btC6gVp/r0ebMjitB3O1Pyfj8Xi4+upr+ehHP8B73vO+qp8zGAxyySX7ueSS/Rw4cDnveMfbVhScLOscvva1r1IoFJ5z1ykUCnHuuedx112f5bOfvfukY57rfS4lGAySz88semyuUQWA1+slEAgu+NhHsVg86XMtHuddNG5mJk8wqD9oRERE5MybmS0yOJJdtIt0OPWs1t8Rgwt2RNxdpETM2PCtv9cjfcXXqVe+8ndoampyGzss5Wc/e5xEIkl7ewdQvt2uq6t7Ra+9d+8lxONx7rzzU7ztbX9UaQ4xw1e/+mVuuumWRWNvuukWdu06j23bdjA0dGRFr3cq27btoL//0Ko+58k888zT7Nixc81fR0RERDa3zNSsu4s0fHyapwbGGRrLuq2/m4J+kjGDK/dUWn/HTbo2aevv9UjBaZ2KRmO87nVvqHr80NARPvWpjzE7W8Dn89La2s573/v+Fb22x+Ph4x//DHfd9Tle97rX0NhYbs5w7bUvPWHs1q3b2Lp127Jf4ze/eYobbrjO/Xjv3kv40z9936IxV155FZ/4xEe47bY/WPbzL8ePfvQD3vzm/7GmryEiIiKbh+M4jE5MV+qRMgykMhwaTi9q/R1pbaI3EmavFa0cImvQ0aLW3+uZp9p6ljqxBXh6bCxDqTT/vo4ePURnZ98ZmUC1NU5SnXe+83YOHrydnTvPXpPnP3ToGT7+8Q/x2c+evKkFnNnvH6lONGoyMpKu9TRkGbRm9UdrVl+0XrUz1/p7LhzNtf+emikA4PFAV0fYPTw2GS/faretr0NrViNer4eODgNgK/BMtZ+nHSdZ197xjndz+HD/mj1/KnWUd797+XVgIiIisvlMzRQYSGUW1SMdGc1SKC5o/R012L8
rTqISlHqjav29USg41Znbbrv5hAYI55573oqaQHzsYx/i//7fXyx6zOfzcc89953WHFdTIpEkkVjZeVTV2Ldvv3YJRUREZJG51t8DqfKtdnNBKTU+5Y4xK62/r93b7na1U+vvjU3Bqc6sZqhZSdgSERER2UjmWn/3D2forwSlgeE0kwtbf7c2kYgbXLa7yz1EttUIqB5pk9k0wclxHH1zy7I5TgnQ942IiMhGkJ8tMjiadZs29KfSHE5lmZkt383j83roiYQ5f3uERNygL27SGzUINW6aH5nlOWyK7wK/P0A2O0k43KzwJFVxHIdisUA6PU4g0Fjr6YiIiMgyZaZmGRhOc2g4495yNzSWo1RpjNYU9JGImVxxQZfbtKE7ElbrbzmlTRGc2tqijI+PkMkcX/PX8nq9lEqql6knp1ozr9dHU5OBYbTUYFYiIiJSDcdxGJuYdps19FeC0tjkfOvvNjNIMmZw4c4ofXGDRNwkqtbfskybIjj5fH4ika4z8lpqB1p/tGYiIiL1oVAscXQsV277vSAo5Ra0/u5sD7Gjt5WrK13tEnGD5lCgxjOXjWBTBCcRERERqS9TMwUOj2QWdbUbHMlSKJbvEgn4vfTGDC45J0YyXg5IvVGDoFp/yxpRcBIRERGRmprIzLi1SIcqXe1S41M4letGUwPJuME1e3vdrnbx9iZ8XtUjyZmj4CQiIiIiZ0TJcUiNTy3qatc/nGEym3fHRFsbScZMDpzXWT4fKWbQZgZVjyQ1p+AkIiIiIqtutjDX+nth04bMotbf3ZEwu7e1u13tEjGDUGNDjWcucnIKTiIiIiJyWrLTs4sCUn8qzdDofOvvxoCPZMzg8vO7SFaaNnRHwjT4daud1A8FJxERERGpiuM4HJuccZs1zAWlsclpd0yrESAZN7nwrIi7kxRpbcKrW+2kzlUVnCzL+jjwamALsNu27V9UHn8GmK78B/DHtm3/W+XafuBuoAl4BrjJtu3U6VwTERERkTOjWCoxNJZjYDizqP13drrS+hvo7AixvaeZqy/qIVHZSWoOq/W3bEzV7jj9C/Bp4OGTXHvNXJCaY1mWB/gScItt249YlnUH8GHg1pVeW8mbExEREZGlTecLHE5l3WYN/cNpDi9o/d3g99IbDbP37Jjb1a43ahAMqPW3bB5VBSfbth8BsCyr2ufdC0zPfR5wF+Xdo1tP45qIiIiInKaJbJ6B4bS7i3RoOEPqWM5t/R1u9JOMm7zw4h63q11nR0itv2XTW40ap7+v7BQ9ArzHtu3jQBI4NDfAtu1Ry7K8lmW1r/SabdvHVmGuIiIiIptCyXEYOT51QtOGicx86+9ISyPJuMmBXfFySIqr9bfIqZxucLrCtu0By7KCwKeAO4GbTn9ap6ejw6jp60ejZk1fX5ZPa1Z/tGb1R2tWf7Rm9WO2UGRipshvByd4enCC3wxO8MzQBFMz862/E3GTi8+Os62nhW3dLWztacFoUuvvWtLvsfpyWsHJtu2Byq8zlmV9HvhG5VI/0Dc3zrKsCODYtn3MsqwVXVvOvMbGMpRKztID10A0ajIykq7Ja8vKaM3qj9as/mjN6o/WbP3KzbX+XtDVbmgsS7Hys08w4CMRM3jeuV0k4gZ9cZPuSIgG/+J6pKnMNFOZ6ZO9hJwB+j1WO16vZ0UbLSsOTpZlhQG/bdsTlVv1fg94onL5p0CTZVmXV+qVDgJfOc1rIiIiIpuG4ziMp2fmb7WrBKXRifmw02IESMZMDpzfRcQMkowbRNX6W2RNVNuO/DPA7wCdwP+xLGsMuB74fy3L8gE+4JfA/wCwbbtkWdbNwN2WZTVSaSt+OtdERERENqpiqcTRsdyiXaSBVIbM1CxQbv0daw+xrbuZK/d00xc3ScRNWiqtv7V7IbL2PI5Tm1va1sgW4GndqifLoTWrP1qz+qM1qz9as7Uzky9yeGTxLtLhkSyzhXLrb7+v3Po7GTcqXe1MemNhGgOn/vdurVf90ZrVzoJ
b9bZS3qipymp01RMRERGRk5jM5ekfTi86RPbosRxz/2491/r7qgt73KDU2R7C71Prb5H1RsFJRERE5DSVHIfRudbfCw6RPb6g9XdHcyPJuMEl58TdQ2Tbm9X6W6ReKDiJiIiILEOhWGJwJOsGpIHhNAMjGbf1t9fjoTsS4py+dncXKREz1PpbpM4pOImIiIicQm66wEAqvWgn6cjogtbfDeXW3wfO7XQPkO2JhE9o/S0i9U/BSURERDY9t/V3pVnDXE3SwtbfzeEAybjB7m0d7k5SrE2tv0U2CwUnERER2VRKJYejx3KLutr1D8+3/gaItzWxpavc+jsRM+mLG7QYwRrOWkRqTcFJRERENqyZ2XLr74EFh8geTmXIu62/PfREDS48K+LeatcbNWgK6kckEVlMfyqIiIjIhpDO5d1apLlb7Ra2/g4F/STjBi+4sIdEzKAvbtLZodbfIlIdBScRERGpK47jMDIxzcBwmkOVrnb9qQzj6Rl3THtzkGTMZN/ZscohsgYdLY1q/S0iK6bgJCIiIutWoVjiyGh2UVe7gVR6Uevvro4QZydbScRMt2mDWn+LyGpTcBIREZF1YWqmwMCCZg39qTRHRrMUiuV77QINXhIxg/27Ot2A1BMJE2hQ628RWXsKTiIiInJGOY7D8Ux+UVe7geEMqeNT7hgz1EAybnLtvnaSlZ2keFsIr1e32olIbSg4iYiIyJoplRyGx3PlHaQFQSmdm2/9HWtrIhk3uPz8LncnqSUcUD2SiKwrCk4iIiKyKvKzRQ6PZOdrkYbTDIxkyM+WW3/7vB56omEu2BEhGSsHpERMrb9FpD7oTyoRERFZtszU7KJapP7hDENjWbf1d1PQTzJm8PwLuumrBKTuSFitv0Wkbik4iYiIyCk5jsPYxHS57XdqPigdm5xv/d1mBumLm1y8M+reahdR628R2WAUnERERAQot/4eGsvRP5xmJP0MTz5zjP7hDLmZAgAeD3R1hNnZ21q+zS5ukIwZmKFAjWcuIrL2FJxEREQ2obnW3wOpDIcqXe0GRzMLWn/76I2GuWRX3K1H6omGCar1t4hsUgpOIiIiG9zxzMwJXe1S4/Otv42mBvriBtfuTVR2kUzOs+IcG8vUcNYiIuuLgpOIiMgGUXIcUuNT800bKkFpMpt3x0RbG0nGTS47r5Nk3CQZN2k1Tmz97dN5SSIiiyg4iYiI1KHZQqX1dyUcDQyXb7ubmS0CldbfkTC7t7WXA1LMIBEzCTXqr34RkZXQn54iIiLrXGZqloEFt9n1pzIMjeYoVXp/NwV9JGImV5zfVdlFUutvEZHVpuAkIiKyTjiOw9jktHub3UAlKI09q/V3ImZw4VnRctOGznLrb69af4uIrCkFJxERkRooFEscHcu5h8fOBaXsdKX1N9DZEWJHbytXV7raJWIGzWG1/hYRqQUFJxERkTU2nS9wOJUtt/1OpTk0nGFwJEuhWAKgwe+lN2qw9+yYW4/UGzUIBtT6W0RkvVBwEhERWUUT2Xylq11lJymVIXUsh1O5bjQ1kIwbXHNxL8m4QSJu0tnehM+reiQRkfVsyeBkWdbHgVcDW4Ddtm3/ovL4TuBeoAMYA15v2/ZTa3VNRERkPSk5DiPjU5VdpPlDZCcWtP6OtJRbfx84N04yVm7a0GYGT2j9LSIi6181O07/AnwaePhZj98FfM627S9ZlnUTcDdw9RpeExERqYnZQonB0cyis5EGUhlm8vOtv7sjYc7b2u52tUvEDEKNDTWeuYiIrJYlg5Nt248AWJblPmZZVgy4CLi28tCXgTsty4pSrmdd1Wu2bY+s9A2KiIgsR3Z6loEFAal/OM3QWI5iqXyzXWPARyJmcPnurnJXu7hJdyRMg1+32omIbGQrrXFKAIO2bRcBbNsuWpZ1pPK4Zw2uKTiJiMiqchyHY5Mz9KfKt9jN3XI3OjHtjmkxAvTFTS7YEaEvbpKIG0Rbm9T6W0RkE9qQzSE6Ooyavn40atb
09WX5tGb1R2tWf2q5ZsViicMjGX47OOH+9/SRCdK5WQA8HuiOGJyztYNtPb/crd4AABgHSURBVC1s627h/2/v3mMjvc77jn95X5Izu9wlOcNdLofW9ci6rLSyZEuyFCWChTRBDbexa1u1rDhBgMjNBW3i1kDgpBfAreG4SOLYroSkSZQ4MSDAqB0UqF0UqeuqToHUlWK7aY/t2FpybxySe+PwtiRn+se8HA5Xu0suOdyZIb8fYLHknHdmzu6jV8Nnz3l/7y3D+zmY3le3OTcCz7PmYr2ajzVrLlttnMaB4RBCW7Iy1AYcSR5v2YGxGzI9XaBYLG184A4YHEwzOTlTl/fW1liz5mPNms/NrNni5RXGJwuMT5Rjv8fzM5ycnGVpuTr6u5fjdwwymqTajVwl+nt5YYnJhaWbMudG5HnWXKxX87Fm9dPa2rKlhZYtNU4xxnwI4TXgGeDzye+vrl6LtBNjkiRd6dJq9HdyLdLYRIGJqujv3n3t5LJpnnpwuJJqN9TfY/S3JOmGbSaO/NPATwFDwH8JIUzHGO8BngdeCiH8BnAeeK7qaTsxJknao4qlEpMX5tddi3RiYoaLhbXo7/79+8hlU7zt7iy5bIpcJs2h/UZ/S5Jqo6VUqs+Wth3yJuCHbtXTjbBmzceaNZ8bqdnScpHTU7PrVpLG8wUWkujv1pYWjgz0lGO/k1S7kWyKXqO/a8rzrLlYr+ZjzeqnaqveLcDrm33ergyHkCQ1h7mFpWT1qHxN0li+wOmp2Ur0d1cS/f3YvUOV+yMND/TS0d62wStLklRbNk6SpB1Xjv5eKN9ANj9TuZHsuujv3k5y2TTHbutnJJNiNJtm8KDR35KkxmDjJEmqqWKxxJlzc+UVpKRROjk5y6XZ8vVILUDmUA+3HN7Pkw8cqWy5O5Dqqu/EJUm6DhsnSdKWLS6tcHKywNjEWvz3qckCl5Po7/a2VoYHe3nk3sMM7u9iNJtmeLCX7i4/fiRJzcVPLknSplyau8x4ssVuNbTh7Lk5VjOGerrayWVT/Ojx4Uqq3VB/D+1trV4ELUlqejZOkqR1SqUSkxcXGDu7PtXu/Mxi5Zj+/V3ksmkevivDaJJq179/n9HfkqRdy8ZJkvaw5ZVy9PeJiZnyalK+wHh+hvnFtejvwwM93JXrq1yLNJJNk+o2+luStLfYOEnSHjG3sMx4vmoVaaLAqero745y9Pcj9wxV7o80PNBLZ4fR35Ik2ThJ0i5TKpW4ULhcvhap6nqkyQtr0d/7ezrIZdPce2t/+XqkbJpMXzetrW61kyTpamycJKmJFYslJs7PrW21SxqlmbmlyjGZg92MDu3niWNHKjeR7TP6W5KkG2LjJElNYnFphVOTs+tWkU7mq6O/WxgeSHH/7QPlwIZMipFMyuhvSZJqwE9TSWpAhfmlN6winZmerUR/d3e1M5pN8eQDw5WtdoeT6G9JklR7Nk6SVEelUompiwuMTazFfp+YmFkX/X1ofxe5TJqHwiAjmTSj2RT9B4z+liTpZrJxkqSbZDX6e7U5Wo3/nl9cBqClBQ739xJyfeQy5WuRRjIp0j2ddZ65JEmycZKkHTC/uMx4vrDueqTTU7Msr5T32nV2tDIymOKRu7OMZFPkMmmODhr9LUlSo7JxkqRtWI3+Hs/PVLbbjeUL5M/PV45JJ9HfTz90qJJqlz3YY/S3JElNxMZJkjZpNfp7bKLAWNIojU/McKk6+ruvm1w2xdvvO1y5iWxfqtPrkSRJanI2TpJ0FZeXVjg1lUR/J43Syfwsi0srALS1tjA82Mux2wYqqXZGf0uStHv5CS9pzyvMLzE+McOJiUJly92Z6TmKSfZ3d1cbI5k0T9x/uBLacGSg1+hvSZL2EBsnSXtGqVRi+uJCJaxhLGmUpi+tRX8fTHeRy6Q4fucgo9kUI9k0g0Z/S5K059k4SdqVlleKnJ2eK8d+VzVKc1XR30OHerjjaB9PVW2122/0tyRJugobJ0lNb+HyavT
3WqrdqclZlleKAHS2t3I0k+Ktb86UG6RsiqODKbqM/pYkSZtk4ySpqVwsLDI2Pce3v5uvNEr58/OUkvFUdwe5bIp3PHS0kmqXPdRNW6vXI0mSpK2zcZLUkIqlEvnz8+tS7cYmClyavVw5ZrBvH7lMmsfuHWIkmyaXSXEw3eX1SJIkqeZsnCTV3dLyavR3dWhDYV3095GBXu679RC5TJpjIUO6s42eff4vTJIk3Rz+1CHppppdWFrXII3lZzgztRb9va+zjVwmxePHDpfvj5RJc2Sgl472ta12g4NpJidn6vVHkCRJe5CNk6QdUSqVOHdpsRLWsNooTV9aqBzTl+okl01z/I6Byv2RBvq6aXWrnSRJajDbbpxCCK8DC8kvgI/GGL8aQngEeBHoBl4Hno0x5pPnbGlMUmNaKRY5Mz3H+ERhXfz37EIS/Q0M9fdw2/B+nnpwmJFkJWl/r9HfkiSpOdRqxek9McbvrH4TQmgBPg98KMb4SgjhY8AngJ/d6liN5ilpmxYuL3MyP1sJaxibmOFkVfR3R3srRwd7eeiuTCXV7uhgiq5Oo78lSVLz2qmteg8BCzHGV5LvX6C8evSz2xiTdJNdnL3M+MRMZRXpxESB/Lm5SvR37752ctk073jL0WQVKcVQf4/R35IkadepVeP0p8lq0SvArwE54MTqYIxxKoTQGkI4tNWxGOO5Gs1V0hWKpRKTF+bfENpwsbAW/T1wYB+5bJpH786Sy5avRzL6W5Ik7RW1aJyeiDGOhxC6gN8GPgP8hxq87pb196fq+fYMDqbr+v66cXupZkvLK5w4O8MPT13kB6cu8oPTF/nh6UvML5avR2prbWEkm+Ytd2W5dfgAtx45wC3DB0h1d9R55uvtpZrtFtas+Viz5mK9mo81ay7bbpxijOPJ74shhM8Bfw78DjC6ekwIYQAoxRjPhRDGtjJ2I3Oani5QLJY2PnAHGJPcfHZzzeZWo7+rUu3OTM+ykpwfXUn092P3DJWjv7Npjgz00NG+/nqk+cIC84WFq71FXezmmu1W1qz5WLPmYr2ajzWrn9bWli0ttGyrcQoh9ALtMcaLyVa99wOvAd8EukMIjyfXKz0PvJw8batjkq6hVCpxfmZxbatd0ihNXVxrdg6kOsll0tx/e39lq92g0d+SJEmbst0VpyzwxRBCG9AG/A3wj2KMxRDCB4EXQwj7SGLFAbY6JqlspVjk7PTculWk8XyBwvwSUI7+zhzq4dYj+3nygSOMZtOMZNMcMPpbkiRpy7bVOMUYfwAcv8bYN4D7ajkm7TWLl1c4Obl+Fenk5CxLy+Xo7/a2cvT3g3cOVrbaHR3sZV+n97aWJEmqJX+6khrEpbnLjE3MrLuJ7Nlzc5SSy/VWo79/7PhwsoqU4rDR35IkSTeFjZN0kxVLJaZWo7+rbiJ7oSr6u3//PnLZFG99c7ZyE9lD+43+liRJqhcbJ2kHLa8UOTU5W2mQxidmGJ8sML+4AkBrSwtHBnp48+ihyla7kUyq4aK/JUmS9jobJ6lG5haWGc/PrFtJOj1VFf3d0cZIJsWj9wxVUu2GB3rfEP0tSZKkxmPjJN2gUqnEhcLl8nVIE2uN0uSFtejv/b2d5LIp7ru1v7KSlDlo9LckSVKzsnGSrqNYLHH23Ny6VLuxibXob4DswW7eNLSfH7n/SHklKZPiQKqrjrOWJElSrdk4SYnFpXL093jVTWRP5gtcrkR/tzA8mOL4HQOVrXZHB1N0d3kaSZIk7Xb+xKc9aWbucmWLXf7CIt8dO7cu+runq51cNsWPHh9mJJNiNJtmqL+H9jajvyVJkvYiGyftaqVSicmLC4xPzHAiSbUbyxc4P7NYOWbwYDfD/b08fFemstWu/8A+o78lSZJUYeOkXWN5pcjpqdl1qXbj+Zl10d+H+3u4K9fHSCbNaDbFSDbNLblDTE7O1Hn2kiRJamQ2TmpK84vLjFeFNYzlZzg9NcvySnmvXWdHKyOZFI/cPVRJtRs
e6KWzw+hvSZIk3TgbJzW01ejv6lS78YkC+QvzlWPSPR3ksmmefvgQuUw5tCF7sIfWVrfaSZIkqTZsnNQwisUSE+fnyitIVY3SzNxa9HfmYDe5bIrHjx2urCQd6O30eiRJkiTtKBsn1cXlpRVOTc0mN5FNVpImC1xeKkd/t7W2MDzYy/23D5DLlBukkYzR35IkSaoPfwrVjivML627Fml8osCZ6TmKSfZ3d1c7uUyKJ+8fJpdNMZJJcWSg1+hvSZIkNQwbJ9VMqVRi+uJCOfY7v9Yonbu0Fv19MN3FaDbNg3cOVm4iO2D0tyRJkhqcjZO2ZHmlyJnpucpK0mqjNLe4DEBLCxzu7+XOo33lbXbZFLlMinRPZ51nLkmSJN04GydtaDX6ezxfqFyTdGqqsBb93d7K0UyKt96drVyPNDzYS5fR35IkSdolbJy0zoXC4htS7fLn16K/U90djGZTPP3QCCPZFKPZtNHfkiRJ2vVsnPaoYqlE/vz8WmhD0ihdmr1cOWawbx+5bJq33zuUXI+Upi9l9LckSZL2HhunPWBpeYWTk7OV5mh8orztbnFpBUiivwd6OXZrf+VapJFMmp59/uchSZIkgY3TrlOYX2K8apvdWL7Amanq6O82RjJpnjh2uJJqZ/S3JEmSdH02Tk2qVCoxfWmB8YkksCFplKaviP4eyaQ4fsdgObRhKM3AgX20utVOkiRJuiE2Tk1geaXI2ek5xvJr1yON5wvMLqxFfw8d6uH2o308laTajWRT7Df6W5IkSaoJG6cGs3B5mZP52WQVaYYTEwVOTc6yvFIEoKO9laODKR6+K8NIstXu6ECKrk6jvyVJkqSdYuNURxdnLyepdslKUr5A/twcpWQ81d1BLpviHW85Si6bYiSbZuhQN22tXo8kSZIk3UwN2TiFEO4EXgL6gWnguRjj9+o7q60rlkpMnp9fC2xItttdrIr+HjhQjv5+9J4suUx5Jelgusvob0mSJKkBNGTjBLwAfDbG+PkQwrPAi8BTdZ7TDVkpFvnr70/z2vem+NbfTnFpbgkoR38fGejl3lsOVVLtRjIpevZ11HnGkiRJkq6l4RqnEEIGeBB4OnnoC8BnQgiDMcbJ+s3sxrzyrTO89JVIT1c7993Wz5tHDzKaTXNkoJeOdrfaSZIkSc2k4RonYAQ4FWNcAYgxroQQTiePN03j9La7s+V0u0zKeyRJkiRJTa4RG6dt6+9P1fX9BwfTAIwMH6zrPLR5qzVT87BmzceaNR9r1lysV/OxZs2lERuncWA4hNCWrDa1AUeSxzdlerpAsVja+MAdMDiYZnJypi7vra2xZs3HmjUfa9Z8rFlzsV7Nx5rVT2try5YWWhpuD1mMMQ+8BjyTPPQM8GozXd8kSZIkaXdpxBUngOeBl0IIvwGcB56r83wkSZIk7WEN2TjFGP8f8LYtPLUNystv9VTv99eNs2bNx5o1H2vWfKxZc7Fezcea1UfV33vbjTyvpVSqz7VAO+Rx4L/XexKSJEmSGt4TwCubPXi3NU5dwMPAGWClznORJEmS1HjagMPAXwGLm33SbmucJEmSJKnmGi5VT5IkSZIajY2TJEmSJG3AxkmSJEmSNmDjJEmSJEkbsHGSJEmSpA3YOEmSJEnSBmycJEmSJGkD7fWewG4SQrgTeAnoB6aB52KM36vvrPaeEMLrwELyC+CjMcavhhAeAV4EuoHXgWdjjPnkOVsa09aEED4FvBt4E3BfjPE7yePXPId2Ykybd52avc5VzrdkzHOuTkII/cCfALdRvrnj94GfjzFO7kRdrNn2bVCzEvBtoJgc/sEY47eT570T+E3KP9N9E/iZGOPcdsa0eSGELwG3UK5NAfilGONrfp7tTq441dYLwGdjjHcCn6X8IaL6eE+M8YHk11dDCC3A54FfSOrzdeATAFsd07Z8CfgR4MQVj1/vHNqJMW3etWoGV5xvsPXzynOuZkrAJ2OMIcZ4DPhb4BM7URdrVjNXrVnV+GN
V59lq05QCfg94Z4zxdmAG+Mh2xnTDfjrGeH+M8TjwKeAPksf9PNuFbJxqJISQAR4EvpA89AXgwRDCYP1mpSoPAQsxxleS718A3rvNMW1RjPGVGON49WPXO4d2Ymyn/my71dVqtgHPuTqKMZ6LMX6t6qH/CYyyM3WxZjVwnZpdz08A/6tq1eEF4H3bHNMNiDFerPr2AFD082z3snGqnRHgVIxxBSD5/XTyuG6+Pw0hfCuE8LkQQh+Qo+pfymOMU0BrCOHQNsZUW9c7h3ZiTLVz5fkGnnMNI4TQCnwY+HN2pi7WrMauqNmqr4UQXgsh/JsQQlfy2Lq/e2CMtf+/bXVMNyiE8PshhDHg48BP4+fZrmXjpN3oiRjj/cDDQAvwmTrPR9rNPN8a3+9SvvbC2jSPK2uWizE+RHm77N3Ar9drYnqjGOPPxRhzwK9Rvm5Mu5SNU+2MA8MhhDaA5PcjyeO6iVa3E8UYF4HPAW+n/K9plS0PIYQBoBRjPLeNMdXW9c6hnRhTDVzjfAPPuYaQhHrcAbwvxlhkZ+pizWroKjWrPs8uAb/PNc4zyitJ49sc0xbFGP8E+DHgJH6e7Uo2TjWSpAe9BjyTPPQM8GqMcbJ+s9p7Qgi9IYQDydctwPsp1+WbQHcI4fHk0OeBl5OvtzqmGrreObQTYzv/J9r9rnO+gedc3YUQPg68Bfh7SWMLO1MXa1YjV6tZCOFgCKE7+bodeA9r59lXgIdDCHck31f/3W91TJsUQkiFEEaqvn8ncA7w82yXaimVSvWew64RQriLckzkQeA85ZjIWN9Z7S0hhFuBLwJtya+/AX45xngmhPAY5QSafazF5U4kz9vSmLYmhPBp4KeAIWAKmI4x3nO9c2gnxrR5V6sZ8E6ucb4lz/Gcq5MQwj3Ad4DvAvPJwz+MMf79naiLNdu+a9UM+CTlv9sS0AF8A/jHMcZC8rx3Jce0Aa8CH4oxzm5nTJsTQsgCXwZ6gRXKTdNHYoz/28+z3cnGSZIkSZI24FY9SZIkSdqAjZMkSZIkbcDGSZIkSZI2YOMkSZIkSRuwcZIkSZKkDdg4SZIaUgjhhRDCr19nvBRCuL3G7/mBEMJ/ruVrSpJ2B+PIJUk7LoTwfuCfAPcCs5TvT/MS8O9ijFv6IAohlIA7Yozfv8rY14BHgGVgAfg68Aur95iqhRDCh4CfizE+vtGxkqTm54qTJGlHhRB+Ffgd4Dcp30A3CzwPvB3ovMZz2mrw1r8YY0wBdwJ9wG/V4DUlSXtUe70nIEnavUIIB4B/RfkO91+sGnoV+EDVcX8EzAOjwJPAu0IIzwInY4wfS475p8CvACXgY5udQ4zxXAjhi8CHq+b0u8BPAHPA7wH/OsZYvHIVKVnV+jDwq8AA8GfALwJ3AS8AHSGEArAcY+wLIfwk8ClgBLgE/FaM8VObnaskqXG54iRJ2kmPAl3Alzdx7D8EPg6kgVeqB0IIfwf4CPA0cAfwjs1OIIQwALybcrMG5abpAHAr5SbtOeBnrvMSfxd4GLgfeC/w4zHG/0t51ewvY4ypGGNfcuy/B34+xpimvC3xLzY7T0lSY3PFSZK0kwaAqRjj8uoDIYRvAHdTbqh+PMb49WToyzHG/5F8vRBCqH6d9wJ/GGP8TvIa/wJ4ZoP3/nQI4VOUr6n6GvAryRbA9wHHY4wzwEwI4d8CH6Tc9FzNJ2KMF4ALIYT/CjwAfOUaxy4Bd4cQ/jrGeB44v8EcJUlNwhUnSdJOmgYGQgiVf6iLMT6WrNBMs/5zaPw6r3PkivETm3jvX44x9sUYh2OMH4gxTlJu5DqveP4JYPg6r3O26us5IHWdY98N/CRwIoTw30IIj25inpKkJmDjJEnaSX8JLALv2sSx10vXO0P5uqFVuS3OZ4ryqtDoFa91aguv9Yb5xhj/Ksb4LiADfAl4eSuTlCQ1HrfqSZJ2TIzxQgjhXwKfCyG0UN7iNgccA3pv4KVeBv4whPDHwOvAP9/ifFZCCC8DHw8
hPAccohw4sZUAhwngaAihM8Z4OYTQCfwD4D/GGC+GEC4BK1uZpySp8bjiJEnaUTHGT1JuTv4ZkKfccLwIfBT4xiZf4z8Bv005bOH7bC904ZcoX/f0A8ohFH8G/MEWXucvgP8DnA0hTCWPfRB4PWmangee3cY8JUkNxBvgSpIkSdIGXHGSJEmSpA3YOEmSJEnSBmycJEmSJGkDNk6SJEmStAEbJ0mSJEnagI2TJEmSJG3AxkmSJEmSNmDjJEmSJEkbsHGSJEmSpA38fxuQeZJ2rtAMAAAAAElFTkSuQmCC\n", "text/plain": ["<Figure size 1008x432 with 2 Axes>"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n", "df_ldst.set_index(\"Grid Points\")[\"PM_LD_CMPL (min)\"].plot(ax=ax1, legend=True);\n", "df_ldst.set_index(\"Grid Points\")[\"PM_ST_CMPL (min)\"].plot(ax=ax2, legend=True);"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Also this behaviour looks \u2013\u00a0at a first glance \u2013\u00a0linear. We can again fit a first-order polynom (and re-use our previously defined function `curve_fit`)!"]}, {"cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Counter PM_LD_CMPL (min) is proportional to the grid points (nx*ny) by a factor of 2.3437 (\u00b1 0.000037)\n", "Counter PM_ST_CMPL (min) is proportional to the grid points (nx*ny) by a factor of 0.5860 (\u00b1 0.000019)\n"]}], "source": ["_fit, _cov = common.print_and_return_fit(\n", "    [\"PM_LD_CMPL (min)\", \"PM_ST_CMPL (min)\"], \n", "    df_ldst.set_index(\"Grid Points\"), \n", "    linear_function,\n", "    format_value=\".4f\"\n", ")\n", "fit_parameters = {**fit_parameters, **_fit}\n", "fit_covariance = {**fit_covariance, **_cov}"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Let's overlay this in one common plot:"]}, {"cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [{"data": {"image/png": "\n", "text/plain": ["<Figure size 1008x432 with 2 Axes>"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n", "for ax, 
pmu_counter in zip([ax1, ax2], [\"PM_LD_CMPL (min)\", \"PM_ST_CMPL (min)\"]):\n", "    df_ldst.set_index(\"Grid Points\")[pmu_counter].plot(ax=ax, legend=True);\n", "    ax.plot(\n", "        df_ldst[\"Grid Points\"], \n", "        linear_function(df[\"Grid Points\"], *fit_parameters[pmu_counter]), \n", "        linestyle=\"--\", \n", "        label=\"Fit: {:.2f} * x + {:.2f}\".format(*fit_parameters[pmu_counter])\n", "    )\n", "    ax.legend();"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Did you expect more?\n", "\n", "The reason is simple: Among the load and store instructions counted by `PM_LD_CMPL` and `PM_ST_CMPL` are vector instructions which can load and store multiple (in this case: two) values at a time. To see how many *bytes* are loaded and stored, we need to measure counters for vectorized loads and stores as well.\n", "\n", "### TASK B\n", "<a name=\"task2-b\"></a>\n", "\n", "Please measure counters for _vectorized_ loads and _vectorized_ stores. See the TODOs in [`poisson2d.vld.c`](poisson2d.vld.c) and [`poisson2d.vst.c`](poisson2d.vst.c) (*Note: These vector counters cannot be measured together and need separate files and runs*). 
Can you find out the name of the counters yourself, using `papi_native_avail | grep VECTOR_`?\n", "\n", "Compile, test, and bench-run your program again.\n", "\n", "[Back to top](#toc)"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["| PM_VECTOR_FLOP_CMPL                                                          |\n", "| PM_VECTOR_LD_CMPL                                                            |\n", "| PM_VECTOR_ST_CMPL                                                            |\n"]}], "source": ["!papi_native_avail | grep VECTOR_"]}, {"cell_type": "markdown", "metadata": {}, "source": ["`make bench_task3` will submit benchmark runs of both vectorized counters to the batch system (as two subsequent runs of the individual files)."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vld.bin /gpfs/wolf/trn003/scratch/aherten//poisson2d.vld.bin.csv\n", "Job <24641> is submitted to default queue <batch>.\n", "<<Waiting for dispatch ...>>\n", "<<Starting on login1>>\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,4,0.0010,0,0,0\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,8,0.0011,114000,570,570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,12,0.0012,174000,870,870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,16,0.0012,234000,1170,1170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,20,0.0013,294000,1470,1470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", 
"200,32,24,0.0014,354000,1770,1770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,28,0.0014,414000,2070,2070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,32,0.0015,474000,2370,2370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,36,0.0016,534000,2670,2670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,40,0.0016,594000,2970,2970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,44,0.0017,654000,3270,3270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,48,0.0018,714000,3570,3570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,52,0.0018,774000,3870,3870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,56,0.0019,834000,4170,4170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,60,0.0020,894000,4470,4470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,64,0.0021,954000,4770,4770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,68,0.0022,1014000,5070,5070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,72,0.0022,1074000,5370,5370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,76,0.0022,1134000,5670,5670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,80,0.0023,1194000,5970,5970\n", 
"iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,84,0.0024,1254000,6270,6270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,88,0.0024,1314000,6570,6570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,92,0.0025,1374000,6870,6870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,96,0.0027,1434000,7170,7170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,100,0.0026,1494000,7470,7470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,104,0.0029,1554000,7770,7770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,108,0.0027,1614000,8070,8070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,112,0.0028,1674000,8370,8370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,116,0.0029,1734000,8670,8670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,120,0.0029,1794000,8970,8970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,124,0.0030,1854000,9270,9270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,128,0.0032,1914000,9570,9570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,132,0.0031,1974000,9870,9870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,136,0.0032,2034000,10170,10170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL 
(total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,140,0.0033,2094000,10470,10470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,144,0.0033,2154000,10770,10770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,148,0.0034,2214000,11070,11070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,152,0.0036,2274000,11370,11370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,156,0.0035,2334000,11670,11670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,160,0.0036,2394000,11970,11970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,164,0.0037,2454000,12270,12270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,168,0.0037,2514000,12570,12570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,172,0.0038,2574000,12870,12870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,176,0.0039,2634000,13170,13170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,180,0.0039,2694000,13470,13470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,184,0.0040,2754000,13770,13770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,188,0.0041,2814000,14070,14070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,192,0.0041,2874000,14370,14370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL 
(total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,196,0.0042,2934000,14670,14670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,200,0.0042,2994000,14970,14970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,204,0.0043,3054000,15270,15270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,208,0.0045,3114000,15570,15570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,212,0.0045,3174000,15870,15870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,216,0.0045,3234000,16170,16170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,220,0.0046,3294000,16470,16470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,224,0.0048,3354000,16770,16770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,228,0.0047,3414000,17070,17070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,232,0.0048,3474000,17370,17370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,236,0.0048,3534000,17670,17670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,240,0.0049,3594000,17970,17970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,244,0.0050,3654000,18270,18270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,248,0.0052,3714000,18570,18570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL 
(total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,252,0.0051,3774000,18870,18870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,256,0.0052,3834000,19170,19170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,260,0.0052,3894000,19470,19470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,264,0.0053,3954000,19770,19770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,268,0.0054,4014000,20070,20070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,272,0.0054,4074000,20370,20370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,276,0.0055,4134000,20670,20670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,280,0.0056,4194000,20970,20970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,284,0.0056,4254000,21270,21270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,288,0.0057,4314000,21570,21570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,292,0.0058,4374000,21870,21870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,296,0.0058,4434000,22170,22170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,300,0.0059,4494000,22470,22470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,304,0.0059,4554000,22770,22770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL 
(total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,308,0.0060,4614000,23070,23070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,312,0.0061,4674000,23370,23370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,316,0.0062,4734000,23670,23670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,320,0.0062,4794000,23970,23970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,324,0.0063,4854000,24270,24270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,328,0.0063,4914000,24570,24570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,332,0.0064,4974000,24870,24870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,336,0.0065,5034000,25170,25170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,340,0.0065,5094000,25470,25470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,344,0.0066,5154000,25770,25770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,348,0.0069,5214000,26070,26070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,352,0.0068,5274000,26370,26370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,356,0.0070,5334000,26670,26670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,360,0.0069,5394000,26970,26970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL 
(total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,364,0.0070,5454000,27270,27270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,368,0.0070,5514000,27570,27570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,372,0.0071,5574000,27870,27870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,376,0.0073,5634000,28170,28170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,380,0.0073,5694000,28470,28470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,384,0.0073,5754000,28770,28770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,388,0.0074,5814000,29070,29070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,392,0.0074,5874000,29370,29370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,396,0.0076,5934000,29670,29670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,400,0.0075,5994000,29970,29970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,404,0.0076,6054000,30270,30270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,408,0.0077,6114000,30570,30570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,412,0.0078,6174000,30870,30870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,416,0.0079,6234000,31170,31170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL 
(total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,420,0.0079,6294000,31470,31470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,424,0.0079,6354000,31770,31770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,428,0.0080,6414000,32070,32070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,432,0.0080,6474000,32370,32370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,436,0.0081,6534000,32670,32670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,440,0.0082,6594000,32970,32970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,444,0.0083,6654000,33270,33270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,448,0.0084,6714000,33570,33570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,452,0.0084,6774000,33870,33870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,456,0.0084,6834000,34170,34170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,460,0.0085,6894000,34470,34470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,464,0.0086,6954000,34770,34770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,468,0.0087,7014000,35070,35070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,472,0.0088,7074000,35370,35370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL 
(total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,476,0.0088,7134000,35670,35670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,480,0.0089,7194000,35970,35970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,484,0.0090,7254000,36270,36270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,488,0.0091,7314000,36570,36570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,492,0.0091,7374000,36870,36870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,496,0.0091,7434000,37170,37170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,500,0.0094,7494000,37470,37470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,504,0.0093,7554000,37770,37770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,508,0.0095,7614000,38070,38070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,512,0.0096,7674000,38370,38370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,516,0.0095,7734000,38670,38670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,520,0.0095,7794000,38970,38970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,524,0.0097,7854000,39270,39270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,528,0.0097,7914000,39570,39570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL 
(total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,532,0.0098,7974000,39870,39870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,536,0.0098,8034000,40170,40170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,540,0.0099,8094000,40470,40470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,544,0.0100,8154000,40770,40770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,548,0.0101,8214000,41070,41070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,552,0.0101,8274000,41370,41370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,556,0.0104,8334000,41670,41670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,560,0.0103,8394000,41970,41970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,564,0.0103,8454000,42270,42270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,568,0.0106,8514000,42570,42570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,572,0.0105,8574000,42870,42870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,576,0.0106,8634000,43170,43170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,580,0.0108,8694000,43470,43470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,584,0.0109,8754000,43770,43770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL 
(total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,588,0.0108,8814000,44070,44070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,592,0.0109,8874000,44370,44370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,596,0.0109,8934000,44670,44670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,600,0.0110,8994000,44970,44970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,604,0.0111,9054000,45270,45270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,608,0.0112,9114000,45570,45570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,612,0.0112,9174000,45870,45870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,616,0.0114,9234000,46170,46170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,620,0.0113,9294000,46470,46470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,624,0.0114,9354000,46770,46770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,628,0.0117,9414000,47070,47070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,632,0.0116,9474000,47370,47370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,636,0.0116,9534000,47670,47670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,640,0.0117,9594000,47970,47970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL 
(total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,644,0.0119,9654000,48270,48270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,648,0.0118,9714000,48570,48570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,652,0.0119,9774000,48870,48870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,656,0.0119,9834000,49170,49170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,660,0.0121,9894000,49470,49470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,664,0.0122,9954000,49770,49770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,668,0.0123,10014000,50070,50070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,672,0.0122,10074000,50370,50370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,676,0.0123,10134000,50670,50670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,680,0.0123,10194000,50970,50970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,684,0.0125,10254000,51270,51270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,688,0.0125,10314000,51570,51570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,692,0.0127,10374000,51870,51870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,696,0.0126,10434000,52170,52170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL 
(total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,700,0.0127,10494000,52470,52470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,704,0.0128,10554000,52770,52770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,708,0.0129,10614000,53070,53070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,712,0.0128,10674000,53370,53370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,716,0.0131,10734000,53670,53670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,720,0.0130,10794000,53970,53970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,724,0.0130,10854000,54270,54270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,728,0.0132,10914000,54570,54570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,732,0.0133,10974000,54870,54870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,736,0.0135,11034000,55170,55170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,740,0.0135,11094000,55470,55470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,744,0.0135,11154000,55770,55770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,748,0.0134,11214000,56070,56070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,752,0.0135,11274000,56370,56370\n", 
"iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,756,0.0136,11334000,56670,56670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,760,0.0137,11394000,56970,56970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,764,0.0137,11454000,57270,57270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,768,0.0138,11514000,57570,57570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,772,0.0139,11574000,57870,57870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,776,0.0141,11634000,58170,58170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,780,0.0140,11694000,58470,58470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,784,0.0142,11754000,58770,58770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,788,0.0141,11814000,59070,59070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,792,0.0142,11874000,59370,59370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,796,0.0143,11934000,59670,59670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,800,0.0143,11994000,59970,59970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,804,0.0145,12054000,60270,60270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", 
"200,32,808,0.0145,12114000,60570,60570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,812,0.0145,12174000,60870,60870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,816,0.0148,12234000,61170,61170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,820,0.0148,12294000,61470,61470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,824,0.0148,12354000,61770,61770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,828,0.0148,12414000,62070,62070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,832,0.0149,12474000,62370,62370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,836,0.0150,12534000,62670,62670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,840,0.0150,12594000,62970,62970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,844,0.0151,12654000,63270,63270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,848,0.0153,12714000,63570,63570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,852,0.0153,12774000,63870,63870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,856,0.0153,12834000,64170,64170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,860,0.0154,12894000,64470,64470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL 
(max)\n", "200,32,864,0.0154,12954000,64770,64770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,868,0.0155,13014000,65070,65070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,872,0.0157,13074000,65370,65370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,876,0.0156,13134000,65670,65670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,880,0.0157,13194000,65970,65970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,884,0.0157,13254000,66270,66270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,888,0.0158,13314000,66570,66570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,892,0.0159,13374000,66870,66870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,896,0.0160,13434000,67170,67170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,900,0.0160,13494000,67470,67470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,904,0.0162,13554000,67770,67770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,908,0.0162,13614000,68070,68070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,912,0.0163,13674000,68370,68370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,916,0.0163,13734000,68670,68670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), 
PM_VECTOR_LD_CMPL (max)\n", "200,32,920,0.0164,13794000,68970,68970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,924,0.0165,13854000,69270,69270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,928,0.0166,13914000,69570,69570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,932,0.0166,13974000,69870,69870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,936,0.0167,14034000,70170,70170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,940,0.0167,14094000,70470,70470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,944,0.0168,14154000,70770,70770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,948,0.0170,14214000,71070,71070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,952,0.0171,14274000,71370,71370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,956,0.0171,14334000,71670,71670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,960,0.0171,14394000,71970,71970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,964,0.0175,14454000,72270,72270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,968,0.0176,14514000,72570,72570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,972,0.0176,14574000,72870,72870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL 
(min), PM_VECTOR_LD_CMPL (max)\n", "200,32,976,0.0175,14634000,73170,73170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,980,0.0178,14694000,73470,73470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,984,0.0180,14754000,73770,73770\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,988,0.0178,14814000,74070,74070\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,992,0.0179,14874000,74370,74370\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,996,0.0181,14934000,74670,74670\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,1000,0.0180,14994000,74970,74970\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,1004,0.0182,15054000,75270,75270\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,1008,0.0181,15114000,75570,75570\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,1012,0.0183,15174000,75870,75870\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,1016,0.0183,15234000,76170,76170\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,1020,0.0186,15294000,76470,76470\n", "iter,ny,nx,Runtime,PM_VECTOR_LD_CMPL (total),PM_VECTOR_LD_CMPL (min), PM_VECTOR_LD_CMPL (max)\n", "200,32,1024,0.0182,15354000,76770,76770\n", "mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.vld.bin.csv .\n", "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vst.bin 
/gpfs/wolf/trn003/scratch/aherten//poisson2d.vst.bin.csv\n", "Job <24642> is submitted to default queue <batch>.\n", "<<Waiting for dispatch ...>>\n", "<<Starting on login1>>\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,4,0.0010,200,1,1\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,8,0.0011,18200,91,91\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,12,0.0012,30200,151,151\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,16,0.0012,42200,211,211\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,20,0.0013,54200,271,271\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,24,0.0013,66200,331,331\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,28,0.0014,78200,391,391\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,32,0.0015,90200,451,451\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,36,0.0015,102200,511,511\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,40,0.0016,114200,571,571\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,44,0.0017,126200,631,631\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,48,0.0017,138200,691,691\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,52,0.0018,150200,751,751\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), 
PM_VECTOR_ST_CMPL (max)\n", "200,32,56,0.0019,162200,811,811\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,60,0.0020,174200,871,871\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,64,0.0020,186200,931,931\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,68,0.0022,198200,991,991\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,72,0.0023,210200,1051,1051\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,76,0.0022,222200,1111,1111\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,80,0.0023,234200,1171,1171\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,84,0.0024,246200,1231,1231\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,88,0.0024,258200,1291,1291\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,92,0.0025,270200,1351,1351\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,96,0.0025,282200,1411,1411\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,100,0.0026,294200,1471,1471\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,104,0.0027,306200,1531,1531\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,108,0.0028,318200,1591,1591\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,112,0.0028,330200,1651,1651\n", 
"iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,116,0.0029,342200,1711,1711\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,120,0.0030,354200,1771,1771\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,124,0.0030,366200,1831,1831\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,128,0.0031,378200,1891,1891\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,132,0.0032,390200,1951,1951\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,136,0.0032,402200,2011,2011\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,140,0.0033,414200,2071,2071\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,144,0.0033,426200,2131,2131\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,148,0.0035,438200,2191,2191\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,152,0.0035,450200,2251,2251\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,156,0.0035,462200,2311,2311\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,160,0.0036,474200,2371,2371\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,164,0.0038,486200,2431,2431\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,168,0.0037,498200,2491,2491\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL 
(total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,172,0.0038,510200,2551,2551\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,176,0.0038,522200,2611,2611\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,180,0.0039,534200,2671,2671\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,184,0.0040,546200,2731,2731\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,188,0.0040,558200,2791,2791\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,192,0.0041,570200,2851,2851\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,196,0.0042,582200,2911,2911\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,200,0.0044,594200,2971,2971\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,204,0.0043,606200,3031,3031\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,208,0.0044,618200,3091,3091\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,212,0.0044,630200,3151,3151\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,216,0.0045,642200,3211,3211\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,220,0.0046,654200,3271,3271\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,224,0.0046,666200,3331,3331\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL 
(max)\n", "200,32,228,0.0047,678200,3391,3391\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,232,0.0048,690200,3451,3451\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,236,0.0048,702200,3511,3511\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,240,0.0049,714200,3571,3571\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,244,0.0050,726200,3631,3631\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,248,0.0050,738200,3691,3691\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,252,0.0051,750200,3751,3751\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,256,0.0052,762200,3811,3811\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,260,0.0052,774200,3871,3871\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,264,0.0053,786200,3931,3931\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,268,0.0054,798200,3991,3991\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,272,0.0054,810200,4051,4051\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,276,0.0055,822200,4111,4111\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,280,0.0055,834200,4171,4171\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,284,0.0056,846200,4231,4231\n", 
"iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,288,0.0057,858200,4291,4291\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,292,0.0057,870200,4351,4351\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,296,0.0058,882200,4411,4411\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,300,0.0059,894200,4471,4471\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,304,0.0059,906200,4531,4531\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,308,0.0060,918200,4591,4591\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,312,0.0061,930200,4651,4651\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,316,0.0061,942200,4711,4711\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,320,0.0062,954200,4771,4771\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,324,0.0063,966200,4831,4831\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,328,0.0063,978200,4891,4891\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,332,0.0064,990200,4951,4951\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,336,0.0065,1002200,5011,5011\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,340,0.0066,1014200,5071,5071\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL 
(total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,344,0.0066,1026200,5131,5131\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,348,0.0067,1038200,5191,5191\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,352,0.0069,1050200,5251,5251\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,356,0.0068,1062200,5311,5311\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,360,0.0068,1074200,5371,5371\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,364,0.0069,1086200,5431,5431\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,368,0.0070,1098200,5491,5491\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,372,0.0071,1110200,5551,5551\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,376,0.0071,1122200,5611,5611\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,380,0.0072,1134200,5671,5671\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,384,0.0073,1146200,5731,5731\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,388,0.0073,1158200,5791,5791\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,392,0.0074,1170200,5851,5851\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,396,0.0075,1182200,5911,5911\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), 
PM_VECTOR_ST_CMPL (max)\n", "200,32,400,0.0075,1194200,5971,5971\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,404,0.0076,1206200,6031,6031\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,408,0.0077,1218200,6091,6091\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,412,0.0077,1230200,6151,6151\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,416,0.0080,1242200,6211,6211\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,420,0.0078,1254200,6271,6271\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,424,0.0079,1266200,6331,6331\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,428,0.0080,1278200,6391,6391\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,432,0.0081,1290200,6451,6451\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,436,0.0082,1302200,6511,6511\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,440,0.0082,1314200,6571,6571\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,444,0.0083,1326200,6631,6631\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,448,0.0083,1338200,6691,6691\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,452,0.0084,1350200,6751,6751\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", 
"200,32,456,0.0085,1362200,6811,6811\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,460,0.0085,1374200,6871,6871\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,464,0.0087,1386200,6931,6931\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,468,0.0086,1398200,6991,6991\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,472,0.0087,1410200,7051,7051\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,476,0.0088,1422200,7111,7111\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,480,0.0090,1434200,7171,7171\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,484,0.0089,1446200,7231,7231\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,488,0.0090,1458200,7291,7291\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,492,0.0092,1470200,7351,7351\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,496,0.0092,1482200,7411,7411\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,500,0.0092,1494200,7471,7471\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,504,0.0093,1506200,7531,7531\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,508,0.0094,1518200,7591,7591\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", 
"200,32,512,0.0095,1530200,7651,7651\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,516,0.0096,1542200,7711,7711\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,520,0.0096,1554200,7771,7771\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,524,0.0096,1566200,7831,7831\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,528,0.0097,1578200,7891,7891\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,532,0.0097,1590200,7951,7951\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,536,0.0098,1602200,8011,8011\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,540,0.0100,1614200,8071,8071\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,544,0.0099,1626200,8131,8131\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,548,0.0100,1638200,8191,8191\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,552,0.0101,1650200,8251,8251\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,556,0.0102,1662200,8311,8311\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,560,0.0102,1674200,8371,8371\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,564,0.0105,1686200,8431,8431\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", 
"200,32,568,0.0104,1698200,8491,8491\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,572,0.0105,1710200,8551,8551\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,576,0.0105,1722200,8611,8611\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,580,0.0108,1734200,8671,8671\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,584,0.0108,1746200,8731,8731\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,588,0.0109,1758200,8791,8791\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,592,0.0109,1770200,8851,8851\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,596,0.0109,1782200,8911,8911\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,600,0.0111,1794200,8971,8971\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,604,0.0111,1806200,9031,9031\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,608,0.0112,1818200,9091,9091\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,612,0.0112,1830200,9151,9151\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,616,0.0114,1842200,9211,9211\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,620,0.0113,1854200,9271,9271\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", 
"200,32,624,0.0114,1866200,9331,9331\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,628,0.0114,1878200,9391,9391\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,632,0.0116,1890200,9451,9451\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,636,0.0116,1902200,9511,9511\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,640,0.0117,1914200,9571,9571\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,644,0.0118,1926200,9631,9631\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,648,0.0118,1938200,9691,9691\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,652,0.0121,1950200,9751,9751\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,656,0.0121,1962200,9811,9811\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,660,0.0121,1974200,9871,9871\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,664,0.0121,1986200,9931,9931\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,668,0.0122,1998200,9991,9991\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,672,0.0122,2010200,10051,10051\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,676,0.0124,2022200,10111,10111\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", 
"200,32,680,0.0123,2034200,10171,10171\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,684,0.0124,2046200,10231,10231\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,688,0.0126,2058200,10291,10291\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,692,0.0127,2070200,10351,10351\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,696,0.0126,2082200,10411,10411\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,700,0.0128,2094200,10471,10471\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,704,0.0127,2106200,10531,10531\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,708,0.0128,2118200,10591,10591\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,712,0.0129,2130200,10651,10651\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,716,0.0130,2142200,10711,10711\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,720,0.0130,2154200,10771,10771\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,724,0.0131,2166200,10831,10831\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,728,0.0131,2178200,10891,10891\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,732,0.0132,2190200,10951,10951\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", 
"200,32,736,0.0134,2202200,11011,11011\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,740,0.0134,2214200,11071,11071\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,744,0.0134,2226200,11131,11131\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,748,0.0135,2238200,11191,11191\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,752,0.0136,2250200,11251,11251\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,756,0.0136,2262200,11311,11311\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,760,0.0137,2274200,11371,11371\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,764,0.0138,2286200,11431,11431\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,768,0.0138,2298200,11491,11491\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,772,0.0139,2310200,11551,11551\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,776,0.0139,2322200,11611,11611\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,780,0.0140,2334200,11671,11671\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,784,0.0141,2346200,11731,11731\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,788,0.0142,2358200,11791,11791\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", 
"200,32,792,0.0142,2370200,11851,11851\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,796,0.0144,2382200,11911,11911\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,800,0.0144,2394200,11971,11971\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,804,0.0144,2406200,12031,12031\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,808,0.0146,2418200,12091,12091\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,812,0.0146,2430200,12151,12151\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,816,0.0146,2442200,12211,12211\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,820,0.0147,2454200,12271,12271\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,824,0.0148,2466200,12331,12331\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,828,0.0149,2478200,12391,12391\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,832,0.0149,2490200,12451,12451\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,836,0.0150,2502200,12511,12511\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,840,0.0151,2514200,12571,12571\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,844,0.0152,2526200,12631,12631\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", 
"200,32,848,0.0151,2538200,12691,12691\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,852,0.0152,2550200,12751,12751\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,856,0.0153,2562200,12811,12811\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,860,0.0154,2574200,12871,12871\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,864,0.0155,2586200,12931,12931\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,868,0.0155,2598200,12991,12991\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,872,0.0156,2610200,13051,13051\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,876,0.0156,2622200,13111,13111\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,880,0.0157,2634200,13171,13171\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,884,0.0158,2646200,13231,13231\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,888,0.0159,2658200,13291,13291\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,892,0.0159,2670200,13351,13351\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,896,0.0160,2682200,13411,13411\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,900,0.0160,2694200,13471,13471\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", 
"200,32,904,0.0162,2706200,13531,13531\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,908,0.0162,2718200,13591,13591\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,912,0.0163,2730200,13651,13651\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,916,0.0163,2742200,13711,13711\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,920,0.0164,2754200,13771,13771\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,924,0.0165,2766200,13831,13831\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,928,0.0166,2778200,13891,13891\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,932,0.0168,2790200,13951,13951\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,936,0.0167,2802200,14011,14011\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,940,0.0169,2814200,14071,14071\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,944,0.0169,2826200,14131,14131\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,948,0.0169,2838200,14191,14191\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,952,0.0170,2850200,14251,14251\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,956,0.0170,2862200,14311,14311\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", 
"200,32,960,0.0171,2874200,14371,14371\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,964,0.0175,2886200,14431,14431\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,968,0.0175,2898200,14491,14491\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,972,0.0176,2910200,14551,14551\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,976,0.0176,2922200,14611,14611\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,980,0.0178,2934200,14671,14671\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,984,0.0178,2946200,14731,14731\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,988,0.0179,2958200,14791,14791\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,992,0.0178,2970200,14851,14851\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,996,0.0181,2982200,14911,14911\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,1000,0.0180,2994200,14971,14971\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,1004,0.0181,3006200,15031,15031\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,1008,0.0182,3018200,15091,15091\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,1012,0.0183,3030200,15151,15151\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", 
"200,32,1016,0.0183,3042200,15211,15211\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,1020,0.0184,3054200,15271,15271\n", "iter,ny,nx,Runtime,PM_VECTOR_ST_CMPL (total),PM_VECTOR_ST_CMPL (min), PM_VECTOR_ST_CMPL (max)\n", "200,32,1024,0.0182,3066200,15331,15331\n", "mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.vst.bin.csv .\n"]}], "source": ["!make bench_task3"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Let's plot it again, as soon as the run finishes! Non-interactively, call `graph_task2b`.\n", "\n", "*Because we couldn't measure the two vector counters at the same time, we have two CSV files to read in now. We combine them into one common dataframe `df_vldvst` in the following.*"]}, {"cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": ["df_vld = pd.read_csv(\"poisson2d.vld.bin.csv\", skiprows=range(2, 50000, 2))\n", "df_vst = pd.read_csv(\"poisson2d.vst.bin.csv\", skiprows=range(2, 50000, 2))\n", "df_vldvst = pd.concat([df_vld.set_index(\"nx\"), df_vst.set_index(\"nx\")[['PM_VECTOR_ST_CMPL (total)', 'PM_VECTOR_ST_CMPL (min)', ' PM_VECTOR_ST_CMPL (max)']]], axis=1).reset_index()"]}, {"cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<style scoped>\n", "    .dataframe tbody tr th:only-of-type {\n", "        vertical-align: middle;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: right;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>nx</th>\n", "      <th>iter</th>\n", "      <th>ny</th>\n", "      <th>Runtime</th>\n", "      <th>PM_VECTOR_LD_CMPL (total)</th>\n", "      <th>PM_VECTOR_LD_CMPL (min)</th>\n", "      <th>PM_VECTOR_LD_CMPL (max)</th>\n", "      
<th>PM_VECTOR_ST_CMPL (total)</th>\n", "      <th>PM_VECTOR_ST_CMPL (min)</th>\n", "      <th>PM_VECTOR_ST_CMPL (max)</th>\n", "      <th>Grid Points</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>4</td>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>0.0010</td>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>200</td>\n", "      <td>1</td>\n", "      <td>1</td>\n", "      <td>128</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>8</td>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>0.0011</td>\n", "      <td>114000</td>\n", "      <td>570</td>\n", "      <td>570</td>\n", "      <td>18200</td>\n", "      <td>91</td>\n", "      <td>91</td>\n", "      <td>256</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>12</td>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>0.0012</td>\n", "      <td>174000</td>\n", "      <td>870</td>\n", "      <td>870</td>\n", "      <td>30200</td>\n", "      <td>151</td>\n", "      <td>151</td>\n", "      <td>384</td>\n", "    </tr>\n", "    <tr>\n", "      <th>3</th>\n", "      <td>16</td>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>0.0012</td>\n", "      <td>234000</td>\n", "      <td>1170</td>\n", "      <td>1170</td>\n", "      <td>42200</td>\n", "      <td>211</td>\n", "      <td>211</td>\n", "      <td>512</td>\n", "    </tr>\n", "    <tr>\n", "      <th>4</th>\n", "      <td>20</td>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>0.0013</td>\n", "      <td>294000</td>\n", "      <td>1470</td>\n", "      <td>1470</td>\n", "      <td>54200</td>\n", "      <td>271</td>\n", "      <td>271</td>\n", "      <td>640</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["   nx  iter  ny  Runtime  PM_VECTOR_LD_CMPL (total)  PM_VECTOR_LD_CMPL (min)  \\\n", "0   4   200  32   0.0010                          0           
             0   \n", "1   8   200  32   0.0011                     114000                      570   \n", "2  12   200  32   0.0012                     174000                      870   \n", "3  16   200  32   0.0012                     234000                     1170   \n", "4  20   200  32   0.0013                     294000                     1470   \n", "\n", "    PM_VECTOR_LD_CMPL (max)  PM_VECTOR_ST_CMPL (total)  \\\n", "0                         0                        200   \n", "1                       570                      18200   \n", "2                       870                      30200   \n", "3                      1170                      42200   \n", "4                      1470                      54200   \n", "\n", "   PM_VECTOR_ST_CMPL (min)   PM_VECTOR_ST_CMPL (max)  Grid Points  \n", "0                        1                         1          128  \n", "1                       91                        91          256  \n", "2                      151                       151          384  \n", "3                      211                       211          512  \n", "4                      271                       271          640  "]}, "execution_count": 32, "metadata": {}, "output_type": "execute_result"}], "source": ["df_vldvst[\"Grid Points\"] = df_vldvst[\"nx\"] * df_vldvst[\"ny\"] \n", "df_vldvst.head()"]}, {"cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [{"data": {"image/png": "\n", "text/plain": ["<Figure size 1008x432 with 2 Axes>"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n", "df_vldvst.set_index(\"Grid Points\")[\"PM_VECTOR_LD_CMPL (min)\"].plot(ax=ax1, legend=True);\n", "df_vldvst.set_index(\"Grid Points\")[\"PM_VECTOR_ST_CMPL (min)\"].plot(ax=ax2, legend=True);"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Also here seems to be a linear correlation. 
Let's do our fitting and plot directly."]}, {"cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Counter PM_VECTOR_LD_CMPL (min) is proportional to the grid points (nx*ny) by a factor of 2.3439 (\u00b1 0.000111)\n", "Counter PM_VECTOR_ST_CMPL (min) is proportional to the grid points (nx*ny) by a factor of 0.4688 (\u00b1 0.000012)\n"]}], "source": ["_fit, _cov = common.print_and_return_fit(\n", "    [\"PM_VECTOR_LD_CMPL (min)\", \"PM_VECTOR_ST_CMPL (min)\"], \n", "    df_vldvst.set_index(\"Grid Points\"), \n", "    linear_function,\n", "    format_value=\".4f\",\n", ")\n", "fit_parameters = {**fit_parameters, **_fit}\n", "fit_covariance = {**fit_covariance, **_cov}"]}, {"cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [{"data": {"image/png": "\n", "text/plain": ["<Figure size 1008x432 with 2 Axes>"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n", "for ax, pmu_counter in zip([ax1, ax2], [\"PM_VECTOR_LD_CMPL (min)\", \"PM_VECTOR_ST_CMPL (min)\"]):\n", "    df_vldvst.set_index(\"Grid Points\")[pmu_counter].plot(ax=ax, legend=True);\n", "    ax.plot(\n", "        df_vldvst[\"Grid Points\"], \n", "        linear_function(df[\"Grid Points\"], *fit_parameters[pmu_counter]), \n", "        linestyle=\"--\", \n", "        label=\"Fit: {:.2f} * x + {:.2f}\".format(*fit_parameters[pmu_counter])\n", "    )\n", "    ax.legend();"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Let's try to make sense of those numbers.\n", "\n", "Vector loads and vector stores use two 8 Byte values at a time. When we measured loads and stores with `LD_CMPL` and `ST_CMPL` in part A of this task, we measured total number of stores and loads; that is: vector and scalar versions of the instructions. 
In order to convert the load and store instructions into **bytes** loaded and stored, we need to separate them. The difference of total instructions and vector instructions yields scalar instructions. We multiply the scalar instructions by 8 Byte (double precision) and the vector instructions by 16 Byte (two loads or stores of double precision). That yields the loaded or stored data (or, more precisely, the instruction-equivalent data).\n", "\n", "To formalize it, see the following equations, as an example for load ($ld$), with $b$ denoting data loaded in bytes and $n$ denoting the number of instructions.\n", "\n", "\\begin{align}\n", "b_\\text{ld} &= b_\\text{ld}^\\text{scalar} + b_\\text{ld}^\\text{vector}\\\\\n", "b_\\text{ld}^\\text{scalar} &= n_\\text{ld}^\\text{scalar} * 8\\,\\text{Byte} \\\\\n", "b_\\text{ld}^\\text{vector} &= n_\\text{ld}^\\text{vector} * 16\\,\\text{Byte} \\\\\n", "n_\\text{ld}^\\text{scalar} &= n_\\text{ld}^\\text{total} - n_\\text{ld}^\\text{vector}\\\\\n", "\\Rightarrow b_\\text{ld} &= n_\\text{ld}^\\text{scalar}* 8 \\,\\text{Byte} + n_\\text{ld}^\\text{vector} * 16\\,\\text{Byte} \\\\\n", "& = (n_\\text{ld}^\\text{scalar}+2 n_\\text{ld}^\\text{vector}) * 8\\,\\text{Byte} \\\\\n", "& = (n_\\text{ld}^\\text{total} - n_\\text{ld}^\\text{vector} + 2 n_\\text{ld}^\\text{vector}) * 8\\,\\text{Byte} \\\\\n", "& = (n_\\text{ld}^\\text{total} + n_\\text{ld}^\\text{vector}) *8\\,\\text{Byte} \n", "\\end{align}\n", "\n", "We are going to print this in the next cell. 
In case you look at this Notebook non-interactively, call `graph_task2b-2`."]}, {"cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [{"data": {"image/png": "\n", "text/plain": ["<Figure size 1008x432 with 1 Axes>"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["df_byte = pd.DataFrame()\n", "df_byte[\"Loads\"]  = (df_vldvst.set_index(\"Grid Points\")[\"PM_VECTOR_LD_CMPL (min)\"] + df_ldst.set_index(\"Grid Points\")[\"PM_LD_CMPL (min)\"])*8\n", "df_byte[\"Stores\"] = (df_vldvst.set_index(\"Grid Points\")[\"PM_VECTOR_ST_CMPL (min)\"] + df_ldst.set_index(\"Grid Points\")[\"PM_ST_CMPL (min)\"])*8\n", "ax = df_byte.plot()\n", "ax.set_ylabel(\"Bytes\");"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Let's quantify the difference by, again, fitting a linear function to the data."]}, {"cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Counter  Loads is proportional to the grid points (nx*ny) by a factor of 37.5010 (\u00b1 0.000592)\n", "Counter Stores is proportional to the grid points (nx*ny) by a factor of  8.4379 (\u00b1 0.000247)\n"]}], "source": ["_fit, _cov = common.print_and_return_fit(\n", "    [\"Loads\", \"Stores\"], \n", "    df_byte, \n", "    linear_function\n", ")\n", "fit_parameters = {**fit_parameters, **_fit}\n", "fit_covariance = {**fit_covariance, **_cov}"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Analogously to the proportionality factors before, these factors tell us how much data (in bytes) is loaded/stored per grid point."]}, {"cell_type": "markdown", "metadata": {}, "source": ["*Not really a* <a name=\"task2-c\"></a>**TASK C**: We can combine this information with the cycles measured in Task 1 to create a bandwidth of exchanged bytes per cycle."]}, {"cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": ["df_bandwidth = pd.DataFrame()\n", "df_bandwidth[\"Bandwidth / Byte/Cycle\"] = (df_byte[\"Loads\"] + 
df_byte[\"Stores\"]) / df.set_index(\"Grid Points\")[\"PM_RUN_CYC (min)\"]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Let's display it as a function of grid points. And also compare it to the available L1 cache bandwidth in a second (sub-)plot. Non-interactive users, call `make graph_task2c`."]}, {"cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [{"data": {"image/png": "\n", "text/plain": ["<Figure size 1008x432 with 2 Axes>"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)\n", "for ax in [ax1, ax2]:\n", "    df_bandwidth[\"Bandwidth / Byte/Cycle\"].plot(ax=ax, legend=True, label=\"Jacobi Bandwidth\")\n", "    ax.set_ylabel(\"Byte/Cycle\")\n", "ax2.axhline(2*16, color=sns.color_palette()[1], label=\"L1 Bandwidth\");\n", "ax2.legend();"]}, {"cell_type": "markdown", "metadata": {}, "source": ["As you can see, we are quite a bit away from the available L1 cache bandwidth. Can you think of reasons why?"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Task E1: Measuring FlOps\n", "<a name=\"taske1\"></a>\n", "\n", "If you still have time, feel free to work on the following extended task.\n", "\n", "\n", "**TASK**: Please measure counters for _vectorized_ floating point operations and _scalar_ floating point operations. As with the vector load and store counters, these two counters cannot be measured during the same run. So please see the TODOs in [`poisson2d.sflops.c`](/edit/Tasks/poisson2d.sflops.c) and [`poisson2d.vflops.c`](/edit/Tasks/poisson2d.vflops.c). 
By now you should be able to find out the names of the counters by yourself (*Hint: they include the words \u00bbscalar\u00ab and \u00bbvector\u00ab\u2026*).\n", "\n", "As usual, compile, test, and bench-run your program.\n", "\n", "[Back to top](#toc)"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.sflop.bin /gpfs/wolf/trn003/scratch/aherten//poisson2d.sflop.bin.csv\n", "Job <24645> is submitted to default queue <batch>.\n", "<<Waiting for dispatch ...>>\n", "<<Starting on login1>>\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,4,0.0010,96000,480,480\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,8,0.0011,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,12,0.0012,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,16,0.0012,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,20,0.0013,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,24,0.0013,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,28,0.0014,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,32,0.0015,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,36,0.0015,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,40,0.0016,0,0,0\n", 
"iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,44,0.0017,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,48,0.0017,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,52,0.0018,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,56,0.0022,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,60,0.0019,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,64,0.0021,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,68,0.0022,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,72,0.0021,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,76,0.0022,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,80,0.0023,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,84,0.0025,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,88,0.0024,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,92,0.0025,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,96,0.0025,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,100,0.0026,0,0,0\n", 
"iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,104,0.0027,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,108,0.0027,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,112,0.0028,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,116,0.0028,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,120,0.0031,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,124,0.0030,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,128,0.0030,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,132,0.0031,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,136,0.0032,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,140,0.0032,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,144,0.0033,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,148,0.0034,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,152,0.0035,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,156,0.0035,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", 
"200,32,160,0.0036,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,164,0.0036,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,168,0.0037,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,172,0.0038,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,176,0.0038,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,180,0.0039,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,184,0.0040,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,188,0.0040,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,192,0.0041,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,196,0.0042,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,200,0.0042,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,204,0.0043,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,208,0.0043,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,212,0.0044,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,216,0.0045,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL 
(max)\n", "200,32,220,0.0045,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,224,0.0046,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,228,0.0047,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,232,0.0047,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,236,0.0048,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,240,0.0049,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,244,0.0049,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,248,0.0051,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,252,0.0051,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,256,0.0053,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,260,0.0052,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,264,0.0053,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,268,0.0054,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,272,0.0054,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,276,0.0054,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), 
PM_SCALAR_FLOP_CMPL (max)\n", "200,32,280,0.0055,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,284,0.0056,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,288,0.0056,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,292,0.0057,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,296,0.0058,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,300,0.0058,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,304,0.0059,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,308,0.0060,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,312,0.0060,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,316,0.0062,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,320,0.0062,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,324,0.0062,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,328,0.0063,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,332,0.0064,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,336,0.0065,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL 
(total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,340,0.0065,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,344,0.0066,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,348,0.0066,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,352,0.0067,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,356,0.0068,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,360,0.0069,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,364,0.0069,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,368,0.0070,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,372,0.0072,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,376,0.0071,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,380,0.0071,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,384,0.0072,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,388,0.0073,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,392,0.0074,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,396,0.0076,0,0,0\n", 
"iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,400,0.0075,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,404,0.0076,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,408,0.0076,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,412,0.0077,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,416,0.0078,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,420,0.0078,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,424,0.0079,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,428,0.0079,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,432,0.0080,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,436,0.0081,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,440,0.0082,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,444,0.0082,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,448,0.0084,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,452,0.0083,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", 
"200,32,456,0.0084,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,460,0.0085,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,464,0.0085,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,468,0.0086,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,472,0.0087,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,476,0.0089,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,480,0.0088,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,484,0.0089,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,488,0.0089,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,492,0.0090,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,496,0.0091,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,500,0.0092,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,504,0.0092,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,508,0.0093,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,512,0.0094,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL 
(max)\n", "200,32,516,0.0094,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,520,0.0095,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,524,0.0096,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,528,0.0096,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,532,0.0098,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,536,0.0097,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,540,0.0098,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,544,0.0099,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,548,0.0100,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,552,0.0101,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,556,0.0101,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,560,0.0102,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,564,0.0103,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,568,0.0104,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,572,0.0105,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), 
PM_SCALAR_FLOP_CMPL (max)\n", "200,32,576,0.0105,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,580,0.0106,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,584,0.0107,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,588,0.0107,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,592,0.0108,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,596,0.0109,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,600,0.0110,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,604,0.0111,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,608,0.0111,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,612,0.0112,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,616,0.0112,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,620,0.0113,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,624,0.0114,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,628,0.0115,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,632,0.0115,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL 
(total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,636,0.0115,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,640,0.0116,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,644,0.0118,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,648,0.0117,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,652,0.0119,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,656,0.0119,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,660,0.0121,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,664,0.0120,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,668,0.0122,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,672,0.0121,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,676,0.0124,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,680,0.0123,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,684,0.0125,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,688,0.0124,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,692,0.0125,0,0,0\n", 
"iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,696,0.0126,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,700,0.0127,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,704,0.0126,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,708,0.0127,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,712,0.0129,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,716,0.0128,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,720,0.0129,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,724,0.0132,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,728,0.0131,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,732,0.0131,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,736,0.0133,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,740,0.0133,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,744,0.0133,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,748,0.0134,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", 
"200,32,752,0.0136,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,756,0.0136,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,760,0.0136,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,764,0.0136,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,768,0.0138,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,772,0.0138,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,776,0.0139,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,780,0.0139,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,784,0.0140,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,788,0.0140,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,792,0.0141,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,796,0.0142,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,800,0.0143,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,804,0.0143,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,808,0.0144,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL 
(max)\n", "200,32,812,0.0144,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,816,0.0145,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,820,0.0146,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,824,0.0148,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,828,0.0147,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,832,0.0148,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,836,0.0149,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,840,0.0150,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,844,0.0150,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,848,0.0150,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,852,0.0151,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,856,0.0152,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,860,0.0152,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,864,0.0153,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,868,0.0154,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), 
PM_SCALAR_FLOP_CMPL (max)\n", "200,32,872,0.0156,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,876,0.0156,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,880,0.0156,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,884,0.0157,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,888,0.0157,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,892,0.0158,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,896,0.0159,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,900,0.0159,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,904,0.0161,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,908,0.0162,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,912,0.0164,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,916,0.0163,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,920,0.0164,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,924,0.0165,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,928,0.0166,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL 
(total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,932,0.0166,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,936,0.0167,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,940,0.0167,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,944,0.0168,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,948,0.0169,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,952,0.0172,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,956,0.0171,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,960,0.0172,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,964,0.0175,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,968,0.0175,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,972,0.0176,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,976,0.0177,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,980,0.0178,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,984,0.0178,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,988,0.0179,0,0,0\n", 
"iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,992,0.0179,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,996,0.0182,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,1000,0.0181,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,1004,0.0182,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,1008,0.0182,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,1012,0.0184,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,1016,0.0184,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,1020,0.0186,0,0,0\n", "iter,ny,nx,Runtime,PM_SCALAR_FLOP_CMPL (total),PM_SCALAR_FLOP_CMPL (min), PM_SCALAR_FLOP_CMPL (max)\n", "200,32,1024,0.0182,0,0,0\n", "mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.sflop.bin.csv .\n", "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./bench.sh poisson2d.vflop.bin /gpfs/wolf/trn003/scratch/aherten//poisson2d.vflop.bin.csv\n", "Job <24646> is submitted to default queue <batch>.\n", "<<Waiting for dispatch ...>>\n", "<<Starting on login1>>\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,4,0.0010,0,0,0\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,8,0.0011,150000,750,750\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,12,0.0012,246000,1230,1230\n", 
"iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,16,0.0012,342000,1710,1710\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,20,0.0013,438000,2190,2190\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,24,0.0013,534000,2670,2670\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,28,0.0014,630000,3150,3150\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,32,0.0015,726000,3630,3630\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,36,0.0016,822000,4110,4110\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,40,0.0016,918000,4590,4590\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,44,0.0017,1014000,5070,5070\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,48,0.0017,1110000,5550,5550\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,52,0.0018,1206000,6030,6030\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,56,0.0019,1302000,6510,6510\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,60,0.0019,1398000,6990,6990\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,64,0.0020,1494000,7470,7470\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", 
"200,32,68,0.0022,1590000,7950,7950\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,72,0.0021,1686000,8430,8430\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,76,0.0022,1782000,8910,8910\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,80,0.0023,1878000,9390,9390\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,84,0.0025,1974000,9870,9870\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,88,0.0024,2070000,10350,10350\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,92,0.0026,2166000,10830,10830\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,96,0.0025,2262000,11310,11310\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,100,0.0026,2358000,11790,11790\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,104,0.0027,2454000,12270,12270\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,108,0.0027,2550000,12750,12750\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,112,0.0029,2646000,13230,13230\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,116,0.0029,2742000,13710,13710\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,120,0.0029,2838000,14190,14190\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL 
(total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,124,0.0030,2934000,14670,14670\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,128,0.0031,3030000,15150,15150\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,132,0.0031,3126000,15630,15630\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,136,0.0032,3222000,16110,16110\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,140,0.0032,3318000,16590,16590\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,144,0.0033,3414000,17070,17070\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,148,0.0036,3510000,17550,17550\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,152,0.0035,3606000,18030,18030\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,156,0.0035,3702000,18510,18510\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,160,0.0036,3798000,18990,18990\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,164,0.0036,3894000,19470,19470\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,168,0.0037,3990000,19950,19950\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,172,0.0038,4086000,20430,20430\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", 
"200,32,176,0.0038,4182000,20910,20910\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,180,0.0039,4278000,21390,21390\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,184,0.0040,4374000,21870,21870\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,188,0.0041,4470000,22350,22350\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,192,0.0041,4566000,22830,22830\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,196,0.0042,4662000,23310,23310\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,200,0.0042,4758000,23790,23790\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,204,0.0043,4854000,24270,24270\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,208,0.0044,4950000,24750,24750\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,212,0.0044,5046000,25230,25230\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,216,0.0045,5142000,25710,25710\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,220,0.0046,5238000,26190,26190\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,224,0.0046,5334000,26670,26670\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,228,0.0048,5430000,27150,27150\n", 
"iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,232,0.0049,5526000,27630,27630\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,236,0.0048,5622000,28110,28110\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,240,0.0049,5718000,28590,28590\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,244,0.0049,5814000,29070,29070\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,248,0.0050,5910000,29550,29550\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,252,0.0051,6006000,30030,30030\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,256,0.0051,6102000,30510,30510\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,260,0.0052,6198000,30990,30990\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,264,0.0053,6294000,31470,31470\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,268,0.0054,6390000,31950,31950\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,272,0.0054,6486000,32430,32430\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,276,0.0054,6582000,32910,32910\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,280,0.0055,6678000,33390,33390\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), 
PM_VECTOR_FLOP_CMPL (max)\n", "200,32,284,0.0056,6774000,33870,33870\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,288,0.0057,6870000,34350,34350\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,292,0.0057,6966000,34830,34830\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,296,0.0058,7062000,35310,35310\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,300,0.0059,7158000,35790,35790\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,304,0.0059,7254000,36270,36270\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,308,0.0060,7350000,36750,36750\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,312,0.0062,7446000,37230,37230\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,316,0.0061,7542000,37710,37710\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,320,0.0062,7638000,38190,38190\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,324,0.0062,7734000,38670,38670\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,328,0.0063,7830000,39150,39150\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,332,0.0064,7926000,39630,39630\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,336,0.0065,8022000,40110,40110\n", 
"iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,340,0.0065,8118000,40590,40590\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,344,0.0066,8214000,41070,41070\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,348,0.0066,8310000,41550,41550\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,352,0.0067,8406000,42030,42030\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,356,0.0068,8502000,42510,42510\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,360,0.0068,8598000,42990,42990\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,364,0.0069,8694000,43470,43470\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,368,0.0070,8790000,43950,43950\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,372,0.0070,8886000,44430,44430\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,376,0.0071,8982000,44910,44910\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,380,0.0072,9078000,45390,45390\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,384,0.0072,9174000,45870,45870\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,388,0.0073,9270000,46350,46350\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), 
PM_VECTOR_FLOP_CMPL (max)\n", "200,32,392,0.0074,9366000,46830,46830\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,396,0.0074,9462000,47310,47310\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,400,0.0075,9558000,47790,47790\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,404,0.0075,9654000,48270,48270\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,408,0.0076,9750000,48750,48750\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,412,0.0077,9846000,49230,49230\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,416,0.0079,9942000,49710,49710\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,420,0.0078,10038000,50190,50190\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,424,0.0080,10134000,50670,50670\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,428,0.0080,10230000,51150,51150\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,432,0.0080,10326000,51630,51630\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,436,0.0083,10422000,52110,52110\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,440,0.0082,10518000,52590,52590\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", 
"200,32,444,0.0083,10614000,53070,53070\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,448,0.0083,10710000,53550,53550\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,452,0.0083,10806000,54030,54030\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,456,0.0084,10902000,54510,54510\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,460,0.0085,10998000,54990,54990\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,464,0.0085,11094000,55470,55470\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,468,0.0086,11190000,55950,55950\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,472,0.0087,11286000,56430,56430\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,476,0.0087,11382000,56910,56910\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,480,0.0088,11478000,57390,57390\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,484,0.0089,11574000,57870,57870\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,488,0.0089,11670000,58350,58350\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,492,0.0091,11766000,58830,58830\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,496,0.0091,11862000,59310,59310\n", 
"iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,500,0.0091,11958000,59790,59790\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,504,0.0092,12054000,60270,60270\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,508,0.0093,12150000,60750,60750\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,512,0.0094,12246000,61230,61230\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,516,0.0096,12342000,61710,61710\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,520,0.0096,12438000,62190,62190\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,524,0.0095,12534000,62670,62670\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,528,0.0098,12630000,63150,63150\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,532,0.0097,12726000,63630,63630\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,536,0.0097,12822000,64110,64110\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,540,0.0098,12918000,64590,64590\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,544,0.0100,13014000,65070,65070\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,548,0.0102,13110000,65550,65550\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL 
(total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,552,0.0102,13206000,66030,66030\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,556,0.0101,13302000,66510,66510\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,560,0.0103,13398000,66990,66990\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,564,0.0103,13494000,67470,67470\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,568,0.0104,13590000,67950,67950\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,572,0.0105,13686000,68430,68430\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,576,0.0105,13782000,68910,68910\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,580,0.0107,13878000,69390,69390\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,584,0.0108,13974000,69870,69870\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,588,0.0107,14070000,70350,70350\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,592,0.0108,14166000,70830,70830\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,596,0.0109,14262000,71310,71310\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,600,0.0110,14358000,71790,71790\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL 
(max)\n", "200,32,604,0.0110,14454000,72270,72270\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,608,0.0111,14550000,72750,72750\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,612,0.0114,14646000,73230,73230\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,616,0.0112,14742000,73710,73710\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,620,0.0113,14838000,74190,74190\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,624,0.0114,14934000,74670,74670\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,628,0.0116,15030000,75150,75150\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,632,0.0115,15126000,75630,75630\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,636,0.0117,15222000,76110,76110\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,640,0.0116,15318000,76590,76590\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,644,0.0118,15414000,77070,77070\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,648,0.0117,15510000,77550,77550\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,652,0.0119,15606000,78030,78030\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,656,0.0119,15702000,78510,78510\n", 
"iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,660,0.0120,15798000,78990,78990\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,664,0.0120,15894000,79470,79470\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,668,0.0121,15990000,79950,79950\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,672,0.0121,16086000,80430,80430\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,676,0.0123,16182000,80910,80910\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,680,0.0122,16278000,81390,81390\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,684,0.0125,16374000,81870,81870\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,688,0.0124,16470000,82350,82350\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,692,0.0126,16566000,82830,82830\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,696,0.0125,16662000,83310,83310\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,700,0.0127,16758000,83790,83790\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,704,0.0128,16854000,84270,84270\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,708,0.0128,16950000,84750,84750\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL 
(total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,712,0.0128,17046000,85230,85230\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,716,0.0128,17142000,85710,85710\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,720,0.0129,17238000,86190,86190\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,724,0.0130,17334000,86670,86670\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,728,0.0130,17430000,87150,87150\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,732,0.0132,17526000,87630,87630\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,736,0.0132,17622000,88110,88110\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,740,0.0133,17718000,88590,88590\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,744,0.0133,17814000,89070,89070\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,748,0.0134,17910000,89550,89550\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,752,0.0134,18006000,90030,90030\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,756,0.0136,18102000,90510,90510\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,760,0.0136,18198000,90990,90990\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL 
(max)\n", "200,32,764,0.0136,18294000,91470,91470\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,768,0.0137,18390000,91950,91950\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,772,0.0139,18486000,92430,92430\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,776,0.0139,18582000,92910,92910\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,780,0.0139,18678000,93390,93390\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,784,0.0140,18774000,93870,93870\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,788,0.0140,18870000,94350,94350\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,792,0.0142,18966000,94830,94830\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,796,0.0142,19062000,95310,95310\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,800,0.0144,19158000,95790,95790\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,804,0.0143,19254000,96270,96270\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,808,0.0144,19350000,96750,96750\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,812,0.0145,19446000,97230,97230\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,816,0.0145,19542000,97710,97710\n", 
"iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,820,0.0146,19638000,98190,98190\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,824,0.0147,19734000,98670,98670\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,828,0.0147,19830000,99150,99150\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,832,0.0148,19926000,99630,99630\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,836,0.0151,20022000,100110,100110\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,840,0.0150,20118000,100590,100590\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,844,0.0150,20214000,101070,101070\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,848,0.0151,20310000,101550,101550\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,852,0.0152,20406000,102030,102030\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,856,0.0152,20502000,102510,102510\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,860,0.0152,20598000,102990,102990\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,864,0.0153,20694000,103470,103470\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,868,0.0154,20790000,103950,103950\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL 
(total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,872,0.0155,20886000,104430,104430\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,876,0.0155,20982000,104910,104910\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,880,0.0157,21078000,105390,105390\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,884,0.0157,21174000,105870,105870\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,888,0.0158,21270000,106350,106350\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,892,0.0158,21366000,106830,106830\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,896,0.0159,21462000,107310,107310\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,900,0.0161,21558000,107790,107790\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,904,0.0162,21654000,108270,108270\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,908,0.0161,21750000,108750,108750\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,912,0.0163,21846000,109230,109230\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,916,0.0164,21942000,109710,109710\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,920,0.0165,22038000,110190,110190\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), 
PM_VECTOR_FLOP_CMPL (max)\n", "200,32,924,0.0164,22134000,110670,110670\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,928,0.0166,22230000,111150,111150\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,932,0.0166,22326000,111630,111630\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,936,0.0167,22422000,112110,112110\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,940,0.0168,22518000,112590,112590\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,944,0.0168,22614000,113070,113070\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,948,0.0169,22710000,113550,113550\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,952,0.0170,22806000,114030,114030\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,956,0.0170,22902000,114510,114510\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,960,0.0171,22998000,114990,114990\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,964,0.0176,23094000,115470,115470\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,968,0.0176,23190000,115950,115950\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,972,0.0177,23286000,116430,116430\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", 
"200,32,976,0.0177,23382000,116910,116910\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,980,0.0178,23478000,117390,117390\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,984,0.0178,23574000,117870,117870\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,988,0.0179,23670000,118350,118350\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,992,0.0180,23766000,118830,118830\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,996,0.0181,23862000,119310,119310\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,1000,0.0182,23958000,119790,119790\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,1004,0.0182,24054000,120270,120270\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,1008,0.0182,24150000,120750,120750\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,1012,0.0184,24246000,121230,121230\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,1016,0.0185,24342000,121710,121710\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,1020,0.0184,24438000,122190,122190\n", "iter,ny,nx,Runtime,PM_VECTOR_FLOP_CMPL (total),PM_VECTOR_FLOP_CMPL (min), PM_VECTOR_FLOP_CMPL (max)\n", "200,32,1024,0.0182,24534000,122670,122670\n", "mv /gpfs/wolf/trn003/scratch/aherten//poisson2d.vflop.bin.csv .\n"]}], "source": ["!make bench_task4"]}, {"cell_type": "code", 
"execution_count": 39, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<style scoped>\n", "    .dataframe tbody tr th:only-of-type {\n", "        vertical-align: middle;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: right;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>nx</th>\n", "      <th>iter</th>\n", "      <th>ny</th>\n", "      <th>Runtime</th>\n", "      <th>PM_SCALAR_FLOP_CMPL (total)</th>\n", "      <th>PM_SCALAR_FLOP_CMPL (min)</th>\n", "      <th>PM_SCALAR_FLOP_CMPL (max)</th>\n", "      <th>PM_VECTOR_FLOP_CMPL (total)</th>\n", "      <th>PM_VECTOR_FLOP_CMPL (min)</th>\n", "      <th>PM_VECTOR_FLOP_CMPL (max)</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>4</td>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>0.0010</td>\n", "      <td>96000</td>\n", "      <td>480</td>\n", "      <td>480</td>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>0</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>8</td>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>0.0011</td>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>150000</td>\n", "      <td>750</td>\n", "      <td>750</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>12</td>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>0.0012</td>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>246000</td>\n", "      <td>1230</td>\n", "      <td>1230</td>\n", "    </tr>\n", "    <tr>\n", "      <th>3</th>\n", "      <td>16</td>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>0.0012</td>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>0</td>\n", "    
  <td>342000</td>\n", "      <td>1710</td>\n", "      <td>1710</td>\n", "    </tr>\n", "    <tr>\n", "      <th>4</th>\n", "      <td>20</td>\n", "      <td>200</td>\n", "      <td>32</td>\n", "      <td>0.0013</td>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>438000</td>\n", "      <td>2190</td>\n", "      <td>2190</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["   nx  iter  ny  Runtime  PM_SCALAR_FLOP_CMPL (total)  \\\n", "0   4   200  32   0.0010                        96000   \n", "1   8   200  32   0.0011                            0   \n", "2  12   200  32   0.0012                            0   \n", "3  16   200  32   0.0012                            0   \n", "4  20   200  32   0.0013                            0   \n", "\n", "   PM_SCALAR_FLOP_CMPL (min)   PM_SCALAR_FLOP_CMPL (max)  \\\n", "0                        480                         480   \n", "1                          0                           0   \n", "2                          0                           0   \n", "3                          0                           0   \n", "4                          0                           0   \n", "\n", "   PM_VECTOR_FLOP_CMPL (total)  PM_VECTOR_FLOP_CMPL (min)  \\\n", "0                            0                          0   \n", "1                       150000                        750   \n", "2                       246000                       1230   \n", "3                       342000                       1710   \n", "4                       438000                       2190   \n", "\n", "    PM_VECTOR_FLOP_CMPL (max)  \n", "0                           0  \n", "1                         750  \n", "2                        1230  \n", "3                        1710  \n", "4                        2190  "]}, "execution_count": 39, "metadata": {}, "output_type": "execute_result"}], "source": ["df_sflop = pd.read_csv(\"poisson2d.sflop.bin.csv\", skiprows=range(2, 50000, 
2))\n", "df_vflop = pd.read_csv(\"poisson2d.vflop.bin.csv\", skiprows=range(2, 50000, 2))\n", "df_flop = pd.concat([df_sflop.set_index(\"nx\"), df_vflop.set_index(\"nx\")[['PM_VECTOR_FLOP_CMPL (total)', 'PM_VECTOR_FLOP_CMPL (min)', ' PM_VECTOR_FLOP_CMPL (max)']]], axis=1).reset_index()\n", "df_flop.head()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Again, the name of the vector counter is a bit misleading; not floating point operations are measured but floating point instructions. To get *real* floating point operations, each value needs to be multiplied by the vector width (2). We can plot the values afterwards (non-interactive: `make graph_task4`)."]}, {"cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": ["df_flop[\"Grid Points\"] = df_flop[\"nx\"] * df_flop[\"ny\"]\n", "df_flop[\"Vector FlOps (min)\"] = df_flop[\"PM_VECTOR_FLOP_CMPL (min)\"] * 2\n", "df_flop[\"Scalar FlOps (min)\"] = df_flop[\"PM_SCALAR_FLOP_CMPL (min)\"]"]}, {"cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [{"data": {"image/png": "\n", "text/plain": ["<Figure size 1008x432 with 1 Axes>"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["df_flop.set_index(\"Grid Points\")[[\"Scalar FlOps (min)\", \"Vector FlOps (min)\"]].plot();"]}, {"cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Counter Scalar FlOps (min) is proportional to the grid points (nx*ny) by a factor of -0.0003 (\u00b1 0.0002)\n", "Counter Vector FlOps (min) is proportional to the grid points (nx*ny) by a factor of  7.5004 (\u00b1 0.0002)\n"]}], "source": ["_fit, _cov = common.print_and_return_fit(\n", "    [\"Scalar FlOps (min)\", \"Vector FlOps (min)\"], \n", "    df_flop.set_index(\"Grid Points\"), \n", "    linear_function\n", ")\n", "fit_parameters = {**fit_parameters, **_fit}\n", "fit_covariance = {**fit_covariance, **_cov}"]}, {"cell_type": 
"markdown", "metadata": {"exercise": "solution"}, "source": ["Interesting! We seem to be using the vector registers of our system very well. Basically all operations are vector operations!"]}, {"cell_type": "markdown", "metadata": {}, "source": ["With that measured, we can determine the Arithmetic Intensity; the balance of floating point operations to bytes transmitted:\n", "\n", "\\begin{align}\n", "\\text{AI}^\\text{emp} = I_\\text{flop} / I_\\text{mem} \\text{,}\n", "\\end{align}\n", "\n", "with $I$ denoting the respective amount. This is the empirically determined Arithmetic Intensity.\n", "\n", "In the non-interactive version of the Notebook, please plot the graph calling `make graph_task4-2` in the terminal."]}, {"cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": ["I_flop_scalar = df_flop.set_index(\"Grid Points\")[\"Scalar FlOps (min)\"]\n", "I_flop_vector = df_flop.set_index(\"Grid Points\")[\"Vector FlOps (min)\"]\n", "I_mem_load    = df_byte[\"Loads\"]\n", "I_mem_store   = df_byte[\"Stores\"]"]}, {"cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [{"data": {"image/png": "\n", "text/plain": ["<Figure size 1008x432 with 1 Axes>"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["df_ai = pd.DataFrame()\n", "df_ai[\"Arithmetic Intensity\"] = (I_flop_scalar + I_flop_vector) / (I_mem_load + I_mem_store)\n", "ax = df_ai.plot();\n", "ax.set_ylabel(\"Byte/FlOp\");"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Thinking back to the first lecture of the tutorial, what Arithmetic Intensity did you expect?"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Task E2: Measuring a Larger Range\n", "<a name=\"taske2\"></a>\n", "\n", "If you still have time, you might venture into your own benchmarking adventure.\n", "\n", "Maybe you noticed already, for instance in Task 2 C: At the very right to very large numbers of grid points, the behaviour of the 
graph changed. Something is happening there!\n", "\n", "\n", "**TASK**: Revisit the counters measured above for a larger range of `nx`. Right now, we only studied `nx` until 1000. New effects appear above that value\u00a0\u2013\u00a0partly only well above, though ($nx > 15000$).\n", "\n", "You're on your own here. Edit the `bench.sh` script to change the range and the stepping increments.\n", "\n", "**Good luck!**\n", "\n", "[Back to top](#toc)"]}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0"}}, "nbformat": 4, "nbformat_minor": 4}
\ No newline at end of file
diff --git a/2-Performance_Counters/Handson/Solutions/Hands-On-Performance-Counters.pdf b/2-Performance_Counters/Handson/Solutions/Hands-On-Performance-Counters.pdf
index 0335248e742ef80058639dc237914d48fa62ff8b..24f50ef681567d697e5ce9e199dcbb65483c43fb 100644
Binary files a/2-Performance_Counters/Handson/Solutions/Hands-On-Performance-Counters.pdf and b/2-Performance_Counters/Handson/Solutions/Hands-On-Performance-Counters.pdf differ
diff --git a/2-Performance_Counters/Handson/Solutions/Makefile b/2-Performance_Counters/Handson/Solutions/Makefile
index e725a4125a9fcdf5c7adc4b7a1d989fea2c8ee67..1db4b2f76ed5e40ed11f543e3d3837e46fa33080 100644
--- a/2-Performance_Counters/Handson/Solutions/Makefile
+++ b/2-Performance_Counters/Handson/Solutions/Makefile
@@ -34,42 +34,42 @@ clean:
 	${RM} -f *.bin
 
 run_task1: poisson2d.ins_cyc.bin
-	$(SC18_SUBMIT_CMD) ./$< 200 1024
+	$(SC19_SUBMIT_CMD) ./$< 200 1024
 run_task2: poisson2d.ld_st.bin
-	$(SC18_SUBMIT_CMD) ./$< 200 1024
+	$(SC19_SUBMIT_CMD) ./$< 200 1024
 run_task3_1: poisson2d.vld.bin
-	$(SC18_SUBMIT_CMD) ./$< 200 1024
+	$(SC19_SUBMIT_CMD) ./$< 200 1024
 run_task3_2: poisson2d.vst.bin
-	$(SC18_SUBMIT_CMD) ./$< 200 1024
+	$(SC19_SUBMIT_CMD) ./$< 200 1024
 run_task3: run_task3_1 run_task3_2
 run_task4_1: poisson2d.sflop.bin
-	$(SC18_SUBMIT_CMD) ./$< 200 1024
+	$(SC19_SUBMIT_CMD) ./$< 200 1024
 run_task4_2: poisson2d.vflop.bin
-	$(SC18_SUBMIT_CMD) ./$< 200 1024
+	$(SC19_SUBMIT_CMD) ./$< 200 1024
 run_task4: run_task4_1 run_task4_2
 bench_task1: poisson2d.ins_cyc.bin
-	$(SC18_SUBMIT_CMD) ./bench.sh $< $(SC18_DIR_SCRATCH)/$<.csv
-	mv $(SC18_DIR_SCRATCH)/$<.csv .
+	$(SC19_SUBMIT_CMD) ./bench.sh $< $(SC19_DIR_SCRATCH)/$<.csv
+	mv $(SC19_DIR_SCRATCH)/$<.csv .
 bench_task2: poisson2d.ld_st.bin
-	$(SC18_SUBMIT_CMD) ./bench.sh $< $(SC18_DIR_SCRATCH)/$<.csv
-	mv $(SC18_DIR_SCRATCH)/$<.csv .
+	$(SC19_SUBMIT_CMD) ./bench.sh $< $(SC19_DIR_SCRATCH)/$<.csv
+	mv $(SC19_DIR_SCRATCH)/$<.csv .
 bench_task3_1: poisson2d.vld.bin
-	$(SC18_SUBMIT_CMD) ./bench.sh $< $(SC18_DIR_SCRATCH)/$<.csv
-	mv $(SC18_DIR_SCRATCH)/$<.csv .
+	$(SC19_SUBMIT_CMD) ./bench.sh $< $(SC19_DIR_SCRATCH)/$<.csv
+	mv $(SC19_DIR_SCRATCH)/$<.csv .
 bench_task3_2: poisson2d.vst.bin
-	$(SC18_SUBMIT_CMD) ./bench.sh $< $(SC18_DIR_SCRATCH)/$<.csv
-	mv $(SC18_DIR_SCRATCH)/$<.csv .
+	$(SC19_SUBMIT_CMD) ./bench.sh $< $(SC19_DIR_SCRATCH)/$<.csv
+	mv $(SC19_DIR_SCRATCH)/$<.csv .
 bench_task3: bench_task3_1 bench_task3_2
 bench_task4_1: poisson2d.sflop.bin
-	$(SC18_SUBMIT_CMD) ./bench.sh $< $(SC18_DIR_SCRATCH)/$<.csv
-	mv $(SC18_DIR_SCRATCH)/$<.csv .
+	$(SC19_SUBMIT_CMD) ./bench.sh $< $(SC19_DIR_SCRATCH)/$<.csv
+	mv $(SC19_DIR_SCRATCH)/$<.csv .
 bench_task4_2: poisson2d.vflop.bin
-	$(SC18_SUBMIT_CMD) ./bench.sh $< $(SC18_DIR_SCRATCH)/$<.csv
-	mv $(SC18_DIR_SCRATCH)/$<.csv .
+	$(SC19_SUBMIT_CMD) ./bench.sh $< $(SC19_DIR_SCRATCH)/$<.csv
+	mv $(SC19_DIR_SCRATCH)/$<.csv .
 bench_task4: bench_task4_1 bench_task4_2
 
 clean_scratch_csv:
-	${RM} $(SC18_DIR_SCRATCH)/*.csv
+	${RM} $(SC19_DIR_SCRATCH)/*.csv
 clean_csv: clean_scratch_csv
 	${RM} *.csv
 
@@ -82,32 +82,25 @@ graph_task2c: plot-task2c.pdf
 graph_task4: plot-task4.pdf
 graph_task4-2: plot-task4-2.pdf
 plot-task1.pdf: poisson2d.ins_cyc.bin.csv
-	@test "$$SC18_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task1()"
 	@test -n "$$DISPLAY" || "No X forwarding found. Either reconnect with X forwarding (-X / -Y) or download $@ with scp."
 	display $@
 plot-task2a.pdf: poisson2d.ld_st.bin.csv
-	@test "$$SC18_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task2a()"
 	display $@
 plot-task2b.pdf: poisson2d.vld.bin.csv poisson2d.vst.bin.csv
-	@test "$$SC18_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task2b()"
 	display $@
 plot-task2b-2.pdf: poisson2d.vld.bin.csv poisson2d.vst.bin.csv poisson2d.ld_st.bin.csv
-	@test "$$SC18_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task2b(bytes=True)"
 	display $@
 plot-task2c.pdf: poisson2d.vld.bin.csv poisson2d.vst.bin.csv poisson2d.ld_st.bin.csv poisson2d.ins_cyc.bin.csv
-	@test "$$SC18_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task2c()"
 	display $@
 plot-task4.pdf: poisson2d.sflop.bin.csv poisson2d.vflop.bin.csv
-	@test "$$SC18_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task4()"
 	display $@
 plot-task4-2.pdf: poisson2d.sflop.bin.csv poisson2d.vflop.bin.csv
-	@test "$$SC18_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task4(ai=True)"
 	display $@
 
diff --git a/2-Performance_Counters/Handson/Solutions/common.py b/2-Performance_Counters/Handson/Solutions/common.py
index 1891a0341f369f7564b4a29b3f4a60e314f4bc9b..9033865e014fce9ece4137cdb11a42884acceae4 100644
--- a/2-Performance_Counters/Handson/Solutions/common.py
+++ b/2-Performance_Counters/Handson/Solutions/common.py
@@ -1,2 +1,22 @@
 def normalize(df, old_column, new_column):
 	df[new_column] = df[old_column] / (df["ny"] * df["nx"])
+    
+def print_and_return_fit(list_of_quantities, dataframe, function, format_value=">7.4f", format_uncertainty="f", _print=True):
+    """Use `curve_fit` to fit each quantity in `list_of_quantities` with respect to `dataframe.index`. Optionally print (controlled by `_print`) and return the fit parameters and covariances."""
+    import numpy as np
+    from scipy.optimize import curve_fit 
+    _fit_parameters = {}
+    _fit_covariance = {}
+    _quantity_padding = np.max([len(_str) for _str in list_of_quantities])
+    for quantity in list_of_quantities:
+        _fit_parameters[quantity], _fit_covariance[quantity] = curve_fit(function, dataframe.index, dataframe[quantity])
+        if (_print):
+            print("Counter {:>{_quantity_padding}} is proportional to the grid points (nx*ny) by a factor of {:{format_value}} (± {:{format_uncertainty}})".format(
+                quantity, 
+                _fit_parameters[quantity][0], 
+                np.sqrt(np.diag(_fit_covariance[quantity]))[0],
+                _quantity_padding=_quantity_padding,
+                format_value=format_value,
+                format_uncertainty=format_uncertainty
+        ))
+    return (_fit_parameters, _fit_covariance)
\ No newline at end of file
diff --git a/2-Performance_Counters/Handson/Tasks/Makefile b/2-Performance_Counters/Handson/Tasks/Makefile
index e725a4125a9fcdf5c7adc4b7a1d989fea2c8ee67..1db4b2f76ed5e40ed11f543e3d3837e46fa33080 100644
--- a/2-Performance_Counters/Handson/Tasks/Makefile
+++ b/2-Performance_Counters/Handson/Tasks/Makefile
@@ -34,42 +34,42 @@ clean:
 	${RM} -f *.bin
 
 run_task1: poisson2d.ins_cyc.bin
-	$(SC18_SUBMIT_CMD) ./$< 200 1024
+	$(SC19_SUBMIT_CMD) ./$< 200 1024
 run_task2: poisson2d.ld_st.bin
-	$(SC18_SUBMIT_CMD) ./$< 200 1024
+	$(SC19_SUBMIT_CMD) ./$< 200 1024
 run_task3_1: poisson2d.vld.bin
-	$(SC18_SUBMIT_CMD) ./$< 200 1024
+	$(SC19_SUBMIT_CMD) ./$< 200 1024
 run_task3_2: poisson2d.vst.bin
-	$(SC18_SUBMIT_CMD) ./$< 200 1024
+	$(SC19_SUBMIT_CMD) ./$< 200 1024
 run_task3: run_task3_1 run_task3_2
 run_task4_1: poisson2d.sflop.bin
-	$(SC18_SUBMIT_CMD) ./$< 200 1024
+	$(SC19_SUBMIT_CMD) ./$< 200 1024
 run_task4_2: poisson2d.vflop.bin
-	$(SC18_SUBMIT_CMD) ./$< 200 1024
+	$(SC19_SUBMIT_CMD) ./$< 200 1024
 run_task4: run_task4_1 run_task4_2
 bench_task1: poisson2d.ins_cyc.bin
-	$(SC18_SUBMIT_CMD) ./bench.sh $< $(SC18_DIR_SCRATCH)/$<.csv
-	mv $(SC18_DIR_SCRATCH)/$<.csv .
+	$(SC19_SUBMIT_CMD) ./bench.sh $< $(SC19_DIR_SCRATCH)/$<.csv
+	mv $(SC19_DIR_SCRATCH)/$<.csv .
 bench_task2: poisson2d.ld_st.bin
-	$(SC18_SUBMIT_CMD) ./bench.sh $< $(SC18_DIR_SCRATCH)/$<.csv
-	mv $(SC18_DIR_SCRATCH)/$<.csv .
+	$(SC19_SUBMIT_CMD) ./bench.sh $< $(SC19_DIR_SCRATCH)/$<.csv
+	mv $(SC19_DIR_SCRATCH)/$<.csv .
 bench_task3_1: poisson2d.vld.bin
-	$(SC18_SUBMIT_CMD) ./bench.sh $< $(SC18_DIR_SCRATCH)/$<.csv
-	mv $(SC18_DIR_SCRATCH)/$<.csv .
+	$(SC19_SUBMIT_CMD) ./bench.sh $< $(SC19_DIR_SCRATCH)/$<.csv
+	mv $(SC19_DIR_SCRATCH)/$<.csv .
 bench_task3_2: poisson2d.vst.bin
-	$(SC18_SUBMIT_CMD) ./bench.sh $< $(SC18_DIR_SCRATCH)/$<.csv
-	mv $(SC18_DIR_SCRATCH)/$<.csv .
+	$(SC19_SUBMIT_CMD) ./bench.sh $< $(SC19_DIR_SCRATCH)/$<.csv
+	mv $(SC19_DIR_SCRATCH)/$<.csv .
 bench_task3: bench_task3_1 bench_task3_2
 bench_task4_1: poisson2d.sflop.bin
-	$(SC18_SUBMIT_CMD) ./bench.sh $< $(SC18_DIR_SCRATCH)/$<.csv
-	mv $(SC18_DIR_SCRATCH)/$<.csv .
+	$(SC19_SUBMIT_CMD) ./bench.sh $< $(SC19_DIR_SCRATCH)/$<.csv
+	mv $(SC19_DIR_SCRATCH)/$<.csv .
 bench_task4_2: poisson2d.vflop.bin
-	$(SC18_SUBMIT_CMD) ./bench.sh $< $(SC18_DIR_SCRATCH)/$<.csv
-	mv $(SC18_DIR_SCRATCH)/$<.csv .
+	$(SC19_SUBMIT_CMD) ./bench.sh $< $(SC19_DIR_SCRATCH)/$<.csv
+	mv $(SC19_DIR_SCRATCH)/$<.csv .
 bench_task4: bench_task4_1 bench_task4_2
 
 clean_scratch_csv:
-	${RM} $(SC18_DIR_SCRATCH)/*.csv
+	${RM} $(SC19_DIR_SCRATCH)/*.csv
 clean_csv: clean_scratch_csv
 	${RM} *.csv
 
@@ -82,32 +82,25 @@ graph_task2c: plot-task2c.pdf
 graph_task4: plot-task4.pdf
 graph_task4-2: plot-task4-2.pdf
 plot-task1.pdf: poisson2d.ins_cyc.bin.csv
-	@test "$$SC18_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task1()"
 	@test -n "$$DISPLAY" || "No X forwarding found. Either reconnect with X forwarding (-X / -Y) or download $@ with scp."
 	display $@
 plot-task2a.pdf: poisson2d.ld_st.bin.csv
-	@test "$$SC18_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task2a()"
 	display $@
 plot-task2b.pdf: poisson2d.vld.bin.csv poisson2d.vst.bin.csv
-	@test "$$SC18_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task2b()"
 	display $@
 plot-task2b-2.pdf: poisson2d.vld.bin.csv poisson2d.vst.bin.csv poisson2d.ld_st.bin.csv
-	@test "$$SC18_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task2b(bytes=True)"
 	display $@
 plot-task2c.pdf: poisson2d.vld.bin.csv poisson2d.vst.bin.csv poisson2d.ld_st.bin.csv poisson2d.ins_cyc.bin.csv
-	@test "$$SC18_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task2c()"
 	display $@
 plot-task4.pdf: poisson2d.sflop.bin.csv poisson2d.vflop.bin.csv
-	@test "$$SC18_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task4()"
 	display $@
 plot-task4-2.pdf: poisson2d.sflop.bin.csv poisson2d.vflop.bin.csv
-	@test "$$SC18_MODULE_ACTIVE_1" -eq "1" || "Please load the module of this task, sc18/handson1."
 	python3 -c "import graphing; graphing.task4(ai=True)"
 	display $@
 
diff --git a/2-Performance_Counters/Handson/Tasks/common.py b/2-Performance_Counters/Handson/Tasks/common.py
index 1891a0341f369f7564b4a29b3f4a60e314f4bc9b..9033865e014fce9ece4137cdb11a42884acceae4 100644
--- a/2-Performance_Counters/Handson/Tasks/common.py
+++ b/2-Performance_Counters/Handson/Tasks/common.py
@@ -1,2 +1,22 @@
 def normalize(df, old_column, new_column):
 	df[new_column] = df[old_column] / (df["ny"] * df["nx"])
+    
+def print_and_return_fit(list_of_quantities, dataframe, function, format_value=">7.4f", format_uncertainty="f", _print=True):
+    """Use `curve_fit` to fit each quantity in `list_of_quantities` with respect to `dataframe.index`. Optionally print (controlled by `_print`) and return the fit parameters and covariances."""
+    import numpy as np
+    from scipy.optimize import curve_fit 
+    _fit_parameters = {}
+    _fit_covariance = {}
+    _quantity_padding = np.max([len(_str) for _str in list_of_quantities])
+    for quantity in list_of_quantities:
+        _fit_parameters[quantity], _fit_covariance[quantity] = curve_fit(function, dataframe.index, dataframe[quantity])
+        if (_print):
+            print("Counter {:>{_quantity_padding}} is proportional to the grid points (nx*ny) by a factor of {:{format_value}} (± {:{format_uncertainty}})".format(
+                quantity, 
+                _fit_parameters[quantity][0], 
+                np.sqrt(np.diag(_fit_covariance[quantity]))[0],
+                _quantity_padding=_quantity_padding,
+                format_value=format_value,
+                format_uncertainty=format_uncertainty
+        ))
+    return (_fit_parameters, _fit_covariance)
\ No newline at end of file