diff --git a/tutorials/day3/tutorial1/Tutorial1.ipynb b/tutorials/day3/tutorial1/Tutorial1.ipynb index 09b35a2de8dc971da270f6f1c3b96b743f4d04b1..487df51817f4d857038a084a0508e852517a3ee5 100644 --- a/tutorials/day3/tutorial1/Tutorial1.ipynb +++ b/tutorials/day3/tutorial1/Tutorial1.ipynb @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -91,20 +91,25 @@ " # - $T(K)$ is the throughput for $K$ GPUs\n", " # - $T(1)$ the reference throughput (for $K=1$)\n", " df['efficiency'] = df[\"TOTAL_IMAGES_PER_SEC\"] / ( (df[\"GPUs\"]/refrence_nb_gpus) * reference_throughput)\n", - " fig = plt.figure()\n", - " plt.plot(df.GPUs, df.Speedup, 'go--', label='speedup', color='blue')\n", - " plt.plot(df.GPUs, df.GPUs / refrence_nb_gpus, 'go-', label='ideal', color='orange')\n", + " \n", + " # plot\n", + " fig, ax1 = plt.subplots() \n", + " ax1.plot(df.GPUs, df.Speedup, 'go--', label='speedup', color='blue')\n", + " ax1.set_xlabel('GPUs')\n", + " ax1.set_ylabel('Speedup',color='blue')\n", + " ax1.set_xticks(df.GPUs)\n", + " ax1.plot(df.GPUs, df.GPUs / refrence_nb_gpus, 'go-', label='ideal speedup', color='orange')\n", + " ax1.tick_params(axis='y', labelcolor='blue')\n", " scale = df.Speedup.max() * 1.5\n", " top = 0\n", " text_spacing = 0.1\n", - " y = df['efficiency'] * scale + top\n", - " plt.plot(df.GPUs, y, c='red', marker=\"o\", label='efficiency')\n", + " y = df['efficiency'] * 100#* scale + top\n", + " ax2 = ax1.twinx() \n", + " ax2.set_ylabel('Efficiency (%)', color='red')\n", + " ax2.plot(df.GPUs, y, c='red', marker=\"o\", label='efficiency')\n", + " ax2.tick_params(axis='y', labelcolor='red')\n", " for nb_gpus, effval, yval in zip(df.GPUs, df['efficiency'], y):\n", - " plt.text(nb_gpus, yval+text_spacing, f\"{effval*100:.2f}%\", size=11, c='red')\n", - " plt.xlabel('GPUs')\n", - " plt.ylabel('Speedup')\n", - " plt.xticks(df.GPUs)\n", - " plt.legend()\n", + " ax2.text(nb_gpus, yval+text_spacing, f\"{effval*100:.2f}%\", size=11, c='red')\n", "\n", "def plot_images_per_sec(df):\n", " \"\"\"\n", @@ -119,8 +124,8 @@ " df = df.copy()\n", " df[\"GPUs\"] = df[\"NODES\"] * df[\"GPUS_PER_NODE\"]\n", " df = df.sort_values(by=\"GPUs\")\n", - " df.plot(x=\"BATCH_SIZE\", y=\"TOTAL_IMAGES_PER_SEC\", color='pink', edgecolor='red', kind='bar')\n", - " ticks = [f\"({int(row['BATCH_SIZE'])})x({int(row['NODES'])})x({int(row['GPUS_PER_NODE'])})\" for _, row in df.iterrows()]\n", + " df.plot(x=\"LOCAL_BATCH_SIZE\", y=\"TOTAL_IMAGES_PER_SEC\", color='pink', edgecolor='red', kind='bar')\n", + " ticks = [f\"({int(row['LOCAL_BATCH_SIZE'])})x({int(row['NODES'])})x({int(row['GPUS_PER_NODE'])})\" for _, row in df.iterrows()]\n", " plt.xticks(range(len(df)), ticks)\n", " plt.xlabel(\"B_ref x Nodes x GPUs per node\")" ] @@ -275,9 +280,9 @@ "metadata": {}, "outputs": [], "source": [ - "plot_scaling_and_efficiency(df[df.BATCH_SIZE==128])\n", - "plot_scaling_and_efficiency(df[df.BATCH_SIZE==512])\n", - "plot_scaling_and_efficiency(df[df.BATCH_SIZE==2048])" + "plot_scaling_and_efficiency(df[df.LOCAL_BATCH_SIZE==128])\n", + "plot_scaling_and_efficiency(df[df.LOCAL_BATCH_SIZE==512])\n", + "plot_scaling_and_efficiency(df[df.LOCAL_BATCH_SIZE==2048])" ] }, { diff --git a/tutorials/day3/tutorial1/benchmark.sh b/tutorials/day3/tutorial1/benchmark.sh index 0a3c2451c6778b2926aed88aef7a089225b77dc1..ae8c0ac5840da988506b2c71a3b76cc9d0e5f416 100755 --- a/tutorials/day3/tutorial1/benchmark.sh +++ b/tutorials/day3/tutorial1/benchmark.sh @@ -26,7 +26,7 @@ OUTPUT=$2 # This is necessary to ignore the header when parsing the CSV config_contents="$(sed -e '$a\' $CONFIG)" -echo "NODES,BATCH_SIZE,GPUS_PER_NODE,TOTAL_IMAGES_PER_SEC">$OUTPUT +echo "NODES,LOCAL_BATCH_SIZE,GPUS_PER_NODE,TOTAL_IMAGES_PER_SEC">$OUTPUT # try different configuration of number of nodes and batch size using $CONFIG # and write the result to $OUTPUT @@ -35,16 +35,17 @@ nb_lines=$((nb_lines-1)) # to ignore header for line in $(echo "$config_contents"|tail -n $nb_lines);do NODES=$(echo $line|cut -d, -f1) GPUS_PER_NODE=$(echo $line|cut -d, -f2) - BATCH_SIZE=$(echo $line|cut -d, -f3) - RUN_CONFIG="NODES_${NODES}_BATCH_SIZE_${BATCH_SIZE}_GPUS_PER_NODE_${GPUS_PER_NODE}" + LOCAL_BATCH_SIZE=$(echo $line|cut -d, -f3) + RUN_CONFIG="NODES_${NODES}_LOCAL_BATCH_SIZE_${LOCAL_BATCH_SIZE}_GPUS_PER_NODE_${GPUS_PER_NODE}" echo "Running configuration: $RUN_CONFIG" # for each configuration, the standard output will be written in $STDOUT STDOUT="${RUN_CONFIG}.out" # do the run for the current configuration - srun --output=$STDOUT --error=$STDOUT -N $NODES -n $((GPUS_PER_NODE*NODES)) --ntasks-per-node=$GPUS_PER_NODE --account=$ACCOUNT --cpus-per-task=$CPUS_PER_TASK --time=$TIME --gres=gpu:$GPUS_PER_NODE --partition=$PARTITION --cpu-bind=none,v --accel-bind=gn python -u train.py --batch_size=$BATCH_SIZE + srun --output=$STDOUT --error=$STDOUT -N $NODES -n $((GPUS_PER_NODE*NODES)) --ntasks-per-node=$GPUS_PER_NODE --account=$ACCOUNT --cpus-per-task=$CPUS_PER_TASK --time=$TIME --gres=gpu:$GPUS_PER_NODE --partition=$PARTITION --cpu-bind=none,v --accel-bind=gn python -u train.py --batch_size=$LOCAL_BATCH_SIZE # extract the throughput from the standard output file and add it to `results.csv` total_images_per_sec=$(cat $STDOUT|grep "total images/sec:"|cut -d ":" -f 2) echo "Total images per sec: $total_images_per_sec" - echo "$NODES,$BATCH_SIZE,$GPUS_PER_NODE,$total_images_per_sec">>$OUTPUT + echo "$NODES,$LOCAL_BATCH_SIZE,$GPUS_PER_NODE,$total_images_per_sec">>$OUTPUT done +echo "Total time spent: $SECONDS secs" diff --git a/tutorials/day3/tutorial1/config_example.csv b/tutorials/day3/tutorial1/config_example.csv index 0d4bda5ad236a090d2ce60c788217fa3852af327..0efe7e9c7c616c98320aa8d3bf72c03655c1132d 100644 --- a/tutorials/day3/tutorial1/config_example.csv +++ b/tutorials/day3/tutorial1/config_example.csv @@ -1,4 +1,4 @@ -NODES,GPUS_PER_NODE,BATCH_SIZE +NODES,GPUS_PER_NODE,LOCAL_BATCH_SIZE 1,2,512 2,4,256 2,1,256 diff --git a/tutorials/day3/tutorial1/config_task1.2.csv b/tutorials/day3/tutorial1/config_task1.2.csv index 724d84bc5f27ebed4bbc67531735ad5d7a93cfad..3b29abb3430152cde02a2681c115f335cb345fc1 100644 --- a/tutorials/day3/tutorial1/config_task1.2.csv +++ b/tutorials/day3/tutorial1/config_task1.2.csv @@ -1 +1 @@ -NODES,GPUS_PER_NODE,BATCH_SIZE +NODES,GPUS_PER_NODE,LOCAL_BATCH_SIZE diff --git a/tutorials/day3/tutorial1/config_task2.1.csv b/tutorials/day3/tutorial1/config_task2.1.csv index 724d84bc5f27ebed4bbc67531735ad5d7a93cfad..3b29abb3430152cde02a2681c115f335cb345fc1 100644 --- a/tutorials/day3/tutorial1/config_task2.1.csv +++ b/tutorials/day3/tutorial1/config_task2.1.csv @@ -1 +1 @@ -NODES,GPUS_PER_NODE,BATCH_SIZE +NODES,GPUS_PER_NODE,LOCAL_BATCH_SIZE diff --git a/tutorials/day3/tutorial1/config_task2.2.csv b/tutorials/day3/tutorial1/config_task2.2.csv index 724d84bc5f27ebed4bbc67531735ad5d7a93cfad..3b29abb3430152cde02a2681c115f335cb345fc1 100644 --- a/tutorials/day3/tutorial1/config_task2.2.csv +++ b/tutorials/day3/tutorial1/config_task2.2.csv @@ -1 +1 @@ -NODES,GPUS_PER_NODE,BATCH_SIZE +NODES,GPUS_PER_NODE,LOCAL_BATCH_SIZE diff --git a/tutorials/day3/tutorial1/solutions/Tutorial1_Test.ipynb b/tutorials/day3/tutorial1/solutions/Tutorial1_Test.ipynb index f0e1344b58eebc1b250e5ce72dcebe821258c90a..e5b98a9d85cad3c36a5e9a05c43cbf9a25946539 100644 --- a/tutorials/day3/tutorial1/solutions/Tutorial1_Test.ipynb +++ b/tutorials/day3/tutorial1/solutions/Tutorial1_Test.ipynb @@ -119,8 +119,8 @@ " df = df.copy()\n", " df[\"GPUs\"] = df[\"NODES\"] * df[\"GPUS_PER_NODE\"]\n", " df = df.sort_values(by=\"GPUs\")\n", - " df.plot(x=\"BATCH_SIZE\", y=\"TOTAL_IMAGES_PER_SEC\", color='pink', edgecolor='red', kind='bar')\n", - " ticks = [f\"({int(row['BATCH_SIZE'])})x({int(row['NODES'])})x({int(row['GPUS_PER_NODE'])})\" for _, row in df.iterrows()]\n", + " df.plot(x=\"LOCAL_BATCH_SIZE\", y=\"TOTAL_IMAGES_PER_SEC\", color='pink', edgecolor='red', kind='bar')\n", + " ticks = [f\"({int(row['LOCAL_BATCH_SIZE'])})x({int(row['NODES'])})x({int(row['GPUS_PER_NODE'])})\" for _, row in df.iterrows()]\n", " plt.xticks(range(len(df)), ticks)\n", " plt.xlabel(\"B_ref x Nodes x GPUs per node\")" ] @@ -471,9 +471,9 @@ "metadata": {}, "outputs": [], "source": [ - "plot_scaling_and_efficiency(df[df.BATCH_SIZE==128])\n", - "plot_scaling_and_efficiency(df[df.BATCH_SIZE==512])\n", - "plot_scaling_and_efficiency(df[df.BATCH_SIZE==2048])" + "plot_scaling_and_efficiency(df[df.LOCAL_BATCH_SIZE==128])\n", + "plot_scaling_and_efficiency(df[df.LOCAL_BATCH_SIZE==512])\n", + "plot_scaling_and_efficiency(df[df.LOCAL_BATCH_SIZE==2048])" ] }, { diff --git a/tutorials/day3/tutorial1/solutions/config_task1.2.csv b/tutorials/day3/tutorial1/solutions/config_task1.2.csv index 94fb61822cd22cc2b63893cd9592b429123eba31..3918e54887a026f6e7e2cec2dbf5e5d91e62c5e3 100644 --- a/tutorials/day3/tutorial1/solutions/config_task1.2.csv +++ b/tutorials/day3/tutorial1/solutions/config_task1.2.csv @@ -1,4 +1,4 @@ -NODES,GPUS_PER_NODE,BATCH_SIZE +NODES,GPUS_PER_NODE,LOCAL_BATCH_SIZE 1,1,1024 1,2,1024 1,4,1024 diff --git a/tutorials/day3/tutorial1/solutions/config_task2.1.csv b/tutorials/day3/tutorial1/solutions/config_task2.1.csv index d12c09e6fa7c6d1725397f005f45672f40c23d78..92bd42a4d8debd00a08e4da536655959ea9433c9 100644 --- a/tutorials/day3/tutorial1/solutions/config_task2.1.csv +++ b/tutorials/day3/tutorial1/solutions/config_task2.1.csv @@ -1,4 +1,4 @@ -NODES,GPUS_PER_NODE,BATCH_SIZE +NODES,GPUS_PER_NODE,LOCAL_BATCH_SIZE 1,2,2048 1,4,1024 2,1,2048 diff --git a/tutorials/day3/tutorial1/solutions/config_task2.2.csv b/tutorials/day3/tutorial1/solutions/config_task2.2.csv index 7e1c01d2d824c4989564b3262f2bbdf7ad8ae8c9..ea186b6b033c4d3fc08499e3239ef5e66366dc7e 100644 --- a/tutorials/day3/tutorial1/solutions/config_task2.2.csv +++ b/tutorials/day3/tutorial1/solutions/config_task2.2.csv @@ -1,4 +1,4 @@ -NODES,GPUS_PER_NODE,BATCH_SIZE +NODES,GPUS_PER_NODE,LOCAL_BATCH_SIZE 1,1,128 2,4,128 1,1,512 diff --git a/tutorials/day3/tutorial1/solutions/results_task2.1-Test.csv b/tutorials/day3/tutorial1/solutions/results_task2.1-Test.csv index 277b5ff8e579332f75862b0eb49113b59820ffba..1eebc61b760795dc8bf62bef7cf3333e376f01ba 100644 --- a/tutorials/day3/tutorial1/solutions/results_task2.1-Test.csv +++ b/tutorials/day3/tutorial1/solutions/results_task2.1-Test.csv @@ -1,4 +1,4 @@ -NODES,BATCH_SIZE,GPUS_PER_NODE,TOTAL_IMAGES_PER_SEC +NODES,LOCAL_BATCH_SIZE,GPUS_PER_NODE,TOTAL_IMAGES_PER_SEC 1,2048,2,8865.762937030366 1,1024,4,14890.190529880427 2,2048,1,8962.76324357789 diff --git a/tutorials/day3/tutorial1/solutions/train.py b/tutorials/day3/tutorial1/solutions/train.py index 52b0e97bb7876a75ddc3a228289995e026d9ee4b..30e9a6bae0c4628f179d3dcc0ba6f993f982b50f 100644 --- a/tutorials/day3/tutorial1/solutions/train.py +++ b/tutorials/day3/tutorial1/solutions/train.py @@ -116,6 +116,10 @@ batch_size = args.batch_size # Number of epochs: one epoch is going through the whole dataset epochs = args.epochs +print("Local batch size:", batch_size) +print("Number of GPUs:", hvd.size()) +print("Effective batch size:", batch_size * hvd.size()) + # Instantiate the model model = tf.keras.applications.ResNet50(classes=NUM_CLASSES, weights=None, input_shape=INPUT_SHAPE) diff --git a/tutorials/day3/tutorial1/train.py b/tutorials/day3/tutorial1/train.py index 1cf1fc394ab4166ed06a95ee0cce9c792eda3270..f920a507112e1de2b086d63873528133b1125e39 100644 --- a/tutorials/day3/tutorial1/train.py +++ b/tutorials/day3/tutorial1/train.py @@ -115,6 +115,10 @@ batch_size = args.batch_size # Number of epochs: one epoch is going through the whole dataset epochs = args.epochs +print("Local batch size:", batch_size) +print("Number of GPUs:", hvd.size()) +print("Effective batch size:", batch_size * hvd.size()) + # Instantiate the model model = tf.keras.applications.ResNet50(classes=NUM_CLASSES, weights=None, input_shape=INPUT_SHAPE)