diff --git a/video_prediction_tools/main_scripts/main_train_models.py b/video_prediction_tools/main_scripts/main_train_models.py index 751506d4b65df80cc4489f05054cedb9c0799ba8..c37e2375514c48244f8840ba70948346d69b5c90 100644 --- a/video_prediction_tools/main_scripts/main_train_models.py +++ b/video_prediction_tools/main_scripts/main_train_models.py @@ -279,6 +279,14 @@ class TrainModel(object): """ Start session and train the model """ + + method = TrainModel.train_model.__name__ + + # print some info to stdout + if self.rank_glob == 0: + print("%{0}: Number of GPUs for training: {1:d}".format(method, self.ngpus)) + print("%{0}: Global rank: {1:d}, local rank:".format(method, self.rank_glob, self.rank_loc)) + # for initilizing model at the coorect stage (i.e. from scratch or from pre-trained model) self.global_step = tf.train.get_or_create_global_step() with tf.Session(config=self.config) as sess: print("parameter_count =", sess.run(self.parameter_count)) @@ -316,17 +324,22 @@ class TrainModel(object): TrainModel.save_results_to_pkl(train_losses,val_losses,self.output_dir) TrainModel.plot_train(train_losses,val_losses,step,self.output_dir) - #Totally train time over all the iterations - train_time = time.time() - run_start_time - results_dict = {"train_time":train_time, - "total_steps":self.total_steps} - TrainModel.save_results_to_dict(results_dict,self.output_dir) - print("train_losses:",train_losses) - print("val_losses:",val_losses) - print("Done") - print("Total training time:", train_time/60., "min") - return train_time, time_per_iteration - + # barrier to ensure that the follwoing is done at the very end + # tip following the user 'ppwwyxx' in the github-post under + # https://github.com/horovod/horovod/issues/159 from 21st November 2018 + barrier = hvd.allreduce(tf.random_normal(shape=[1])) + if self.rank_glob == 0: + # track time (save to pickle-files) + train_time = time.time() - run_start_time #Total train time over all the iterations + + avg_samples = int(1600/self.ngpus) + TrainModel.save_timing_to_pkl(self, train_time, time_per_iteration) + print("%{0}: Training loss decreased from {1:.6f} to {2:.6f}:" + .format(method, np.mean(train_losses[0:10]), np.mean(train_losses[-avg_samples:]))) + print("%{0}: Validation loss decreased from {1:.6f} to {2:.6f}:" + .format(method, np.mean(val_losses[0:10]), np.mean(val_losses[-avg_samples:]))) + print("%{0}: Training finsished".format(method)) + print("%{0}: Total training time: {1:.2f} min".format(method, train_time/60.)) def create_fetches_for_train(self): """