diff --git a/video_prediction_tools/main_scripts/main_train_models.py b/video_prediction_tools/main_scripts/main_train_models.py index 0b1abfa45e813e23c5a8c979fc1180ae7b57acaf..06a7efc9dc7296c333888f0aeabdc9cb9fc91d98 100644 --- a/video_prediction_tools/main_scripts/main_train_models.py +++ b/video_prediction_tools/main_scripts/main_train_models.py @@ -73,6 +73,7 @@ class TrainModel(object): self.create_saver_and_writer() self.setup_gpu_config() self.calculate_samples_and_epochs() + self.save_gpus_info() print("setup done") def set_seed(self): @@ -285,9 +286,9 @@ class TrainModel(object): #print("hvd_size:",hvd.size()) #print("hvd_local_rank:",hvd.local_rank()) # also track computing node - cnode_file = os.path.join(self.output_dir, "GPU_worker{0}.json".format(str(hvd.local_rank()))) + cnode_file = os.path.join(self.output_dir, "GPU_worker{0}.json".format(str(hvd.rank()))) with open(cnode_file, "w") as fjs: - json.dump({"worker{0}".format(str(hvd.local_rank())): host}, fjs) + json.dump({"worker{0}".format(str(hvd.rank())): host}, fjs) def save_timing_to_pkl(self, training_time, time_per_iteration): """