Skip to content
Snippets Groups Projects
Commit a6121ba2 authored by Michael Langguth
Browse files

Add save_gpus_info to set-up of model.

parent aa6d9dd6
Branches
No related merge requests found
Pipeline #58798 passed
...@@ -73,6 +73,7 @@ class TrainModel(object): ...@@ -73,6 +73,7 @@ class TrainModel(object):
self.create_saver_and_writer() self.create_saver_and_writer()
self.setup_gpu_config() self.setup_gpu_config()
self.calculate_samples_and_epochs() self.calculate_samples_and_epochs()
self.save_gpus_info()
print("setup done") print("setup done")
def set_seed(self): def set_seed(self):
...@@ -285,9 +286,9 @@ class TrainModel(object): ...@@ -285,9 +286,9 @@ class TrainModel(object):
#print("hvd_size:",hvd.size()) #print("hvd_size:",hvd.size())
#print("hvd_local_rank:",hvd.local_rank()) #print("hvd_local_rank:",hvd.local_rank())
# also track computing node # also track computing node
cnode_file = os.path.join(self.output_dir, "GPU_worker{0}.json".format(str(hvd.local_rank()))) cnode_file = os.path.join(self.output_dir, "GPU_worker{0}.json".format(str(hvd.rank())))
with open(cnode_file, "w") as fjs: with open(cnode_file, "w") as fjs:
json.dump({"worker{0}".format(str(hvd.local_rank())): host}, fjs) json.dump({"worker{0}".format(str(hvd.rank())): host}, fjs)
def save_timing_to_pkl(self, training_time, time_per_iteration): def save_timing_to_pkl(self, training_time, time_per_iteration):
""" """
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment