From 1721a1aab148af87f1049fb2e729e87f6ac540e3 Mon Sep 17 00:00:00 2001
From: janEbert <janpublicebert@posteo.net>
Date: Wed, 16 Oct 2024 20:26:09 +0200
Subject: [PATCH] Properly shut down distributed process group

Explicitly destroy the process group at the end of `main()` in both the
DDP and FSDP examples. This avoids pending NCCL operations being lost.
---
 pytorch-ddp-example/main.py  | 2 ++
 pytorch-fsdp-example/main.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/pytorch-ddp-example/main.py b/pytorch-ddp-example/main.py
index c7b1b1a..bea1349 100644
--- a/pytorch-ddp-example/main.py
+++ b/pytorch-ddp-example/main.py
@@ -289,6 +289,8 @@ def main():
     print0('Final test loss:', test_loss)
     save0(model, 'model-final.pt')
 
+    torch.distributed.destroy_process_group()
+
 
 if __name__ == '__main__':
     main()
diff --git a/pytorch-fsdp-example/main.py b/pytorch-fsdp-example/main.py
index 2e15d5f..b167d18 100644
--- a/pytorch-fsdp-example/main.py
+++ b/pytorch-fsdp-example/main.py
@@ -341,6 +341,8 @@ def main():
     print0('Final test loss:', test_loss)
     save_model(model, 'model-final')
 
+    torch.distributed.destroy_process_group()
+
 
 if __name__ == '__main__':
     main()
-- 
GitLab