From 1721a1aab148af87f1049fb2e729e87f6ac540e3 Mon Sep 17 00:00:00 2001
From: janEbert <janpublicebert@posteo.net>
Date: Wed, 16 Oct 2024 20:26:09 +0200
Subject: [PATCH] Properly shut down distributed process group

This avoids pending NCCL operations being lost.
---
 pytorch-ddp-example/main.py  | 2 ++
 pytorch-fsdp-example/main.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/pytorch-ddp-example/main.py b/pytorch-ddp-example/main.py
index c7b1b1a..bea1349 100644
--- a/pytorch-ddp-example/main.py
+++ b/pytorch-ddp-example/main.py
@@ -289,6 +289,8 @@ def main():
     print0('Final test loss:', test_loss)
     save0(model, 'model-final.pt')
 
+    torch.distributed.destroy_process_group()
+
 
 if __name__ == '__main__':
     main()
diff --git a/pytorch-fsdp-example/main.py b/pytorch-fsdp-example/main.py
index 2e15d5f..b167d18 100644
--- a/pytorch-fsdp-example/main.py
+++ b/pytorch-fsdp-example/main.py
@@ -341,6 +341,8 @@ def main():
     print0('Final test loss:', test_loss)
     save_model(model, 'model-final')
 
+    torch.distributed.destroy_process_group()
+
 
 if __name__ == '__main__':
     main()
-- 
GitLab