Commit 99836847 authored by Jan Ebert

Add optional HSDP support

Documentation in the README is missing for now, but will be added in the
future.
parent 3fe25198
@@ -67,6 +67,17 @@ def parse_args():
         default=0,
         help='Random number generator initialization value.',
     )
+    parser.add_argument(
+        '--num-fsdp-replicas',
+        type=int,
+        help=(
+            'How many FSDP replicas to use for hybrid sharded data '
+            'parallelism (HSDP). The model will be sharded into '
+            '`world_size / num_fsdp_replicas` partitions per replica. '
+            'Gradients will be all-reduced across the replicas. '
+            'If not given, use standard FSDP.'
+        ),
+    )
     args = parser.parse_args()
     return args
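The help text above fixes the mesh geometry: with `num_fsdp_replicas` replicas, each replica shards the model over `world_size / num_fsdp_replicas` ranks. A minimal sketch of that arithmetic, using made-up numbers that are not part of the commit:

# Illustration only: the world size and replica count below are hypothetical.
world_size = 16            # e.g. 4 nodes with 4 GPUs each
num_fsdp_replicas = 4      # passed via --num-fsdp-replicas 4

assert world_size % num_fsdp_replicas == 0
fsdp_shards_per_replica = world_size // num_fsdp_replicas  # 4

# Device mesh shape: (replicas, shards per replica) = (4, 4).
# Parameters are sharded over 4 ranks within each replica;
# gradients are all-reduced across the 4 replicas.
fsdp_mesh_dims = (num_fsdp_replicas, fsdp_shards_per_replica)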
@@ -283,14 +294,25 @@ def main():
     train_dset, valid_dset, test_dset = prepare_datasets(args, device)
     model = build_model()
-    mesh_1d = device_mesh.init_device_mesh(
-        "cuda",
-        (torch.distributed.get_world_size(),),
-    )
+    # Set up FSDP or HSDP.
+    if args.num_fsdp_replicas is None:
+        fsdp_mesh_dims = (torch.distributed.get_world_size(),)
+        sharding_strategy = fsdp.ShardingStrategy.FULL_SHARD
+    else:
+        assert (
+            torch.distributed.get_world_size() % args.num_fsdp_replicas
+            == 0
+        ), 'world size must be divisible by number of FSDP replicas'
+        fsdp_shards_per_replica = \
+            torch.distributed.get_world_size() // args.num_fsdp_replicas
+        fsdp_mesh_dims = (args.num_fsdp_replicas, fsdp_shards_per_replica)
+        sharding_strategy = fsdp.ShardingStrategy.HYBRID_SHARD
+    fsdp_mesh = device_mesh.init_device_mesh("cuda", fsdp_mesh_dims)
     model = fsdp.FullyShardedDataParallel(
         model,
         device_id=local_rank,
-        device_mesh=mesh_1d,
+        device_mesh=fsdp_mesh,
+        sharding_strategy=sharding_strategy,
         auto_wrap_policy=functools.partial(
             fsdp.wrap.size_based_auto_wrap_policy,
             # Wrap every 1B parameters.
...
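Until the README is updated, the following standalone sketch shows how the new code path could be exercised end to end. It is an assumption-laden illustration, not part of the commit: the toy `torch.nn.Linear` model, the file name `hsdp_sketch.py`, and the hard-coded replica count stand in for the repository's real model and argument parsing.

# hsdp_sketch.py -- illustrative only; model, file name, and replica count are assumed.
import os

import torch
import torch.distributed as dist
from torch.distributed import device_mesh, fsdp


def main():
    dist.init_process_group('nccl')
    local_rank = int(os.environ['LOCAL_RANK'])
    torch.cuda.set_device(local_rank)

    num_fsdp_replicas = 2  # would normally come from --num-fsdp-replicas
    world_size = dist.get_world_size()
    assert world_size % num_fsdp_replicas == 0
    fsdp_mesh = device_mesh.init_device_mesh(
        'cuda',
        (num_fsdp_replicas, world_size // num_fsdp_replicas),
    )

    model = torch.nn.Linear(1024, 1024).cuda()  # toy stand-in model
    model = fsdp.FullyShardedDataParallel(
        model,
        device_id=local_rank,
        device_mesh=fsdp_mesh,
        sharding_strategy=fsdp.ShardingStrategy.HYBRID_SHARD,
    )

    # Each replica shards the parameters over world_size / num_fsdp_replicas
    # ranks; gradients are all-reduced across the replicas after backward.
    out = model(torch.randn(8, 1024, device='cuda'))
    out.sum().backward()

    dist.destroy_process_group()


if __name__ == '__main__':
    main()

Launched with, e.g., `torchrun --nproc_per_node=4 hsdp_sketch.py` on a single 4-GPU node, this yields two replicas of two shards each.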