2021-08-12 17:56:55 +00:00
|
|
|
|
|
|
|
|
import torch.distributed as c10d
|
2021-08-12 18:39:31 +00:00
|
|
|
import torch
|
|
|
|
|
import argparse
|
|
|
|
|
import os
|
|
|
|
|
import logging
|
|
|
|
|
# Configure the root logger at module load so that every message logged by
# this script is prefixed with a timestamp, the logger name and the level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
|
2019-08-22 23:10:29 +00:00
|
|
|
|
|
|
|
|
def parse_args(argv=None):
    """Parse and validate the command-line arguments of this script.

    Args:
        argv: List of argument strings to parse. ``None`` makes argparse
            read them from ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``addr`` (str) and ``port``, ``rank``
        and ``world_size`` (all converted to ``int``).

    Raises:
        SystemExit: Raised by argparse when an argument is missing, is not
            an integer where one is required, or when ``rank`` /
            ``world_size`` are out of range.
    """
    parser = argparse.ArgumentParser(
        description='Simple script to simulate NCCL errors. The script is '
        'supposed to be run on multiple different nodes simultaneously with '
        'appropriate rank and world_size. The script run an allreduce() on '
        'the rank 0 node and aborts all the other nodes to simulate an error '
        'in NCCL')
    parser.add_argument('addr', help='address of the master node to connect to.')
    # type=int lets argparse report malformed numbers as a usage error
    # instead of the script dying later with a bare ValueError traceback.
    parser.add_argument('port', type=int, help='port of the master node to connect to.')
    parser.add_argument('rank', type=int, help='rank of this node')
    parser.add_argument('world_size', type=int, help='number of nodes in process group')
    args = parser.parse_args(argv)

    # Reject combinations that cannot describe a member of the process group
    # before any network connection is attempted.
    if args.world_size < 1:
        parser.error('world_size must be a positive integer')
    if not 0 <= args.rank < args.world_size:
        parser.error('rank must satisfy 0 <= rank < world_size')
    return args


def local_device_index(rank, device_count):
    """Map a global rank onto a CUDA device index of the local node.

    Args:
        rank: Global rank of this process in the process group.
        device_count: Number of CUDA devices visible on this node.

    Returns:
        ``rank % device_count`` when at least one device is visible, so
        ranks beyond the local GPU count wrap around to a valid index.
        When no device is visible, ``rank`` is returned unchanged so the
        subsequent CUDA call reports the error itself.
    """
    if device_count > 0:
        return rank % device_count
    return rank


def main(argv=None):
    """Run one collective allreduce, then provoke an NCCL error.

    Every rank takes part in a first allreduce. Afterwards rank 0 starts a
    second allreduce and waits on it, while every other rank aborts its
    process via ``os.abort()``.

    Args:
        argv: Optional argument list forwarded to ``parse_args``.
    """
    args = parse_args(argv)
    rank = args.rank
    world_size = args.world_size

    # Rank 0 is passed as the ``is_master`` flag of the TCP store.
    store = c10d.TCPStore(args.addr, args.port, world_size, rank == 0)
    process_group = c10d.ProcessGroupNCCL(store, rank, world_size)

    # The global rank is not necessarily a valid local GPU index when the
    # script is spread over several nodes, so fold it onto the local devices.
    device = local_device_index(rank, torch.cuda.device_count())

    logging.info('Running first allreduce')
    process_group.allreduce(torch.rand(10).cuda(device)).wait()

    if rank == 0:
        logging.info('Running second allreduce only on rank 0')
        work = process_group.allreduce(torch.rand(10).cuda(device))
        logging.info('Waiting for allreduce to complete...')
        work.wait()
        logging.info('Second allreduce successful: %s', work.is_success())
    else:
        logging.info('Aborting all other ranks.')
        os.abort()


if __name__ == "__main__":
    main()
|