diff --git a/tools/scripts/pytorch-all-reduce/README.md b/tools/scripts/pytorch-all-reduce/README.md
new file mode 100644
index 0000000000..33d1b40316
--- /dev/null
+++ b/tools/scripts/pytorch-all-reduce/README.md
@@ -0,0 +1,32 @@
+Small benchmark utility for gpt-fast's all-reduce calls.
+
+### How to run
+Out-of-the-box run (tries various sequence lengths and prints perf results to the terminal):
+```
+torchrun --nproc_per_node=8 all_reduce.py
+```
+
+To enable the intra-node all-reduce algorithms, use:
+```
+ENABLE_INTRA_NODE_COMM=1 torchrun --nproc_per_node=8 all_reduce.py
+```
+
+### Rocprof trace script
+To create Perfetto traces for each rank of each all-reduce, a bash script is provided:
+```
+ENABLE_INTRA_NODE_COMM=1 bash trace_runs.sh
+```
+
+### Additional options:
+The tensor size depends on the sequence length and dim used in gpt-fast. There are 4 different all-reduce calls in gpt-fast at runtime:
+- 1: [seq_len, dim]
+- 2: [seq_len, 2, dim]
+- 3: [1, dim]
+- 4: [1, 2, dim]
+```
+--sequence_lengths (defaults to [50, 64, 128, 256, 512, 1024, 2048, 4096])
+--dim (defaults to 6144)
+--all_reduce (defaults to [1, 2, 3, 4]) - Can be restricted to one or more of the 4 all-reduces listed above
+--tracing - Enables tracing mode, which skips the CPU timers and the results table
+```
+
diff --git a/tools/scripts/pytorch-all-reduce/all_reduce.py b/tools/scripts/pytorch-all-reduce/all_reduce.py
new file mode 100644
index 0000000000..9477a9660d
--- /dev/null
+++ b/tools/scripts/pytorch-all-reduce/all_reduce.py
@@ -0,0 +1,155 @@
+"""Benchmark the all-reduce tensor shapes that gpt-fast issues at runtime.
+
+Launch with torchrun, which exports the RANK and WORLD_SIZE environment
+variables that main() reads.
+"""
+import argparse
+import os
+import statistics
+import time
+
+import torch
+import torch.distributed as dist
+
+
+def init_process(rank, size, fn, backend='nccl'):
+    """Initialise the process group, run fn(rank, size) and return its result.
+
+    MASTER_ADDR and MASTER_PORT are only filled in when the launcher has not
+    already exported them, so a rendezvous endpoint chosen by torchrun is not
+    overwritten. The process group is destroyed again even if fn raises.
+    """
+    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
+    os.environ.setdefault('MASTER_PORT', '29500')
+    os.environ['RANK'] = str(rank)
+    os.environ['WORLD_SIZE'] = str(size)
+
+    dist.init_process_group(backend, rank=rank, world_size=size)
+    try:
+        return fn(rank, size)
+    finally:
+        dist.destroy_process_group()
+
+
+def get_algo_type(data_size):
+    """Return the all-reduce algorithm label for a payload of data_size bytes.
+
+    NOTE(review): the label depends on the size alone; this function does not
+    check whether intra-node communication is actually enabled for the run.
+    """
+    if data_size <= 256 * 1024:
+        return "one-shot allreduce"
+    elif data_size <= 10 * 1024 * 1024:
+        return "two-shot allreduce"
+    else:
+        return "nccl"
+
+
+def benchmark_all_reduce(rank, size, sequence_lengths, dim, all_reduce_algos, tracing):
+    """Time dist.all_reduce for every requested sequence length and shape.
+
+    Args:
+        rank: Global rank of this process.
+        size: World size (unused here; kept for the fn(rank, size) contract).
+        sequence_lengths: Sequence lengths to benchmark.
+        dim: Size of the last tensor dimension.
+        all_reduce_algos: Keys (1-4) selecting which tensor shapes to run.
+        tracing: When true, skip the CPU timers and result collection.
+
+    Returns:
+        On rank 0 with tracing disabled, a list of rows
+        [name, seq_len, size_mb, algo_type, mean, median, max, min, std] with
+        times in microseconds; otherwise None.
+    """
+    # torchrun exports LOCAL_RANK; fall back to the global rank for launchers
+    # that do not, which matches the device index on a single node.
+    torch.cuda.set_device(int(os.environ.get('LOCAL_RANK', rank)))
+
+    n_warmup = 5
+    n_runs = 1000
+
+    results = []
+
+    # All-reduce sizes for gpt fast
+    algo_shapes = {
+        1: (lambda seq_len: (seq_len, dim)),
+        2: (lambda seq_len: (seq_len, 2, dim)),
+        3: (lambda _: (1, dim)),
+        4: (lambda _: (1, 2, dim)),
+    }
+
+    for seq_len in sequence_lengths:
+        for algo in all_reduce_algos:
+            shape = algo_shapes[algo](seq_len)
+            main_times = []
+            tensor = torch.randn(*shape, device='cuda').to(torch.bfloat16)
+
+            # Warm-up - before result collection
+            for _ in range(n_warmup):
+                dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
+                dist.barrier()
+
+            # Benchmark - result collection and timers disabled if --tracing applied
+            for _ in range(n_runs):
+                if not tracing:
+                    # perf_counter is monotonic, unlike the wall clock from time.time()
+                    start = time.perf_counter()
+
+                dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
+                dist.barrier()
+
+                if not tracing:
+                    end = time.perf_counter()
+                    main_times.append((end - start) * 1e6)  # Convert to microseconds
+
+            if rank == 0 and not tracing:
+                # Payload size from the tensor itself rather than a hard-coded element width
+                data_size_bytes = tensor.numel() * tensor.element_size()
+                data_size_mb = data_size_bytes / (1024 ** 2)
+                results.append([
+                    f"all_reduce_{algo}",
+                    seq_len,
+                    data_size_mb,
+                    get_algo_type(data_size_bytes),
+                    statistics.mean(main_times),
+                    statistics.median(main_times),
+                    max(main_times),
+                    min(main_times),
+                    statistics.stdev(main_times),
+                ])
+
+    return results if rank == 0 and not tracing else None
+
+
+def main():
+    """Parse the command line, run the benchmark and print CSV results on rank 0."""
+    parser = argparse.ArgumentParser(description='PyTorch All-Reduce Benchmark')
+    parser.add_argument('--sequence_lengths', type=int, nargs='+', default=[50, 64, 128, 256, 512, 1024, 2048, 4096],
+                        help='List of sequence lengths to benchmark')
+    parser.add_argument('--dim', type=int, default=6144, help='Dimension for tensor shapes')
+    parser.add_argument('--all_reduce', type=int, nargs='+', choices=[1, 2, 3, 4], default=[1, 2, 3, 4],
+                        help='List of all-reduce algorithms to run (1, 2, 3, 4)')
+    parser.add_argument('--tracing', action='store_true', help='Enable tracing mode (skip CPU timers and output table)')
+    args = parser.parse_args()
+
+    try:
+        size = int(os.environ['WORLD_SIZE'])  # number of processes (GPUs)
+        rank = int(os.environ['RANK'])
+    except KeyError as err:
+        parser.error(f"environment variable {err} is not set; launch this script with torchrun")
+
+    def run(rank, size):
+        return benchmark_all_reduce(rank, size, args.sequence_lengths, args.dim, args.all_reduce, args.tracing)
+
+    results = init_process(rank, size, fn=run, backend='nccl')
+
+    if rank == 0 and not args.tracing:
+        header = ["algo", "sequence_length", "data_size (MB)", "algo type", "mean (us)", "median (us)", "max (us)", "min (us)", "std (us)"]
+        print(",".join(header))
+        for result in results:
+            formatted_result = [f"{item:.2f}" if isinstance(item, float) else str(item) for item in result]
+            print(",".join(formatted_result))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/scripts/pytorch-all-reduce/trace_runs.sh b/tools/scripts/pytorch-all-reduce/trace_runs.sh
new file mode 100644
index 0000000000..fe54b5669d
--- /dev/null
+++ b/tools/scripts/pytorch-all-reduce/trace_runs.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Collect one rocprofv3 trace per (sequence length, all-reduce shape) pair.
+
+SEQUENCE_LENGTHS=(50 128 256 512 1024 2048 4096)
+ALL_REDUCE_ALGOS=(1 2 3 4)
+
+# Exported so the variable reaches rocprofv3 and the torchrun workers; a plain
+# assignment would stay local to this shell.
+export HIP_DEV_FORCE_KERNARG=1
+
+for SEQ_LEN in "${SEQUENCE_LENGTHS[@]}"; do
+    for ALGO in "${ALL_REDUCE_ALGOS[@]}"; do
+        echo "Running sequence length $SEQ_LEN with intra-node all_reduce $ALGO"
+        ENABLE_INTRA_NODE_COMM=1 rocprofv3 --hip-trace --kernel-trace --stats --output-format PFTRACE \
+            -d "rocprof_trace/intranode_input${SEQ_LEN}_allreduce${ALGO}/" \
+            -- torchrun --nproc_per_node=8 all_reduce.py --sequence_lengths "$SEQ_LEN" --all_reduce "$ALGO" --tracing
+    done
+done