Merge pull request #1299 from edgargabriel/topic/remove-multirank-examples
Remove MultiRank examples
Этот коммит содержится в:
@@ -1,24 +0,0 @@
|
||||
# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
# Set to where RCCL is installed
|
||||
RCCL_INSTALL=../../build/release
|
||||
|
||||
HIP_PATH?= $(wildcard /opt/rocm)
|
||||
ifeq (,$(HIP_PATH))
|
||||
HIP_PATH=../../..
|
||||
endif
|
||||
HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
|
||||
EXE=rccl-allreduce-multirank rccl-reducescatter-multirank
|
||||
CXXFLAGS = -std=c++11 -O3 -I$(RCCL_INSTALL)/include/rccl/ -L$(RCCL_INSTALL) -lrccl
|
||||
|
||||
all: $(EXE)
|
||||
|
||||
rccl-allreduce-multirank: rccl-allreduce-multirank.cc $(shell find -regex ".*\.\hpp")
|
||||
$(HIPCC) $(CXXFLAGS) $< -o $@
|
||||
|
||||
rccl-reducescatter-multirank: rccl-reducescatter-multirank.cc $(shell find -regex ".*\.\hpp")
|
||||
$(HIPCC) $(CXXFLAGS) $< -o $@
|
||||
|
||||
clean:
|
||||
rm -f *.o $(EXE)
|
||||
@@ -1,19 +0,0 @@
|
||||
#************************************************************************
|
||||
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#***********************************************************************/
|
||||
|
||||
#!/bin/bash
|
||||
export MPI_INSTALL_PATH=
|
||||
export RCCL_INSTALL_PATH=
|
||||
export ROCM_INSTALL_PATH=
|
||||
|
||||
$MPI_INSTALL_PATH/bin/mpiCC -o rccl-allreduce-mpi-multirank rccl-allreduce-mpi-multirank.cc -I$ROCM_INSTALL_PATH/include -I$RCCL_INSTALL_PATH/include -D__HIP_PLATFORM_AMD__ -L$ROCM_INSTALL_PATH/lib -lamdhip64 -L$RCCL_INSTALL_PATH/lib -lrccl
|
||||
|
||||
$MPI_INSTALL_PATH/bin/mpiCC -o rccl-reducescatter-mpi-multirank rccl-reducescatter-mpi-multirank.cc -I$ROCM_INSTALL_PATH/include -I$RCCL_INSTALL_PATH/include -D__HIP_PLATFORM_AMD__ -L$ROCM_INSTALL_PATH/lib -lamdhip64 -L$RCCL_INSTALL_PATH/lib -lrccl
|
||||
|
||||
|
||||
export GPU_MAX_HW_QUEUES=16
|
||||
$MPI_INSTALL_PATH/bin/mpirun --mca pml ucx -np 4 ./rccl-allreduce-mpi-multirank 0 0 2
|
||||
$MPI_INSTALL_PATH/bin/mpirun --mca pml ucx -np 4 ./rccl-reducescatter-mpi-multirank 0 0 2
|
||||
@@ -1,228 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "rccl.h"
|
||||
#include "mpi.h"
|
||||
|
||||
|
||||
#define MPICHECK(cmd) do { \
|
||||
int e = cmd; \
|
||||
if( e != MPI_SUCCESS ) { \
|
||||
printf("Failed: MPI error %s:%d '%d'\n", \
|
||||
__FILE__,__LINE__, e); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define HIPCHECK(cmd) do { \
|
||||
hipError_t e = cmd; \
|
||||
if( e != hipSuccess ) { \
|
||||
printf("Failed: HIP error %s:%d '%s'\n", \
|
||||
__FILE__,__LINE__,hipGetErrorString(e)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define NCCLCHECK(cmd) do { \
|
||||
ncclResult_t r = cmd; \
|
||||
if (r!= ncclSuccess) { \
|
||||
printf("Failed, NCCL error %s:%d '%s'\n", \
|
||||
__FILE__,__LINE__,ncclGetErrorString(r)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
static void init_sendbuf (float *sendbuf, int count, int val)
|
||||
{
|
||||
for (int i = 0; i < count; i++) {
|
||||
sendbuf[i] = (float)val+1;
|
||||
}
|
||||
}
|
||||
|
||||
static void init_zero (float *recvbuf, int count)
|
||||
{
|
||||
for (int i = 0; i < count; i++) {
|
||||
recvbuf[i] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
static bool check_recvbuf (float *recvbuf, int count, int ndevices)
|
||||
{
|
||||
bool result = true;
|
||||
float expected=0.0;
|
||||
for (int i=0; i<ndevices; i++){
|
||||
expected += (float)i+1;
|
||||
}
|
||||
for (int i = 0; i < count; i++) {
|
||||
if (recvbuf[i] != expected) {
|
||||
result = false;
|
||||
printf("Element %d is %f expected %f\n", i, recvbuf[i], expected);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static void print_help()
|
||||
{
|
||||
printf("Usage: rccl-allreduce-mpi-multirank <distMode> <startDev> <numDevs> \n");
|
||||
printf(" all arguments are optional, but have to be provided in this order\n");
|
||||
printf(" distMode : 0 - 1 (default: 0 - block distribution of rank to devices)\n");
|
||||
printf(" startDev : id of first Device to use (default: 0) \n");
|
||||
printf(" numDevs : number of Devices to use (default: 2) \n");
|
||||
}
|
||||
|
||||
static int distmode=0;
|
||||
static int startdev=0;
|
||||
static int numdevices=2;
|
||||
static int maxdevices=0;
|
||||
|
||||
static void devicemode_init( int argc, char **argv)
|
||||
{
|
||||
char *modeexpl[4];
|
||||
int myRank;
|
||||
MPICHECK(MPI_Comm_rank (MPI_COMM_WORLD, &myRank));
|
||||
|
||||
modeexpl[0] = strdup("0: contiguous assignment of ranks to devices");
|
||||
modeexpl[1] = strdup("1: round robin assignment of ranks to devices");
|
||||
|
||||
if (argc > 1 ) {
|
||||
distmode = atoi(argv[1]);
|
||||
}
|
||||
if (argc > 2 ) {
|
||||
startdev = atoi(argv[2]);
|
||||
}
|
||||
if ( argc > 3 ) {
|
||||
numdevices = atoi(argv[3]);
|
||||
}
|
||||
if ( distmode > 1) {
|
||||
if ( myRank == 0 ) {
|
||||
printf("Unknown distribution mode %d. Known distribution modes are 0-1\n", distmode);
|
||||
print_help();
|
||||
}
|
||||
MPI_Abort (MPI_COMM_WORLD, -1);
|
||||
}
|
||||
HIPCHECK(hipGetDeviceCount(&maxdevices));
|
||||
if ( numdevices > maxdevices) {
|
||||
if ( myRank == 0 ) {
|
||||
printf("Requesting %d devices, %d devices available. Aborting.\n", numdevices, maxdevices);
|
||||
print_help();
|
||||
}
|
||||
MPI_Abort (MPI_COMM_WORLD, -1);
|
||||
}
|
||||
if ( startdev > maxdevices-1) {
|
||||
if ( myRank == 0 ) {
|
||||
printf("Startdevice is %d, max. number of devices is %d. Valid values are 0 - %d\n", startdev, maxdevices, maxdevices-1);
|
||||
print_help();
|
||||
}
|
||||
MPI_Abort (MPI_COMM_WORLD, -1);
|
||||
}
|
||||
|
||||
if ( myRank == 0 ) {
|
||||
printf("Using binding mode %s\n", modeexpl[distmode]);
|
||||
printf("Starting devices is %d, %d devices used.\n\n", startdev, numdevices);
|
||||
}
|
||||
}
|
||||
|
||||
static bool report_binding=true;
|
||||
|
||||
static void device_set(int id, int nDev)
|
||||
{
|
||||
int dev=0;
|
||||
if (distmode == 0 ) {
|
||||
int tmp = (id*numdevices)/nDev;
|
||||
dev = (startdev+tmp)%maxdevices;
|
||||
}
|
||||
else if (distmode == 1) {
|
||||
dev = (startdev+id)%numdevices;
|
||||
}
|
||||
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
if (report_binding) {
|
||||
printf("Rank %d using device %d\n", id, dev);
|
||||
if ( id == nDev-1) {
|
||||
report_binding=false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int size = 32*1024*1024;
|
||||
int myRank, nRanks, localRank = 0;
|
||||
ncclUniqueId id;
|
||||
ncclComm_t comm;
|
||||
float *h_sendbuff, *h_recvbuff;
|
||||
float *sendbuff, *recvbuff;
|
||||
hipStream_t s;
|
||||
|
||||
//initializing MPI
|
||||
MPICHECK(MPI_Init(&argc, &argv));
|
||||
MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank));
|
||||
MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks));
|
||||
|
||||
//get deviceId to be used for each rank, e.g. localRank%numberOfDevices
|
||||
devicemode_init( argc, argv);
|
||||
int nDev = numdevices;
|
||||
|
||||
//get NCCL unique ID at rank 0 and broadcast it to all others
|
||||
if (myRank == 0) ncclGetUniqueId(&id);
|
||||
MPICHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD));
|
||||
|
||||
//initializing RCCL
|
||||
device_set(myRank, nRanks);
|
||||
NCCLCHECK(ncclCommInitRankMulti(&comm, nRanks, id, myRank, myRank));
|
||||
|
||||
//allocate buffers
|
||||
HIPCHECK(hipMalloc(&sendbuff, size * sizeof(float)));
|
||||
h_sendbuff = (float*) malloc ( size *sizeof(float));
|
||||
init_sendbuf(h_sendbuff, size, myRank);
|
||||
HIPCHECK(hipMemcpy(sendbuff, h_sendbuff, size * sizeof(float), hipMemcpyDefault));
|
||||
|
||||
HIPCHECK(hipMalloc(&recvbuff, size * sizeof(float)));
|
||||
h_recvbuff = (float*) malloc ( size *sizeof(float));
|
||||
init_zero(h_recvbuff, size);
|
||||
HIPCHECK(hipMemcpy(recvbuff, h_recvbuff, size * sizeof(float), hipMemcpyDefault));
|
||||
|
||||
HIPCHECK(hipStreamCreate(&s));
|
||||
|
||||
NCCLCHECK(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, ncclFloat,
|
||||
ncclSum, comm, s));
|
||||
|
||||
//completing NCCL operation by synchronizing on the HIP stream
|
||||
HIPCHECK(hipStreamSynchronize(s));
|
||||
|
||||
//check result
|
||||
|
||||
HIPCHECK(hipMemcpy(h_recvbuff, recvbuff, size*sizeof(float), hipMemcpyDefault));
|
||||
bool res = check_recvbuf(h_recvbuff, size, nRanks);
|
||||
printf("[%d] Checking buffer result is %s\n", myRank, res == true ? "correct" : "wrong" );
|
||||
|
||||
//free buffers
|
||||
HIPCHECK(hipFree(sendbuff));
|
||||
free (h_sendbuff);
|
||||
HIPCHECK(hipFree(recvbuff));
|
||||
free (h_recvbuff);
|
||||
|
||||
//finalizing NCCL
|
||||
ncclCommDestroy(comm);
|
||||
HIPCHECK(hipStreamDestroy(s));
|
||||
|
||||
//finalizing MPI
|
||||
printf("[MPI Rank %d] Success \n", myRank);
|
||||
MPICHECK(MPI_Finalize());
|
||||
return 0;
|
||||
}
|
||||
@@ -1,243 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
#include <stdio.h>
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "rccl.h"
|
||||
|
||||
#define HIPCHECK(cmd) do { \
|
||||
hipError_t e = cmd; \
|
||||
if( e != hipSuccess ) { \
|
||||
printf("Failed: HIP error %s:%d '%s'\n", \
|
||||
__FILE__,__LINE__,hipGetErrorString(e)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define NCCLCHECK(cmd) do { \
|
||||
ncclResult_t r = cmd; \
|
||||
if (r!= ncclSuccess) { \
|
||||
printf("Failed, NCCL error %s:%d '%s'\n", \
|
||||
__FILE__,__LINE__,ncclGetErrorString(r)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
|
||||
static void init_sendbuf (float *sendbuf, int count, int val)
|
||||
{
|
||||
for (int i = 0; i < count; i++) {
|
||||
sendbuf[i] = (float)val;
|
||||
}
|
||||
}
|
||||
|
||||
static void init_zero (float *recvbuf, int count)
|
||||
{
|
||||
for (int i = 0; i < count; i++) {
|
||||
recvbuf[i] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
static bool check_recvbuf (float *recvbuf, int count, int ndevices)
|
||||
{
|
||||
bool result = true;
|
||||
float expected=0.0;
|
||||
|
||||
for (int i=0; i<ndevices; i++){
|
||||
expected += (float)i+1;
|
||||
}
|
||||
for (int i = 0; i < count; i++) {
|
||||
if (recvbuf[i] != expected) {
|
||||
result = false;
|
||||
printf("Element %d is %f expected %f\n", i, recvbuf[i], expected);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static int distmode=0;
|
||||
static int startdev=0;
|
||||
static int numdevices=2;
|
||||
static int maxdevices=0;
|
||||
static int ranksperdev=1;
|
||||
|
||||
static void print_help()
|
||||
{
|
||||
printf("Usage: rccl-allreduce-multirank <distMode> <startDev> <numDevs> <ranksPerDev> \n");
|
||||
printf(" all arguments are optional, but have to be provided in this order\n");
|
||||
printf(" distMode : 0 - 3 (default: 0 - all ranks are on different devices)\n");
|
||||
printf(" startDev : id of first Device to use (default: 0) \n");
|
||||
printf(" numDevs : number of Devices to use (default: 2) \n");
|
||||
printf(" ranksPerDev: number of Ranks per Device (default: 1) \n");
|
||||
}
|
||||
|
||||
static void devicemode_init( int argc, char **argv)
|
||||
{
|
||||
char *modeexpl[4];
|
||||
|
||||
modeexpl[0] = strdup("0: all ranks are on different devices");
|
||||
modeexpl[1] = strdup("1: all ranks are on same device");
|
||||
modeexpl[2] = strdup("2: contiguous assignment of ranks to devices");
|
||||
modeexpl[3] = strdup("3: round robin assignment of ranks to devices");
|
||||
|
||||
if (argc > 1 ) {
|
||||
distmode = atoi(argv[1]);
|
||||
}
|
||||
if (argc > 2 ) {
|
||||
startdev = atoi(argv[2]);
|
||||
}
|
||||
if ( argc > 3 ) {
|
||||
numdevices = atoi(argv[3]);
|
||||
}
|
||||
if ( argc > 4 ) {
|
||||
ranksperdev = atoi(argv[4]);
|
||||
}
|
||||
|
||||
if ( distmode > 3) {
|
||||
printf("Unknown distribution mode %d. Known distribution modes are 0-3\n", distmode);
|
||||
print_help();
|
||||
exit(-1);
|
||||
}
|
||||
HIPCHECK(hipGetDeviceCount(&maxdevices));
|
||||
if ( numdevices > maxdevices) {
|
||||
printf("Requesting %d devices, %d devices available. Aborting.\n", numdevices, maxdevices);
|
||||
print_help();
|
||||
exit(-1);
|
||||
}
|
||||
if ( startdev > maxdevices-1) {
|
||||
printf("Startdevice is %d, max. number of devices is %d. Valid values are 0 - %d\n", startdev, maxdevices, maxdevices-1);
|
||||
print_help();
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
if (distmode == 1) numdevices = 1;
|
||||
if (distmode == 0) ranksperdev = 1;
|
||||
|
||||
printf("Using binding mode %s\n", modeexpl[distmode]);
|
||||
printf("Starting devices is %d, %d devices used, %d ranks per device.\n\n", startdev, numdevices, ranksperdev);
|
||||
}
|
||||
|
||||
static bool report_binding=true;
|
||||
|
||||
static void device_set(int id, int nDev)
|
||||
{
|
||||
int dev=0;
|
||||
if (distmode == 0 )
|
||||
dev = (startdev+id)%numdevices;
|
||||
else if (distmode == 1) {
|
||||
dev = startdev;
|
||||
}
|
||||
else if (distmode == 2) {
|
||||
int tmp = (id*numdevices)/nDev;
|
||||
dev = (startdev+tmp)%maxdevices;
|
||||
}
|
||||
else if (distmode == 3) {
|
||||
dev = (startdev+id)%numdevices;
|
||||
}
|
||||
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
if (report_binding) {
|
||||
printf("Rank %d using device %d\n", id, dev);
|
||||
if ( id == nDev-1) {
|
||||
report_binding=false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int nDev;
|
||||
int size = 32*1024*1024;
|
||||
|
||||
devicemode_init( argc, argv);
|
||||
nDev = numdevices * ranksperdev;
|
||||
|
||||
//allocating and initializing device buffers
|
||||
float** h_sendbuff = (float**)malloc(nDev * sizeof(float*));
|
||||
float** sendbuff = (float**)malloc(nDev * sizeof(float*));
|
||||
float** h_recvbuff = (float**)malloc(nDev * sizeof(float*));
|
||||
float** recvbuff = (float**)malloc(nDev * sizeof(float*));
|
||||
hipStream_t* s = (hipStream_t*)malloc(sizeof(hipStream_t)*nDev);
|
||||
ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
|
||||
|
||||
for (int i = 0; i < nDev; ++i) {
|
||||
device_set(i, nDev);
|
||||
|
||||
HIPCHECK(hipMalloc(sendbuff+i, size * sizeof(float)));
|
||||
h_sendbuff[i] = (float*) malloc (size *sizeof(float));
|
||||
init_sendbuf(h_sendbuff[i], size, i+1);
|
||||
HIPCHECK(hipMemcpy(sendbuff[i], h_sendbuff[i], size * sizeof(float), hipMemcpyDefault));
|
||||
|
||||
HIPCHECK(hipMalloc(recvbuff+i, size*sizeof(float)));
|
||||
h_recvbuff[i] = (float*) malloc (size *sizeof(float));
|
||||
HIPCHECK(hipMemset(recvbuff[i], 0, size*sizeof(float)));
|
||||
HIPCHECK(hipStreamSynchronize(NULL));
|
||||
HIPCHECK(hipStreamCreate(s+i));
|
||||
}
|
||||
|
||||
|
||||
//initializing NCCL
|
||||
ncclUniqueId id;
|
||||
ncclGetUniqueId(&id);
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
for (int i=0; i<nDev; i++ ) {
|
||||
device_set(i, nDev);
|
||||
NCCLCHECK(ncclCommInitRankMulti(&comms[i], nDev, id, i, i));
|
||||
}
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
|
||||
//calling NCCL communication API. Group API is required when using
|
||||
//multiple devices per thread
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
for (int i = 0; i < nDev; ++i) {
|
||||
NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], size, ncclFloat, ncclSum, comms[i], s[i]));
|
||||
}
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
|
||||
|
||||
//synchronizing on HIP streams to wait for completion of NCCL operation
|
||||
for (int i = 0; i < nDev; ++i) {
|
||||
device_set(i, nDev);
|
||||
HIPCHECK(hipStreamSynchronize(s[i]));
|
||||
}
|
||||
|
||||
for (int i = 0; i < nDev; ++i) {
|
||||
device_set(i, nDev);
|
||||
HIPCHECK(hipMemcpy(h_recvbuff[i], recvbuff[i], size*sizeof(float), hipMemcpyDefault ));
|
||||
bool res = check_recvbuf(h_recvbuff[i], size, nDev);
|
||||
printf("Checking buffer %d result is %s\n",i, res == true ? "correct" : "wrong" );
|
||||
}
|
||||
|
||||
//free buffers
|
||||
for (int i = 0; i < nDev; ++i) {
|
||||
device_set(i, nDev);
|
||||
HIPCHECK(hipFree(sendbuff[i]));
|
||||
free (h_sendbuff[i]);
|
||||
HIPCHECK(hipFree(recvbuff[i]));
|
||||
free (h_recvbuff[i]);
|
||||
}
|
||||
|
||||
|
||||
//finalizing NCCL
|
||||
for(int i = 0; i < nDev; ++i) {
|
||||
ncclCommDestroy(comms[i]);
|
||||
HIPCHECK(hipStreamDestroy(s[i]));
|
||||
}
|
||||
|
||||
free (h_sendbuff);
|
||||
free (sendbuff);
|
||||
free (h_recvbuff);
|
||||
free (recvbuff);
|
||||
free (s);
|
||||
free (comms);
|
||||
|
||||
printf("Success \n");
|
||||
return 0;
|
||||
}
|
||||
@@ -1,232 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "rccl.h"
|
||||
#include "mpi.h"
|
||||
|
||||
|
||||
#define MPICHECK(cmd) do { \
|
||||
int e = cmd; \
|
||||
if( e != MPI_SUCCESS ) { \
|
||||
printf("Failed: MPI error %s:%d '%d'\n", \
|
||||
__FILE__,__LINE__, e); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define HIPCHECK(cmd) do { \
|
||||
hipError_t e = cmd; \
|
||||
if( e != hipSuccess ) { \
|
||||
printf("Failed: HIP error %s:%d '%s'\n", \
|
||||
__FILE__,__LINE__,hipGetErrorString(e)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define NCCLCHECK(cmd) do { \
|
||||
ncclResult_t r = cmd; \
|
||||
if (r!= ncclSuccess) { \
|
||||
printf("Failed, NCCL error %s:%d '%s'\n", \
|
||||
__FILE__,__LINE__,ncclGetErrorString(r)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
static void init_sendbuf (float *sendbuf, int count, int val)
|
||||
{
|
||||
for (int i = 0; i < count; i++) {
|
||||
sendbuf[i] = (float)val+1;
|
||||
}
|
||||
}
|
||||
|
||||
static void init_zero (float *recvbuf, int count)
|
||||
{
|
||||
for (int i = 0; i < count; i++) {
|
||||
recvbuf[i] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
static bool check_recvbuf (float *recvbuf, int count, int ndevices)
|
||||
{
|
||||
bool result = true;
|
||||
float expected=0.0;
|
||||
|
||||
for (int i=0; i<ndevices; i++){
|
||||
expected += (float)i+1;
|
||||
}
|
||||
for (int i = 0; i < count; i++) {
|
||||
if (recvbuf[i] != expected) {
|
||||
result = false;
|
||||
printf("Element %d is %f expected %f\n", i, recvbuf[i], expected);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static void print_help()
|
||||
{
|
||||
printf("Usage: rccl-reducescatter-mpi-multirank <distMode> <startDev> <numDevs> \n");
|
||||
printf(" all arguments are optional, but have to be provided in this order\n");
|
||||
printf(" distMode : 0 - 1 (default: 0 - block distribution of rank to devices)\n");
|
||||
printf(" startDev : id of first Device to use (default: 0) \n");
|
||||
printf(" numDevs : number of Devices to use (default: 2) \n");
|
||||
}
|
||||
|
||||
static int distmode=0;
|
||||
static int startdev=0;
|
||||
static int numdevices=2;
|
||||
static int maxdevices=0;
|
||||
|
||||
static void devicemode_init( int argc, char **argv)
|
||||
{
|
||||
char *modeexpl[4];
|
||||
int myRank;
|
||||
MPICHECK(MPI_Comm_rank (MPI_COMM_WORLD, &myRank));
|
||||
|
||||
modeexpl[0] = strdup("0: contiguous assignment of ranks to devices");
|
||||
modeexpl[1] = strdup("1: round robin assignment of ranks to devices");
|
||||
|
||||
if (argc > 1 ) {
|
||||
distmode = atoi(argv[1]);
|
||||
}
|
||||
if (argc > 2 ) {
|
||||
startdev = atoi(argv[2]);
|
||||
}
|
||||
if ( argc > 3 ) {
|
||||
numdevices = atoi(argv[3]);
|
||||
}
|
||||
if ( distmode > 1) {
|
||||
if ( myRank == 0 ) {
|
||||
printf("Unknown distribution mode %d. Known distribution modes are 0-1\n", distmode);
|
||||
print_help();
|
||||
}
|
||||
MPI_Abort (MPI_COMM_WORLD, -1);
|
||||
}
|
||||
HIPCHECK(hipGetDeviceCount(&maxdevices));
|
||||
if ( numdevices > maxdevices) {
|
||||
if ( myRank == 0 ) {
|
||||
printf("Requesting %d devices, %d devices available. Aborting.\n", numdevices, maxdevices);
|
||||
print_help();
|
||||
}
|
||||
MPI_Abort (MPI_COMM_WORLD, -1);
|
||||
}
|
||||
if ( startdev > maxdevices-1) {
|
||||
if ( myRank == 0 ) {
|
||||
printf("Startdevice is %d, max. number of devices is %d. Valid values are 0 - %d\n", startdev, maxdevices, maxdevices-1);
|
||||
print_help();
|
||||
}
|
||||
MPI_Abort (MPI_COMM_WORLD, -1);
|
||||
}
|
||||
|
||||
if ( myRank == 0 ) {
|
||||
printf("Using binding mode %s\n", modeexpl[distmode]);
|
||||
printf("Starting devices is %d, %d devices used.\n\n", startdev, numdevices);
|
||||
}
|
||||
}
|
||||
|
||||
static bool report_binding=true;
|
||||
|
||||
static void device_set(int id, int nDev)
|
||||
{
|
||||
int dev=0;
|
||||
if (distmode == 0 ) {
|
||||
int tmp = (id*numdevices)/nDev;
|
||||
dev = (startdev+tmp)%maxdevices;
|
||||
}
|
||||
else if (distmode == 1) {
|
||||
dev = (startdev+id)%numdevices;
|
||||
}
|
||||
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
if (report_binding) {
|
||||
printf("Rank %d using device %d\n", id, dev);
|
||||
if ( id == nDev-1) {
|
||||
report_binding=false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int myRank, nRanks, localRank = 0;
|
||||
ncclUniqueId id;
|
||||
ncclComm_t comm;
|
||||
float *h_sendbuff, *h_recvbuff;
|
||||
float *sendbuff, *recvbuff;
|
||||
hipStream_t s;
|
||||
|
||||
//initializing MPI
|
||||
MPICHECK(MPI_Init(&argc, &argv));
|
||||
MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank));
|
||||
MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks));
|
||||
|
||||
//get deviceId to be used for each rank, e.g. localRank%numberOfDevices
|
||||
devicemode_init( argc, argv);
|
||||
int nDev = numdevices;
|
||||
|
||||
int sendsize = 32*1024*1024;
|
||||
int recvsize = sendsize / nRanks;
|
||||
|
||||
//get NCCL unique ID at rank 0 and broadcast it to all others
|
||||
if (myRank == 0) ncclGetUniqueId(&id);
|
||||
MPICHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD));
|
||||
|
||||
//initializing RCCL
|
||||
device_set(myRank, nRanks);
|
||||
NCCLCHECK(ncclCommInitRankMulti(&comm, nRanks, id, myRank, myRank));
|
||||
|
||||
//allocate buffers
|
||||
HIPCHECK(hipMalloc(&sendbuff, sendsize * sizeof(float)));
|
||||
h_sendbuff = (float*) malloc ( sendsize *sizeof(float));
|
||||
init_sendbuf(h_sendbuff, sendsize, myRank);
|
||||
HIPCHECK(hipMemcpy(sendbuff, h_sendbuff, sendsize * sizeof(float), hipMemcpyDefault));
|
||||
|
||||
HIPCHECK(hipMalloc(&recvbuff, recvsize * sizeof(float)));
|
||||
h_recvbuff = (float*) malloc ( recvsize *sizeof(float));
|
||||
init_zero(h_recvbuff, recvsize);
|
||||
HIPCHECK(hipMemcpy(recvbuff, h_recvbuff, recvsize * sizeof(float), hipMemcpyDefault));
|
||||
|
||||
|
||||
HIPCHECK(hipStreamCreate(&s));
|
||||
|
||||
NCCLCHECK(ncclReduceScatter((const void*)sendbuff, (void*)recvbuff, recvsize, ncclFloat,
|
||||
ncclSum, comm, s));
|
||||
|
||||
//completing NCCL operation by synchronizing on the HIP stream
|
||||
HIPCHECK(hipStreamSynchronize(s));
|
||||
|
||||
//check result
|
||||
|
||||
HIPCHECK(hipMemcpy(h_recvbuff, recvbuff, recvsize*sizeof(float), hipMemcpyDefault));
|
||||
bool res = check_recvbuf(h_recvbuff, recvsize, nRanks);
|
||||
printf("[%d] Checking buffer result is %s\n", myRank, res == true ? "correct" : "wrong" );
|
||||
|
||||
//free buffers
|
||||
HIPCHECK(hipFree(sendbuff));
|
||||
free (h_sendbuff);
|
||||
HIPCHECK(hipFree(recvbuff));
|
||||
free (h_recvbuff);
|
||||
|
||||
//finalizing NCCL
|
||||
ncclCommDestroy(comm);
|
||||
HIPCHECK(hipStreamDestroy(s));
|
||||
|
||||
//finalizing MPI
|
||||
printf("[MPI Rank %d] Success \n", myRank);
|
||||
MPICHECK(MPI_Finalize());
|
||||
return 0;
|
||||
}
|
||||
@@ -1,244 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
#include <stdio.h>
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "rccl.h"
|
||||
|
||||
#define HIPCHECK(cmd) do { \
|
||||
hipError_t e = cmd; \
|
||||
if( e != hipSuccess ) { \
|
||||
printf("Failed: HIP error %s:%d '%s'\n", \
|
||||
__FILE__,__LINE__,hipGetErrorString(e)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define NCCLCHECK(cmd) do { \
|
||||
ncclResult_t r = cmd; \
|
||||
if (r!= ncclSuccess) { \
|
||||
printf("Failed, NCCL error %s:%d '%s'\n", \
|
||||
__FILE__,__LINE__,ncclGetErrorString(r)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
|
||||
static void init_sendbuf (float *sendbuf, int count, int val)
|
||||
{
|
||||
for (int i = 0; i < count; i++) {
|
||||
sendbuf[i] = (float)val;
|
||||
}
|
||||
}
|
||||
|
||||
static void init_zero (float *recvbuf, int count)
|
||||
{
|
||||
for (int i = 0; i < count; i++) {
|
||||
recvbuf[i] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
static bool check_recvbuf (float *recvbuf, int count, int ndevices)
|
||||
{
|
||||
bool result = true;
|
||||
float expected=0.0;
|
||||
|
||||
for (int i=0; i<ndevices; i++){
|
||||
expected += (float)i+1;
|
||||
}
|
||||
for (int i = 0; i < count; i++) {
|
||||
if (recvbuf[i] != expected) {
|
||||
result = false;
|
||||
printf("Element %d is %f expected %f\n", i, recvbuf[i], expected);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static int distmode=0;
|
||||
static int startdev=0;
|
||||
static int numdevices=2;
|
||||
static int maxdevices=0;
|
||||
static int ranksperdev=1;
|
||||
|
||||
static void print_help()
|
||||
{
|
||||
printf("Usage: rccl-reducescatter-multirank <distMode> <startDev> <numDevs> <ranksPerDev> \n");
|
||||
printf(" all arguments are optional, but have to be provided in this order\n");
|
||||
printf(" distMode : 0 - 3 (default: 0 - all ranks are on different devices)\n");
|
||||
printf(" startDev : id of first Device to use (default: 0) \n");
|
||||
printf(" numDevs : number of Devices to use (default: 2) \n");
|
||||
printf(" ranksPerDev: number of Ranks per Device (default: 1) \n");
|
||||
}
|
||||
|
||||
static void devicemode_init( int argc, char **argv)
|
||||
{
|
||||
char *modeexpl[4];
|
||||
|
||||
modeexpl[0] = strdup("0: all ranks are on different devices");
|
||||
modeexpl[1] = strdup("1: all ranks are on same device");
|
||||
modeexpl[2] = strdup("2: contiguous assignment of ranks to devices");
|
||||
modeexpl[3] = strdup("3: round robin assignment of ranks to devices");
|
||||
|
||||
if (argc > 1 ) {
|
||||
distmode = atoi(argv[1]);
|
||||
}
|
||||
if (argc > 2 ) {
|
||||
startdev = atoi(argv[2]);
|
||||
}
|
||||
if ( argc > 3 ) {
|
||||
numdevices = atoi(argv[3]);
|
||||
}
|
||||
if ( argc > 4 ) {
|
||||
ranksperdev = atoi(argv[4]);
|
||||
}
|
||||
|
||||
if ( distmode > 3) {
|
||||
printf("Unknown distribution mode %d. Known distribution modes are 0-3\n", distmode);
|
||||
print_help();
|
||||
exit(-1);
|
||||
}
|
||||
HIPCHECK(hipGetDeviceCount(&maxdevices));
|
||||
if ( numdevices > maxdevices) {
|
||||
printf("Requesting %d devices, %d devices available. Aborting.\n", numdevices, maxdevices);
|
||||
print_help();
|
||||
exit(-1);
|
||||
}
|
||||
if ( startdev > maxdevices-1) {
|
||||
printf("Startdevice is %d, max. number of devices is %d. Valid values are 0 - %d\n", startdev, maxdevices, maxdevices-1);
|
||||
print_help();
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
if (distmode == 1) numdevices = 1;
|
||||
if (distmode == 0) ranksperdev = 1;
|
||||
|
||||
printf("Using binding mode %s\n", modeexpl[distmode]);
|
||||
printf("Starting devices is %d, %d devices used, %d ranks per device.\n\n", startdev, numdevices, ranksperdev);
|
||||
}
|
||||
|
||||
static bool report_binding=true;
|
||||
|
||||
static void device_set(int id, int nDev)
|
||||
{
|
||||
int dev=0;
|
||||
if (distmode == 0 )
|
||||
dev = (startdev+id)%numdevices;
|
||||
else if (distmode == 1) {
|
||||
dev = startdev;
|
||||
}
|
||||
else if (distmode == 2) {
|
||||
int tmp = (id*numdevices)/nDev;
|
||||
dev = (startdev+tmp)%maxdevices;
|
||||
}
|
||||
else if (distmode == 3) {
|
||||
dev = (startdev+id)%numdevices;
|
||||
}
|
||||
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
if (report_binding) {
|
||||
printf("Rank %d using device %d\n", id, dev);
|
||||
if ( id == nDev-1) {
|
||||
report_binding=false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int nDev;
|
||||
|
||||
devicemode_init( argc, argv);
|
||||
nDev = numdevices * ranksperdev;
|
||||
|
||||
int sendsize = 32*1024*1024;
|
||||
int recvsize = sendsize / nDev;
|
||||
|
||||
//allocating and initializing device buffers
|
||||
float** h_sendbuff = (float**)malloc(nDev * sizeof(float*));
|
||||
float** sendbuff = (float**)malloc(nDev * sizeof(float*));
|
||||
float** h_recvbuff = (float**)malloc(nDev * sizeof(float*));
|
||||
float** recvbuff = (float**)malloc(nDev * sizeof(float*));
|
||||
hipStream_t* s = (hipStream_t*)malloc(sizeof(hipStream_t)*nDev);
|
||||
ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
|
||||
|
||||
for (int i = 0; i < nDev; ++i) {
|
||||
device_set(i, nDev);
|
||||
|
||||
HIPCHECK(hipMalloc(sendbuff+i, sendsize * sizeof(float)));
|
||||
h_sendbuff[i] = (float*) malloc (sendsize *sizeof(float));
|
||||
init_sendbuf(h_sendbuff[i], sendsize, i+1);
|
||||
HIPCHECK(hipMemcpy(sendbuff[i], h_sendbuff[i], sendsize * sizeof(float), hipMemcpyDefault));
|
||||
|
||||
HIPCHECK(hipMalloc(recvbuff+i, recvsize*sizeof(float)));
|
||||
h_recvbuff[i] = (float*) malloc (recvsize *sizeof(float));
|
||||
HIPCHECK(hipMemset(recvbuff[i], 0, recvsize*sizeof(float)));
|
||||
HIPCHECK(hipStreamSynchronize(NULL));
|
||||
HIPCHECK(hipStreamCreate(s+i));
|
||||
}
|
||||
|
||||
|
||||
//initializing NCCL
|
||||
ncclUniqueId id;
|
||||
ncclGetUniqueId(&id);
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
for (int i=0; i<nDev; i++ ) {
|
||||
device_set(i, nDev);
|
||||
NCCLCHECK(ncclCommInitRankMulti(&comms[i], nDev, id, i, i));
|
||||
}
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
|
||||
//calling NCCL communication API. Group API is required when using
|
||||
//multiple devices per thread
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
for (int i = 0; i < nDev; ++i) {
|
||||
NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i], recvsize, ncclFloat, ncclSum, comms[i], s[i]));
|
||||
}
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
|
||||
|
||||
//synchronizing on HIP streams to wait for completion of NCCL operation
|
||||
for (int i = 0; i < nDev; ++i) {
|
||||
device_set(i, nDev);
|
||||
HIPCHECK(hipStreamSynchronize(s[i]));
|
||||
}
|
||||
|
||||
for (int i = 0; i < nDev; ++i) {
|
||||
device_set(i, nDev);
|
||||
HIPCHECK(hipMemcpy(h_recvbuff[i], recvbuff[i], recvsize*sizeof(float), hipMemcpyDefault ));
|
||||
bool res = check_recvbuf(h_recvbuff[i], recvsize, nDev);
|
||||
printf("Checking buffer %d result is %s\n",i, res == true ? "correct" : "wrong" );
|
||||
}
|
||||
|
||||
//free buffers
|
||||
for (int i = 0; i < nDev; ++i) {
|
||||
device_set(i, nDev);
|
||||
HIPCHECK(hipFree(sendbuff[i]));
|
||||
free (h_sendbuff[i]);
|
||||
HIPCHECK(hipFree(recvbuff[i]));
|
||||
free (h_recvbuff[i]);
|
||||
}
|
||||
|
||||
//finalizing RCCL
|
||||
for(int i = 0; i < nDev; ++i) {
|
||||
ncclCommDestroy(comms[i]);
|
||||
HIPCHECK(hipStreamDestroy(s[i]));
|
||||
}
|
||||
|
||||
free (h_sendbuff);
|
||||
free (sendbuff);
|
||||
free (h_recvbuff);
|
||||
free (recvbuff);
|
||||
free (s);
|
||||
free (comms);
|
||||
|
||||
printf("Success \n");
|
||||
return 0;
|
||||
}
|
||||
Ссылка в новой задаче
Block a user