diff --git a/tools/MultiRank/Makefile b/tools/MultiRank/Makefile deleted file mode 100644 index b3c9ccab0f..0000000000 --- a/tools/MultiRank/Makefile +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. - -# Set to where RCCL is installed -RCCL_INSTALL=../../build/release - -HIP_PATH?= $(wildcard /opt/rocm) -ifeq (,$(HIP_PATH)) -HIP_PATH=../../.. -endif -HIPCC=$(HIP_PATH)/bin/hipcc - -EXE=rccl-allreduce-multirank rccl-reducescatter-multirank -CXXFLAGS = -std=c++11 -O3 -I$(RCCL_INSTALL)/include/rccl/ -L$(RCCL_INSTALL) -lrccl - -all: $(EXE) - -rccl-allreduce-multirank: rccl-allreduce-multirank.cc $(shell find -regex ".*\.\hpp") - $(HIPCC) $(CXXFLAGS) $< -o $@ - -rccl-reducescatter-multirank: rccl-reducescatter-multirank.cc $(shell find -regex ".*\.\hpp") - $(HIPCC) $(CXXFLAGS) $< -o $@ - -clean: - rm -f *.o $(EXE) diff --git a/tools/MultiRank/README.mpi b/tools/MultiRank/README.mpi deleted file mode 100755 index 4672aa9eb4..0000000000 --- a/tools/MultiRank/README.mpi +++ /dev/null @@ -1,19 +0,0 @@ -#************************************************************************ -# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. -# -# See LICENSE.txt for license information -#***********************************************************************/ - -#!/bin/bash -export MPI_INSTALL_PATH= -export RCCL_INSTALL_PATH= -export ROCM_INSTALL_PATH= - -$MPI_INSTALL_PATH/bin/mpiCC -o rccl-allreduce-mpi-multirank rccl-allreduce-mpi-multirank.cc -I$ROCM_INSTALL_PATH/include -I$RCCL_INSTALL_PATH/include -D__HIP_PLATFORM_AMD__ -L$ROCM_INSTALL_PATH/lib -lamdhip64 -L$RCCL_INSTALL_PATH/lib -lrccl - -$MPI_INSTALL_PATH/bin/mpiCC -o rccl-reducescatter-mpi-multirank rccl-reducescatter-mpi-multirank.cc -I$ROCM_INSTALL_PATH/include -I$RCCL_INSTALL_PATH/include -D__HIP_PLATFORM_AMD__ -L$ROCM_INSTALL_PATH/lib -lamdhip64 -L$RCCL_INSTALL_PATH/lib -lrccl - - -export GPU_MAX_HW_QUEUES=16 -$MPI_INSTALL_PATH/bin/mpirun --mca pml ucx -np 4 ./rccl-allreduce-mpi-multirank 0 0 2 -$MPI_INSTALL_PATH/bin/mpirun --mca pml ucx -np 4 ./rccl-reducescatter-mpi-multirank 0 0 2 diff --git a/tools/MultiRank/rccl-allreduce-mpi-multirank.cc b/tools/MultiRank/rccl-allreduce-mpi-multirank.cc deleted file mode 100644 index dad099f012..0000000000 --- a/tools/MultiRank/rccl-allreduce-mpi-multirank.cc +++ /dev/null @@ -1,228 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -#include -#include -#include -#include - -#include "hip/hip_runtime.h" -#include "rccl.h" -#include "mpi.h" - - -#define MPICHECK(cmd) do { \ - int e = cmd; \ - if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%d'\n", \ - __FILE__,__LINE__, e); \ - exit(EXIT_FAILURE); \ - } \ - } while(0) - - -#define HIPCHECK(cmd) do { \ - hipError_t e = cmd; \ - if( e != hipSuccess ) { \ - printf("Failed: HIP error %s:%d '%s'\n", \ - __FILE__,__LINE__,hipGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ - } while(0) - - -#define NCCLCHECK(cmd) do { \ - ncclResult_t r = cmd; \ - if (r!= ncclSuccess) { \ - printf("Failed, NCCL error %s:%d '%s'\n", \ - __FILE__,__LINE__,ncclGetErrorString(r)); \ - exit(EXIT_FAILURE); \ - } \ - } while(0) - -static void init_sendbuf (float *sendbuf, int count, int val) -{ - for (int i = 0; i < count; i++) { - sendbuf[i] = (float)val+1; - } -} - -static void init_zero (float *recvbuf, int count) -{ - for (int i = 0; i < count; i++) { - recvbuf[i] = 0.0; - } -} - -static bool check_recvbuf (float *recvbuf, int count, int ndevices) -{ - bool result = true; - float expected=0.0; - for (int i=0; i \n"); - printf(" all arguments are optional, but have to be provided in this order\n"); - printf(" distMode : 0 - 1 (default: 0 - block distribution of rank to devices)\n"); - printf(" startDev : id of first Device to use (default: 0) \n"); - printf(" numDevs : number of Devices to use (default: 2) \n"); -} - -static int distmode=0; -static int startdev=0; -static int numdevices=2; -static int maxdevices=0; - -static void devicemode_init( int argc, char **argv) -{ - char *modeexpl[4]; - int myRank; - MPICHECK(MPI_Comm_rank (MPI_COMM_WORLD, &myRank)); - - modeexpl[0] = strdup("0: contiguous assignment of ranks to devices"); - modeexpl[1] = strdup("1: round robin assignment of ranks to devices"); - - if (argc > 1 ) { - distmode = atoi(argv[1]); - } - if (argc > 2 ) { - startdev = atoi(argv[2]); - } - if ( argc > 3 ) { - numdevices = atoi(argv[3]); - } - if ( distmode > 1) { - if ( myRank == 0 ) { - printf("Unknown distribution mode %d. Known distribution modes are 0-1\n", distmode); - print_help(); - } - MPI_Abort (MPI_COMM_WORLD, -1); - } - HIPCHECK(hipGetDeviceCount(&maxdevices)); - if ( numdevices > maxdevices) { - if ( myRank == 0 ) { - printf("Requesting %d devices, %d devices available. Aborting.\n", numdevices, maxdevices); - print_help(); - } - MPI_Abort (MPI_COMM_WORLD, -1); - } - if ( startdev > maxdevices-1) { - if ( myRank == 0 ) { - printf("Startdevice is %d, max. number of devices is %d. Valid values are 0 - %d\n", startdev, maxdevices, maxdevices-1); - print_help(); - } - MPI_Abort (MPI_COMM_WORLD, -1); - } - - if ( myRank == 0 ) { - printf("Using binding mode %s\n", modeexpl[distmode]); - printf("Starting devices is %d, %d devices used.\n\n", startdev, numdevices); - } -} - -static bool report_binding=true; - -static void device_set(int id, int nDev) -{ - int dev=0; - if (distmode == 0 ) { - int tmp = (id*numdevices)/nDev; - dev = (startdev+tmp)%maxdevices; - } - else if (distmode == 1) { - dev = (startdev+id)%numdevices; - } - - HIPCHECK(hipSetDevice(dev)); - if (report_binding) { - printf("Rank %d using device %d\n", id, dev); - if ( id == nDev-1) { - report_binding=false; - } - } -} - - -int main(int argc, char* argv[]) -{ - int size = 32*1024*1024; - int myRank, nRanks, localRank = 0; - ncclUniqueId id; - ncclComm_t comm; - float *h_sendbuff, *h_recvbuff; - float *sendbuff, *recvbuff; - hipStream_t s; - - //initializing MPI - MPICHECK(MPI_Init(&argc, &argv)); - MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank)); - MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks)); - - //get deviceId to be used for each rank, e.g. localRank%numberOfDevices - devicemode_init( argc, argv); - int nDev = numdevices; - - //get NCCL unique ID at rank 0 and broadcast it to all others - if (myRank == 0) ncclGetUniqueId(&id); - MPICHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD)); - - //initializing RCCL - device_set(myRank, nRanks); - NCCLCHECK(ncclCommInitRankMulti(&comm, nRanks, id, myRank, myRank)); - - //allocate buffers - HIPCHECK(hipMalloc(&sendbuff, size * sizeof(float))); - h_sendbuff = (float*) malloc ( size *sizeof(float)); - init_sendbuf(h_sendbuff, size, myRank); - HIPCHECK(hipMemcpy(sendbuff, h_sendbuff, size * sizeof(float), hipMemcpyDefault)); - - HIPCHECK(hipMalloc(&recvbuff, size * sizeof(float))); - h_recvbuff = (float*) malloc ( size *sizeof(float)); - init_zero(h_recvbuff, size); - HIPCHECK(hipMemcpy(recvbuff, h_recvbuff, size * sizeof(float), hipMemcpyDefault)); - - HIPCHECK(hipStreamCreate(&s)); - - NCCLCHECK(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, ncclFloat, - ncclSum, comm, s)); - - //completing NCCL operation by synchronizing on the HIP stream - HIPCHECK(hipStreamSynchronize(s)); - - //check result - - HIPCHECK(hipMemcpy(h_recvbuff, recvbuff, size*sizeof(float), hipMemcpyDefault)); - bool res = check_recvbuf(h_recvbuff, size, nRanks); - printf("[%d] Checking buffer result is %s\n", myRank, res == true ? "correct" : "wrong" ); - - //free buffers - HIPCHECK(hipFree(sendbuff)); - free (h_sendbuff); - HIPCHECK(hipFree(recvbuff)); - free (h_recvbuff); - - //finalizing NCCL - ncclCommDestroy(comm); - HIPCHECK(hipStreamDestroy(s)); - - //finalizing MPI - printf("[MPI Rank %d] Success \n", myRank); - MPICHECK(MPI_Finalize()); - return 0; -} diff --git a/tools/MultiRank/rccl-allreduce-multirank.cc b/tools/MultiRank/rccl-allreduce-multirank.cc deleted file mode 100644 index 84680c46a1..0000000000 --- a/tools/MultiRank/rccl-allreduce-multirank.cc +++ /dev/null @@ -1,243 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -#include -#include "hip/hip_runtime.h" -#include "rccl.h" - -#define HIPCHECK(cmd) do { \ - hipError_t e = cmd; \ - if( e != hipSuccess ) { \ - printf("Failed: HIP error %s:%d '%s'\n", \ - __FILE__,__LINE__,hipGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - - -#define NCCLCHECK(cmd) do { \ - ncclResult_t r = cmd; \ - if (r!= ncclSuccess) { \ - printf("Failed, NCCL error %s:%d '%s'\n", \ - __FILE__,__LINE__,ncclGetErrorString(r)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - - -static void init_sendbuf (float *sendbuf, int count, int val) -{ - for (int i = 0; i < count; i++) { - sendbuf[i] = (float)val; - } -} - -static void init_zero (float *recvbuf, int count) -{ - for (int i = 0; i < count; i++) { - recvbuf[i] = 0.0; - } -} - -static bool check_recvbuf (float *recvbuf, int count, int ndevices) -{ - bool result = true; - float expected=0.0; - - for (int i=0; i \n"); - printf(" all arguments are optional, but have to be provided in this order\n"); - printf(" distMode : 0 - 3 (default: 0 - all ranks are on different devices)\n"); - printf(" startDev : id of first Device to use (default: 0) \n"); - printf(" numDevs : number of Devices to use (default: 2) \n"); - printf(" ranksPerDev: number of Ranks per Device (default: 1) \n"); -} - -static void devicemode_init( int argc, char **argv) -{ - char *modeexpl[4]; - - modeexpl[0] = strdup("0: all ranks are on different devices"); - modeexpl[1] = strdup("1: all ranks are on same device"); - modeexpl[2] = strdup("2: contiguous assignment of ranks to devices"); - modeexpl[3] = strdup("3: round robin assignment of ranks to devices"); - - if (argc > 1 ) { - distmode = atoi(argv[1]); - } - if (argc > 2 ) { - startdev = atoi(argv[2]); - } - if ( argc > 3 ) { - numdevices = atoi(argv[3]); - } - if ( argc > 4 ) { - ranksperdev = atoi(argv[4]); - } - - if ( distmode > 3) { - printf("Unknown distribution mode %d. Known distribution modes are 0-3\n", distmode); - print_help(); - exit(-1); - } - HIPCHECK(hipGetDeviceCount(&maxdevices)); - if ( numdevices > maxdevices) { - printf("Requesting %d devices, %d devices available. Aborting.\n", numdevices, maxdevices); - print_help(); - exit(-1); - } - if ( startdev > maxdevices-1) { - printf("Startdevice is %d, max. number of devices is %d. Valid values are 0 - %d\n", startdev, maxdevices, maxdevices-1); - print_help(); - exit(-1); - } - - if (distmode == 1) numdevices = 1; - if (distmode == 0) ranksperdev = 1; - - printf("Using binding mode %s\n", modeexpl[distmode]); - printf("Starting devices is %d, %d devices used, %d ranks per device.\n\n", startdev, numdevices, ranksperdev); -} - -static bool report_binding=true; - -static void device_set(int id, int nDev) -{ - int dev=0; - if (distmode == 0 ) - dev = (startdev+id)%numdevices; - else if (distmode == 1) { - dev = startdev; - } - else if (distmode == 2) { - int tmp = (id*numdevices)/nDev; - dev = (startdev+tmp)%maxdevices; - } - else if (distmode == 3) { - dev = (startdev+id)%numdevices; - } - - HIPCHECK(hipSetDevice(dev)); - if (report_binding) { - printf("Rank %d using device %d\n", id, dev); - if ( id == nDev-1) { - report_binding=false; - } - } -} - -int main(int argc, char* argv[]) -{ - int nDev; - int size = 32*1024*1024; - - devicemode_init( argc, argv); - nDev = numdevices * ranksperdev; - - //allocating and initializing device buffers - float** h_sendbuff = (float**)malloc(nDev * sizeof(float*)); - float** sendbuff = (float**)malloc(nDev * sizeof(float*)); - float** h_recvbuff = (float**)malloc(nDev * sizeof(float*)); - float** recvbuff = (float**)malloc(nDev * sizeof(float*)); - hipStream_t* s = (hipStream_t*)malloc(sizeof(hipStream_t)*nDev); - ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev); - - for (int i = 0; i < nDev; ++i) { - device_set(i, nDev); - - HIPCHECK(hipMalloc(sendbuff+i, size * sizeof(float))); - h_sendbuff[i] = (float*) malloc (size *sizeof(float)); - init_sendbuf(h_sendbuff[i], size, i+1); - HIPCHECK(hipMemcpy(sendbuff[i], h_sendbuff[i], size * sizeof(float), hipMemcpyDefault)); - - HIPCHECK(hipMalloc(recvbuff+i, size*sizeof(float))); - h_recvbuff[i] = (float*) malloc (size *sizeof(float)); - HIPCHECK(hipMemset(recvbuff[i], 0, size*sizeof(float))); - HIPCHECK(hipStreamSynchronize(NULL)); - HIPCHECK(hipStreamCreate(s+i)); - } - - - //initializing NCCL - ncclUniqueId id; - ncclGetUniqueId(&id); - NCCLCHECK(ncclGroupStart()); - for (int i=0; i -#include -#include -#include - -#include "hip/hip_runtime.h" -#include "rccl.h" -#include "mpi.h" - - -#define MPICHECK(cmd) do { \ - int e = cmd; \ - if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%d'\n", \ - __FILE__,__LINE__, e); \ - exit(EXIT_FAILURE); \ - } \ - } while(0) - - -#define HIPCHECK(cmd) do { \ - hipError_t e = cmd; \ - if( e != hipSuccess ) { \ - printf("Failed: HIP error %s:%d '%s'\n", \ - __FILE__,__LINE__,hipGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ - } while(0) - - -#define NCCLCHECK(cmd) do { \ - ncclResult_t r = cmd; \ - if (r!= ncclSuccess) { \ - printf("Failed, NCCL error %s:%d '%s'\n", \ - __FILE__,__LINE__,ncclGetErrorString(r)); \ - exit(EXIT_FAILURE); \ - } \ - } while(0) - -static void init_sendbuf (float *sendbuf, int count, int val) -{ - for (int i = 0; i < count; i++) { - sendbuf[i] = (float)val+1; - } -} - -static void init_zero (float *recvbuf, int count) -{ - for (int i = 0; i < count; i++) { - recvbuf[i] = 0.0; - } -} - -static bool check_recvbuf (float *recvbuf, int count, int ndevices) -{ - bool result = true; - float expected=0.0; - - for (int i=0; i \n"); - printf(" all arguments are optional, but have to be provided in this order\n"); - printf(" distMode : 0 - 1 (default: 0 - block distribution of rank to devices)\n"); - printf(" startDev : id of first Device to use (default: 0) \n"); - printf(" numDevs : number of Devices to use (default: 2) \n"); -} - -static int distmode=0; -static int startdev=0; -static int numdevices=2; -static int maxdevices=0; - -static void devicemode_init( int argc, char **argv) -{ - char *modeexpl[4]; - int myRank; - MPICHECK(MPI_Comm_rank (MPI_COMM_WORLD, &myRank)); - - modeexpl[0] = strdup("0: contiguous assignment of ranks to devices"); - modeexpl[1] = strdup("1: round robin assignment of ranks to devices"); - - if (argc > 1 ) { - distmode = atoi(argv[1]); - } - if (argc > 2 ) { - startdev = atoi(argv[2]); - } - if ( argc > 3 ) { - numdevices = atoi(argv[3]); - } - if ( distmode > 1) { - if ( myRank == 0 ) { - printf("Unknown distribution mode %d. Known distribution modes are 0-1\n", distmode); - print_help(); - } - MPI_Abort (MPI_COMM_WORLD, -1); - } - HIPCHECK(hipGetDeviceCount(&maxdevices)); - if ( numdevices > maxdevices) { - if ( myRank == 0 ) { - printf("Requesting %d devices, %d devices available. Aborting.\n", numdevices, maxdevices); - print_help(); - } - MPI_Abort (MPI_COMM_WORLD, -1); - } - if ( startdev > maxdevices-1) { - if ( myRank == 0 ) { - printf("Startdevice is %d, max. number of devices is %d. Valid values are 0 - %d\n", startdev, maxdevices, maxdevices-1); - print_help(); - } - MPI_Abort (MPI_COMM_WORLD, -1); - } - - if ( myRank == 0 ) { - printf("Using binding mode %s\n", modeexpl[distmode]); - printf("Starting devices is %d, %d devices used.\n\n", startdev, numdevices); - } -} - -static bool report_binding=true; - -static void device_set(int id, int nDev) -{ - int dev=0; - if (distmode == 0 ) { - int tmp = (id*numdevices)/nDev; - dev = (startdev+tmp)%maxdevices; - } - else if (distmode == 1) { - dev = (startdev+id)%numdevices; - } - - HIPCHECK(hipSetDevice(dev)); - if (report_binding) { - printf("Rank %d using device %d\n", id, dev); - if ( id == nDev-1) { - report_binding=false; - } - } -} - - -int main(int argc, char* argv[]) -{ - int myRank, nRanks, localRank = 0; - ncclUniqueId id; - ncclComm_t comm; - float *h_sendbuff, *h_recvbuff; - float *sendbuff, *recvbuff; - hipStream_t s; - - //initializing MPI - MPICHECK(MPI_Init(&argc, &argv)); - MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank)); - MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks)); - - //get deviceId to be used for each rank, e.g. localRank%numberOfDevices - devicemode_init( argc, argv); - int nDev = numdevices; - - int sendsize = 32*1024*1024; - int recvsize = sendsize / nRanks; - - //get NCCL unique ID at rank 0 and broadcast it to all others - if (myRank == 0) ncclGetUniqueId(&id); - MPICHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD)); - - //initializing RCCL - device_set(myRank, nRanks); - NCCLCHECK(ncclCommInitRankMulti(&comm, nRanks, id, myRank, myRank)); - - //allocate buffers - HIPCHECK(hipMalloc(&sendbuff, sendsize * sizeof(float))); - h_sendbuff = (float*) malloc ( sendsize *sizeof(float)); - init_sendbuf(h_sendbuff, sendsize, myRank); - HIPCHECK(hipMemcpy(sendbuff, h_sendbuff, sendsize * sizeof(float), hipMemcpyDefault)); - - HIPCHECK(hipMalloc(&recvbuff, recvsize * sizeof(float))); - h_recvbuff = (float*) malloc ( recvsize *sizeof(float)); - init_zero(h_recvbuff, recvsize); - HIPCHECK(hipMemcpy(recvbuff, h_recvbuff, recvsize * sizeof(float), hipMemcpyDefault)); - - - HIPCHECK(hipStreamCreate(&s)); - - NCCLCHECK(ncclReduceScatter((const void*)sendbuff, (void*)recvbuff, recvsize, ncclFloat, - ncclSum, comm, s)); - - //completing NCCL operation by synchronizing on the HIP stream - HIPCHECK(hipStreamSynchronize(s)); - - //check result - - HIPCHECK(hipMemcpy(h_recvbuff, recvbuff, recvsize*sizeof(float), hipMemcpyDefault)); - bool res = check_recvbuf(h_recvbuff, recvsize, nRanks); - printf("[%d] Checking buffer result is %s\n", myRank, res == true ? "correct" : "wrong" ); - - //free buffers - HIPCHECK(hipFree(sendbuff)); - free (h_sendbuff); - HIPCHECK(hipFree(recvbuff)); - free (h_recvbuff); - - //finalizing NCCL - ncclCommDestroy(comm); - HIPCHECK(hipStreamDestroy(s)); - - //finalizing MPI - printf("[MPI Rank %d] Success \n", myRank); - MPICHECK(MPI_Finalize()); - return 0; -} diff --git a/tools/MultiRank/rccl-reducescatter-multirank.cc b/tools/MultiRank/rccl-reducescatter-multirank.cc deleted file mode 100644 index 1c14f201b8..0000000000 --- a/tools/MultiRank/rccl-reducescatter-multirank.cc +++ /dev/null @@ -1,244 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -#include -#include "hip/hip_runtime.h" -#include "rccl.h" - -#define HIPCHECK(cmd) do { \ - hipError_t e = cmd; \ - if( e != hipSuccess ) { \ - printf("Failed: HIP error %s:%d '%s'\n", \ - __FILE__,__LINE__,hipGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - - -#define NCCLCHECK(cmd) do { \ - ncclResult_t r = cmd; \ - if (r!= ncclSuccess) { \ - printf("Failed, NCCL error %s:%d '%s'\n", \ - __FILE__,__LINE__,ncclGetErrorString(r)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - - -static void init_sendbuf (float *sendbuf, int count, int val) -{ - for (int i = 0; i < count; i++) { - sendbuf[i] = (float)val; - } -} - -static void init_zero (float *recvbuf, int count) -{ - for (int i = 0; i < count; i++) { - recvbuf[i] = 0.0; - } -} - -static bool check_recvbuf (float *recvbuf, int count, int ndevices) -{ - bool result = true; - float expected=0.0; - - for (int i=0; i \n"); - printf(" all arguments are optional, but have to be provided in this order\n"); - printf(" distMode : 0 - 3 (default: 0 - all ranks are on different devices)\n"); - printf(" startDev : id of first Device to use (default: 0) \n"); - printf(" numDevs : number of Devices to use (default: 2) \n"); - printf(" ranksPerDev: number of Ranks per Device (default: 1) \n"); -} - -static void devicemode_init( int argc, char **argv) -{ - char *modeexpl[4]; - - modeexpl[0] = strdup("0: all ranks are on different devices"); - modeexpl[1] = strdup("1: all ranks are on same device"); - modeexpl[2] = strdup("2: contiguous assignment of ranks to devices"); - modeexpl[3] = strdup("3: round robin assignment of ranks to devices"); - - if (argc > 1 ) { - distmode = atoi(argv[1]); - } - if (argc > 2 ) { - startdev = atoi(argv[2]); - } - if ( argc > 3 ) { - numdevices = atoi(argv[3]); - } - if ( argc > 4 ) { - ranksperdev = atoi(argv[4]); - } - - if ( distmode > 3) { - printf("Unknown distribution mode %d. Known distribution modes are 0-3\n", distmode); - print_help(); - exit(-1); - } - HIPCHECK(hipGetDeviceCount(&maxdevices)); - if ( numdevices > maxdevices) { - printf("Requesting %d devices, %d devices available. Aborting.\n", numdevices, maxdevices); - print_help(); - exit(-1); - } - if ( startdev > maxdevices-1) { - printf("Startdevice is %d, max. number of devices is %d. Valid values are 0 - %d\n", startdev, maxdevices, maxdevices-1); - print_help(); - exit(-1); - } - - if (distmode == 1) numdevices = 1; - if (distmode == 0) ranksperdev = 1; - - printf("Using binding mode %s\n", modeexpl[distmode]); - printf("Starting devices is %d, %d devices used, %d ranks per device.\n\n", startdev, numdevices, ranksperdev); -} - -static bool report_binding=true; - -static void device_set(int id, int nDev) -{ - int dev=0; - if (distmode == 0 ) - dev = (startdev+id)%numdevices; - else if (distmode == 1) { - dev = startdev; - } - else if (distmode == 2) { - int tmp = (id*numdevices)/nDev; - dev = (startdev+tmp)%maxdevices; - } - else if (distmode == 3) { - dev = (startdev+id)%numdevices; - } - - HIPCHECK(hipSetDevice(dev)); - if (report_binding) { - printf("Rank %d using device %d\n", id, dev); - if ( id == nDev-1) { - report_binding=false; - } - } -} - -int main(int argc, char* argv[]) -{ - int nDev; - - devicemode_init( argc, argv); - nDev = numdevices * ranksperdev; - - int sendsize = 32*1024*1024; - int recvsize = sendsize / nDev; - - //allocating and initializing device buffers - float** h_sendbuff = (float**)malloc(nDev * sizeof(float*)); - float** sendbuff = (float**)malloc(nDev * sizeof(float*)); - float** h_recvbuff = (float**)malloc(nDev * sizeof(float*)); - float** recvbuff = (float**)malloc(nDev * sizeof(float*)); - hipStream_t* s = (hipStream_t*)malloc(sizeof(hipStream_t)*nDev); - ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev); - - for (int i = 0; i < nDev; ++i) { - device_set(i, nDev); - - HIPCHECK(hipMalloc(sendbuff+i, sendsize * sizeof(float))); - h_sendbuff[i] = (float*) malloc (sendsize *sizeof(float)); - init_sendbuf(h_sendbuff[i], sendsize, i+1); - HIPCHECK(hipMemcpy(sendbuff[i], h_sendbuff[i], sendsize * sizeof(float), hipMemcpyDefault)); - - HIPCHECK(hipMalloc(recvbuff+i, recvsize*sizeof(float))); - h_recvbuff[i] = (float*) malloc (recvsize *sizeof(float)); - HIPCHECK(hipMemset(recvbuff[i], 0, recvsize*sizeof(float))); - HIPCHECK(hipStreamSynchronize(NULL)); - HIPCHECK(hipStreamCreate(s+i)); - } - - - //initializing NCCL - ncclUniqueId id; - ncclGetUniqueId(&id); - NCCLCHECK(ncclGroupStart()); - for (int i=0; i