307 lines
9.3 KiB
C++
307 lines
9.3 KiB
C++
/*************************************************************************
|
|
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
|
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
#ifndef __COMMON_H__
|
|
#define __COMMON_H__
|
|
|
|
#include "rccl/rccl.h"
|
|
#include <stdio.h>
|
|
#include <cstdint>
|
|
#include <algorithm>
|
|
#ifdef MPI_SUPPORT
|
|
#include "mpi.h"
|
|
#endif
|
|
#include <pthread.h>
|
|
#include "nccl1_compat.h"
|
|
#include "timer.h"
|
|
|
|
// For nccl.h < 2.13 since we define a weak fallback
|
|
extern "C" char const* ncclGetLastError(ncclComm_t comm);
|
|
|
|
#define HIPCHECK(cmd) do { \
|
|
hipError_t e = cmd; \
|
|
if( e != hipSuccess ) { \
|
|
char hostname[1024]; \
|
|
getHostName(hostname, 1024); \
|
|
printf("%s: Test HIP failure %s:%d '%s'\n", \
|
|
hostname, \
|
|
__FILE__,__LINE__,hipGetErrorString(e)); \
|
|
return testCudaError; \
|
|
} \
|
|
} while(0)
|
|
|
|
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,13,0)
|
|
#define NCCLCHECK(cmd) do { \
|
|
ncclResult_t res = cmd; \
|
|
if (res != ncclSuccess) { \
|
|
char hostname[1024]; \
|
|
getHostName(hostname, 1024); \
|
|
printf("%s: Test NCCL failure %s:%d " \
|
|
"'%s / %s'\n", \
|
|
hostname,__FILE__,__LINE__, \
|
|
ncclGetErrorString(res), \
|
|
ncclGetLastError(NULL)); \
|
|
return testNcclError; \
|
|
} \
|
|
} while(0)
|
|
#else
|
|
#define NCCLCHECK(cmd) do { \
|
|
ncclResult_t res = cmd; \
|
|
if (res != ncclSuccess) { \
|
|
char hostname[1024]; \
|
|
getHostName(hostname, 1024); \
|
|
printf("%s: Test NCCL failure %s:%d '%s'\n", \
|
|
hostname, \
|
|
__FILE__,__LINE__,ncclGetErrorString(res)); \
|
|
return testNcclError; \
|
|
} \
|
|
} while(0)
|
|
#endif
|
|
|
|
typedef enum {
|
|
testSuccess = 0,
|
|
testInternalError = 1,
|
|
testCudaError = 2,
|
|
testNcclError = 3,
|
|
testTimeout = 4,
|
|
testNumResults = 5
|
|
} testResult_t;
|
|
|
|
// Relay errors up and trace
|
|
#define TESTCHECK(cmd) do { \
|
|
testResult_t r = cmd; \
|
|
if (r!= testSuccess) { \
|
|
char hostname[1024]; \
|
|
getHostName(hostname, 1024); \
|
|
printf(" .. %s pid %d: Test failure %s:%d\n", \
|
|
hostname, getpid(), \
|
|
__FILE__,__LINE__); \
|
|
return r; \
|
|
} \
|
|
} while(0)
|
|
|
|
struct testColl {
|
|
const char name[20];
|
|
void (*getCollByteCount)(
|
|
size_t *sendcount, size_t *recvcount, size_t *paramcount,
|
|
size_t *sendInplaceOffset, size_t *recvInplaceOffset,
|
|
size_t count, int nranks);
|
|
testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
|
|
ncclRedOp_t op, int root, int rep, int in_place);
|
|
void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
|
|
testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
|
|
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
|
|
};
|
|
extern struct testColl allReduceTest;
|
|
extern struct testColl allGatherTest;
|
|
extern struct testColl reduceScatterTest;
|
|
extern struct testColl broadcastTest;
|
|
extern struct testColl reduceTest;
|
|
extern struct testColl alltoAllTest;
|
|
|
|
struct testEngine {
|
|
void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks);
|
|
testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type,
|
|
const char* typeName, ncclRedOp_t op, const char* opName);
|
|
};
|
|
|
|
extern struct testEngine ncclTestEngine;
|
|
|
|
struct threadArgs {
|
|
size_t nbytes;
|
|
size_t minbytes;
|
|
size_t maxbytes;
|
|
size_t stepbytes;
|
|
size_t stepfactor;
|
|
|
|
int totalProcs;
|
|
int nProcs;
|
|
int proc;
|
|
int nThreads;
|
|
int thread;
|
|
int nGpus;
|
|
int* gpus;
|
|
int localRank;
|
|
int localNumDevices;
|
|
int enable_multiranks;
|
|
int nRanks;
|
|
void** sendbuffs;
|
|
size_t sendBytes;
|
|
size_t sendInplaceOffset;
|
|
void** recvbuffs;
|
|
size_t recvInplaceOffset;
|
|
ncclUniqueId ncclId;
|
|
ncclComm_t* comms;
|
|
hipStream_t* streams;
|
|
|
|
void** expected;
|
|
size_t expectedBytes;
|
|
int* errors;
|
|
double* bw;
|
|
int* bw_count;
|
|
|
|
int reportErrors;
|
|
|
|
struct testColl* collTest;
|
|
};
|
|
|
|
typedef testResult_t (*threadFunc_t)(struct threadArgs* args);
|
|
struct testThread {
|
|
pthread_t thread;
|
|
threadFunc_t func;
|
|
struct threadArgs args;
|
|
testResult_t ret;
|
|
};
|
|
|
|
// Provided by common.cu
|
|
extern void Barrier(struct threadArgs* args);
|
|
extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root);
|
|
extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks);
|
|
extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
|
|
extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
|
|
|
|
#include <unistd.h>
|
|
|
|
static void getHostName(char* hostname, int maxlen) {
|
|
gethostname(hostname, maxlen);
|
|
for (int i=0; i< maxlen; i++) {
|
|
if (hostname[i] == '.') {
|
|
hostname[i] = '\0';
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
#include <stdint.h>
|
|
|
|
static uint64_t getHash(const char* string, size_t n) {
|
|
// Based on DJB2a, result = result * 33 ^ char
|
|
uint64_t result = 5381;
|
|
for (size_t c = 0; c < n; c++) {
|
|
result = ((result << 5) + result) ^ string[c];
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/* Generate a hash of the unique identifying string for this host
|
|
* that will be unique for both bare-metal and container instances
|
|
* Equivalent of a hash of;
|
|
*
|
|
* $(hostname)$(cat /proc/sys/kernel/random/boot_id)
|
|
*
|
|
*/
|
|
#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
|
|
static uint64_t getHostHash(const char* hostname) {
|
|
char hostHash[1024];
|
|
|
|
// Fall back is the hostname if something fails
|
|
(void) strncpy(hostHash, hostname, sizeof(hostHash));
|
|
int offset = strlen(hostHash);
|
|
|
|
FILE *file = fopen(HOSTID_FILE, "r");
|
|
if (file != NULL) {
|
|
char *p;
|
|
if (fscanf(file, "%ms", &p) == 1) {
|
|
strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
|
|
free(p);
|
|
}
|
|
}
|
|
fclose(file);
|
|
|
|
// Make sure the string is terminated
|
|
hostHash[sizeof(hostHash)-1]='\0';
|
|
|
|
return getHash(hostHash, strlen(hostHash));
|
|
}
|
|
|
|
static size_t wordSize(ncclDataType_t type) {
|
|
switch(type) {
|
|
case ncclChar:
|
|
#if NCCL_MAJOR >= 2
|
|
//case ncclInt8:
|
|
case ncclUint8:
|
|
#endif
|
|
return 1;
|
|
case ncclHalf:
|
|
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
|
|
case ncclBfloat16:
|
|
#endif
|
|
//case ncclFloat16:
|
|
return 2;
|
|
case ncclInt:
|
|
case ncclFloat:
|
|
#if NCCL_MAJOR >= 2
|
|
//case ncclInt32:
|
|
case ncclUint32:
|
|
//case ncclFloat32:
|
|
#endif
|
|
return 4;
|
|
case ncclInt64:
|
|
case ncclUint64:
|
|
case ncclDouble:
|
|
//case ncclFloat64:
|
|
return 8;
|
|
default: return 0;
|
|
}
|
|
}
|
|
|
|
extern int test_ncclVersion; // init'd with ncclGetVersion()
|
|
typedef enum { ncclCoarse = 0,
|
|
ncclFine = 1,
|
|
ncclHost = 2,
|
|
ncclManaged = 3,
|
|
nccl_NUM_MTYPES = 4 } ncclMemoryType_t;
|
|
extern const char *test_memorytypes[nccl_NUM_MTYPES];
|
|
constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 1 : 0);
|
|
extern int test_opnum;
|
|
extern int test_typenum;
|
|
extern ncclDataType_t test_types[ncclNumTypes];
|
|
extern const char *test_typenames[ncclNumTypes];
|
|
extern ncclRedOp_t test_ops[];
|
|
extern const char *test_opnames[];
|
|
|
|
static int ncclstringtotype(char *str) {
|
|
for (int t=0; t<ncclNumTypes; t++) {
|
|
if (strcmp(str, test_typenames[t]) == 0) {
|
|
return t;
|
|
}
|
|
}
|
|
if (strcmp(str, "all") == 0) {
|
|
return -1;
|
|
}
|
|
printf("invalid type %s, defaulting to %s .. \n", str, test_typenames[ncclFloat]);
|
|
return ncclFloat;
|
|
}
|
|
|
|
static int ncclstringtoop (char *str) {
|
|
for (int o=0; o<test_opnum; o++) {
|
|
if (strcmp(str, test_opnames[o]) == 0) {
|
|
return o;
|
|
}
|
|
}
|
|
if (strcmp(str, "all") == 0) {
|
|
return -1;
|
|
}
|
|
printf("invalid op %s, defaulting to %s .. \n", str, test_opnames[ncclSum]);
|
|
return ncclSum;
|
|
}
|
|
|
|
static int ncclstringtomtype (char *str) {
|
|
for (int o=0; o<nccl_NUM_MTYPES; o++) {
|
|
if (strcmp(str, test_memorytypes[o]) == 0) {
|
|
return o;
|
|
}
|
|
}
|
|
printf("invalid memorytype %s, defaulting to %s .. \n", str, test_memorytypes[ncclCoarse]);
|
|
return ncclCoarse;
|
|
}
|
|
|
|
extern int is_main_proc;
|
|
extern thread_local int is_main_thread;
|
|
#define PRINT if (is_main_thread) printf
|
|
|
|
#endif
|