Dateien
rocm-systems/ext-profiler/inspector/inspector.h
T

199 Zeilen
5.3 KiB
C

2025-09-02 13:21:14 -07:00
#pragma once
#include <pthread.h>
#include "json.h"
#include "common.h"
#include "version.h"
#define MAX_CHANNELS 64
#define INS_CHK_GOTO(call, res, label) \
do { \
res = call; \
if (inspectorSuccess != res) { \
INFO(NCCL_INSPECTOR, "%s:%d -> error %d: %s", __FILE__, __LINE__, res, \
inspectorErrorString(res)); \
goto label; \
} \
} while (0);
typedef enum {
ncclFuncBroadcast = 0,
ncclFuncReduce = 1,
ncclFuncAllGather = 2,
ncclFuncReduceScatter = 3,
ncclFuncAllReduce = 4,
ncclFuncSendRecv = 5,
ncclFuncSend = 6,
ncclFuncRecv = 7,
ncclNumFuncs = 8
} ncclFunc_t;
typedef enum {
inspectorSuccess = 0,
inspectorUninitializedError,
inspectorMemoryError,
inspectorFileOpenError,
inspectorDisabledError,
inspectorLockError,
inspectorPthreadError,
inspectorJsonError,
inspectorCudaError,
inspectorBadHash,
inspectorDeleteUnknownCommError,
inspectorAddDuplicateCommError,
inspectorNop,
inspectorNullTally,
inspectorGlobalInitError,
inspectorReturn,
} inspectorResult_t;
typedef enum {
inspectorTimingSourceKernelGpu = 0,
inspectorTimingSourceKernelCpu = 1,
inspectorTimingSourceCollectiveCpu = 2,
} inspectorTimingSource_t;
struct inspectorEventTraceInfo {
uint64_t ts;
uint64_t sn;
};
typedef enum {
NCCL_INSP_EVT_TRK_COLL_START = 0,
NCCL_INSP_EVT_TRK_COLL_STOP = 1,
NCCL_INSP_EVT_TRK_COLL_NEVT = 2,
} inspectorEventTrkColl_t;
typedef enum {
NCCL_INSP_EVT_TRK_KERNEL_START = 0,
NCCL_INSP_EVT_TRK_KERNEL_STOP = 1,
NCCL_INSP_EVT_TRK_KERNEL_RECORD = 2,
NCCL_INSP_EVT_TRK_KERNEL_NEVT = 3,
} inspectorEventTrkKernel_t;
struct inspectorEventTrkKernelInfo {
struct inspectorEventTraceInfo evntTrace[NCCL_INSP_EVT_TRK_KERNEL_NEVT];
};
struct inspectorEventTrkCollInfo {
int sn;
uint32_t nChannels;
struct inspectorEventTraceInfo evntTrace[NCCL_INSP_EVT_TRK_COLL_NEVT];
struct inspectorEventTrkKernelInfo kernelCh[MAX_CHANNELS];
};
struct inspectorCompletedCollInfo {
ncclFunc_t func;
uint64_t sn;
size_t msgSizeBytes;
uint64_t execTimeUsecs;
inspectorTimingSource_t timingSource;
double algoBwGbs;
double busBwGbs;
// Event trace information
struct inspectorEventTrkCollInfo collEvtTrk;
};
enum {
NCCL_COMM_HASH_LENGTH = 17
};
struct inspectorCommInfo {
struct inspectorCommInfo* next;
const char* commName;
uint64_t commHash;
char commHashStr[NCCL_COMM_HASH_LENGTH];
int rank;
int nranks;
int nnodes;
bool dump;
struct inspectorCompletedCollInfo completedCollInfo;
pthread_rwlock_t guard;
};
struct inspectorKernelChInfo {
uint64_t type;
int refCount; /*unused*/
struct inspectorCollInfo *collInfo;
uint8_t channelId;
uint64_t tsStartUsec;
uint64_t tsCompletedUsec;
uint64_t startGpuClk;
uint64_t stopGpuClk;
};
struct inspectorCollInfo {
uint64_t type;
int refCount;
struct inspectorCommInfo *commInfo;
const char* func;
uint64_t sn;
size_t msgSizeBytes;
uint64_t tsStartUsec;
uint64_t tsCompletedUsec;
uint32_t nChannels;
uint32_t nKernelChStarted;
uint32_t nKernelChCompleted;
pthread_rwlock_t guard;
struct inspectorKernelChInfo kernelCh[MAX_CHANNELS];
struct inspectorEventTrkCollInfo collEvtTrk;
};
extern ncclDebugLogger_t logFn;
#define VERSION(...) logFn(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
#define WARN(...) logFn(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
inline int ncclTypeSize(ncclDataType_t type) {
switch (type) {
case ncclInt8:
case ncclUint8:
case ncclFloat8e4m3:
case ncclFloat8e5m2:
return 1;
case ncclFloat16:
case ncclBfloat16:
return 2;
case ncclInt32:
case ncclUint32:
case ncclFloat32:
return 4;
case ncclInt64:
case ncclUint64:
case ncclFloat64:
return 8;
default:
return -1;
}
}
const char* inspectorErrorString(inspectorResult_t result);
inspectorResult_t inspectorLockInit(pthread_rwlock_t* lockRef);
inspectorResult_t inspectorLockDestroy(pthread_rwlock_t* lockRef);
inspectorResult_t inspectorLockRd(pthread_rwlock_t* lockRef);
inspectorResult_t inspectorLockWr(pthread_rwlock_t* lockRef);
inspectorResult_t inspectorUnlockRWLock(pthread_rwlock_t* lockRef);
inspectorResult_t inspectorGlobalInit(int rank);
inspectorResult_t inspectorGlobalFinalize();
uint64_t inspectorGetTime();
inspectorResult_t inspectorAddComm(struct inspectorCommInfo **commInfo,
const char* commName, uint64_t commHash,
int nNodes, int nranks, int rank);
inspectorResult_t inspectorDelComm(struct inspectorCommInfo *commInfo);
void inspectorUpdateCollPerf(struct inspectorCompletedCollInfo *completedColl,
struct inspectorCollInfo *collInfo);
ncclDataType_t inspectorStringToDatatype(const char* str);
void inspectorComputeCollBw(struct inspectorCommInfo *commInfo,
struct inspectorCompletedCollInfo *completedColl,
ncclFunc_t collType);