Add ncclDataType_t as type to ROCTX (#1512)

This commit is contained in:
Bertan Dogancay
2025-01-30 13:46:48 -05:00
کامیت شده توسط GitHub
والد 6b2b87c9f8
کامیت ecf31da14f
5 فایل‌های تغییر یافته به همراه 86 افزوده شده و 32 حذف شده
+60 -24
مشاهده پرونده
@@ -85,12 +85,18 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size
ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
// Payload attached to the AllGather NVTX range. Members are filled positionally
// by the aggregate initializer and located via offsetof() in AllGatherSchema,
// so their names and order must stay in sync with both.
struct NvtxParamsAllGather {
size_t bytes;  // sendcount * ncclTypeSize(datatype)
ncclDataType_t datatype;  // element type passed by the caller
};
// Just pass the size of one message and not the total bytes sent/received.
constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsAllGather, datatype)}
};
size_t msgsize = sendcount * ncclTypeSize(datatype);
NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize)
NvtxParamsAllGather payload{sendcount * ncclTypeSize(datatype), datatype};
NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, payload)
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
return mscclEnqueueCheck(
@@ -114,14 +120,16 @@ ncclResult_t ncclAllReduce_impl(const void* sendbuff, void* recvbuff, size_t cou
// Payload attached to the AllReduce NVTX range. Members are filled positionally
// by the aggregate initializer and located via offsetof() in AllReduceSchema,
// so their names and order must stay in sync with both.
struct NvtxParamsAllReduce {
size_t bytes;  // count * ncclTypeSize(datatype)
ncclRedOp_t op;  // reduction operation passed by the caller
ncclDataType_t datatype;  // element type passed by the caller
};
// Just pass the size of one message and not the total bytes sent/received.
static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsAllReduce, op)}
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, offsetof(NvtxParamsAllReduce, op)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsAllReduce, datatype)}
};
NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op};
NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op, datatype};
NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload)
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
@@ -145,12 +153,18 @@ NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_
ncclResult_t ncclAllToAll_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclComm_t comm, hipStream_t stream) {
// Payload attached to the AllToAll NVTX range. Members are filled positionally
// by the aggregate initializer and located via offsetof() in AllToAllSchema,
// so their names and order must stay in sync with both.
struct NvtxParamsAllToAll {
size_t bytes;  // count * ncclTypeSize(datatype)
ncclDataType_t datatype;  // element type passed by the caller
};
// Just pass the size of one message and not the total bytes sent/received.
constexpr nvtxPayloadSchemaEntry_t AllToAllSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsAllToAll, datatype)}
};
size_t msgsize = count * ncclTypeSize(datatype);
NVTX3_FUNC_WITH_PARAMS(AllToAll, AllToAllSchema, msgsize)
NvtxParamsAllToAll payload{count * ncclTypeSize(datatype), datatype};
NVTX3_FUNC_WITH_PARAMS(AllToAll, AllToAllSchema, payload)
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
return mscclEnqueueCheck(
@@ -192,13 +206,17 @@ ncclResult_t ncclAllToAllv_impl(const void *sendbuff, const size_t sendcounts[],
// Payload attached to the AllToAllv NVTX range. Members are filled positionally
// by the aggregate initializer and located via offsetof() in AllToAllvSchema,
// so their names and order must stay in sync with both.
struct NvtxParamsAllToAllv {
size_t sendbytes;  // sendcounts[comm->rank] * ncclTypeSize(datatype)
size_t recvbytes;  // recvcounts[comm->rank] * ncclTypeSize(datatype)
ncclDataType_t datatype;  // element type passed by the caller
};
// Just pass the size of one send/recv messages and not the total bytes sent/received.
constexpr nvtxPayloadSchemaEntry_t AllToAllvSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes] (Send)"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes] (Recv)"}
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes] (Recv)", nullptr, 0,
offsetof(NvtxParamsAllToAllv, recvbytes)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsAllToAllv, datatype)}
};
NvtxParamsAllToAllv payload{sendcounts[comm->rank] * ncclTypeSize(datatype), recvcounts[comm->rank] * ncclTypeSize(datatype)};
NvtxParamsAllToAllv payload{sendcounts[comm->rank] * ncclTypeSize(datatype), recvcounts[comm->rank] * ncclTypeSize(datatype), datatype};
NVTX3_FUNC_WITH_PARAMS(AllToAllv, AllToAllvSchema, payload)
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
@@ -238,12 +256,15 @@ ncclResult_t ncclBroadcast_impl(const void* sendbuff, void* recvbuff, size_t cou
// Payload attached to the Broadcast NVTX range. Members are filled positionally
// by the aggregate initializer and located via offsetof() in BroadcastSchema,
// so their names and order must stay in sync with both.
struct NvtxParamsBroadcast {
size_t bytes;  // count * ncclTypeSize(datatype)
int root;  // root argument passed by the caller
ncclDataType_t datatype;  // element type passed by the caller
};
constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)}
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsBroadcast, datatype)}
};
NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root};
NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root, datatype};
NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload)
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
@@ -275,12 +296,15 @@ ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t sendco
// Payload attached to the Gather NVTX range. Members are filled positionally
// by the aggregate initializer and located via offsetof() in GatherSchema,
// so their names and order must stay in sync with both.
struct NvtxParamsGather {
size_t bytes;  // sendcount * ncclTypeSize(datatype)
int root;  // root argument passed by the caller
ncclDataType_t datatype;  // element type passed by the caller
};
constexpr nvtxPayloadSchemaEntry_t GatherSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsGather, root)}
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsGather, root)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsGather, datatype)}
};
NvtxParamsGather payload{sendcount * ncclTypeSize(datatype), root};
NvtxParamsGather payload{sendcount * ncclTypeSize(datatype), root, datatype};
NVTX3_FUNC_WITH_PARAMS(Gather, GatherSchema, payload)
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
@@ -314,14 +338,17 @@ ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
size_t bytes;
int root;
ncclRedOp_t op;
ncclDataType_t datatype;
};
constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsReduce, op)}
offsetof(NvtxParamsReduce, op)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsReduce, datatype)}
};
NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op};
NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op, datatype};
NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
@@ -346,13 +373,16 @@ ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t
// Payload attached to the ReduceScatter NVTX range. Members are filled
// positionally by the aggregate initializer and located via offsetof() in
// ReduceScatterSchema, so their names and order must stay in sync with both.
struct NvtxParamsReduceScatter {
size_t bytes;  // recvcount * ncclTypeSize(datatype)
ncclRedOp_t op;  // reduction operation passed by the caller
ncclDataType_t datatype;  // element type passed by the caller
};
constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsReduceScatter, op)}
offsetof(NvtxParamsReduceScatter, op)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsReduceScatter, datatype)}
};
NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op};
NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op, datatype};
NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
@@ -377,12 +407,15 @@ ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvc
// Payload attached to the Scatter NVTX range. Members are filled positionally
// by the aggregate initializer and located via offsetof() in ScatterSchema,
// so their names and order must stay in sync with both.
struct NvtxParamsScatter {
size_t bytes;  // recvcount * ncclTypeSize(datatype)
int root;  // root argument passed by the caller
ncclDataType_t datatype;  // element type passed by the caller
};
constexpr nvtxPayloadSchemaEntry_t ScatterSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsScatter, root)}
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsScatter, root)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsScatter, datatype)}
};
NvtxParamsScatter payload{recvcount * ncclTypeSize(datatype), root};
NvtxParamsScatter payload{recvcount * ncclTypeSize(datatype), root, datatype};
NVTX3_FUNC_WITH_PARAMS(Scatter, ScatterSchema, payload)
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
@@ -410,10 +443,13 @@ ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvc
// Payload shared by the Send and Recv NVTX ranges (both ncclSend_impl and
// ncclRecv_impl build one). Members are filled positionally by the aggregate
// initializer and located via offsetof() in SendRecvSchema, so their names and
// order must stay in sync with both.
struct NvtxParamsSendRecv {
size_t bytes;  // count * ncclTypeSize(datatype)
int peer;  // peer rank passed by the caller
ncclDataType_t datatype;  // element type passed by the caller
};
constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)}
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsSendRecv, datatype)}
};
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
@@ -422,7 +458,7 @@ NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataTyp
ncclResult_t ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer, datatype};
NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
@@ -447,7 +483,7 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da
ncclResult_t ncclRecv_impl(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer, datatype};
NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
@@ -254,6 +254,11 @@
#define NVTX_PAYLOAD_ENTRY_TYPE_BF16 50
#define NVTX_PAYLOAD_ENTRY_TYPE_TF32 52
/**
* Payload entry carrying a collective's element data type (ncclDataType_t).
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE 53
/**
* Data types are as defined by NVTXv3 core.
*/
+3
مشاهده پرونده
@@ -32,6 +32,7 @@ enum roctxPayloadEntryType {
ROCTX_PAYLOAD_ENTRY_TYPE_INT,
ROCTX_PAYLOAD_ENTRY_TYPE_SIZE,
ROCTX_PAYLOAD_ENTRY_TYPE_REDOP,
ROCTX_PAYLOAD_ENTRY_TYPE_DATATYPE,
ROCTX_PAYLOAD_NUM_ENTRY_TYPES
};
@@ -58,6 +59,7 @@ struct roctxPayloadSchemaEntryInfo {
int typeInt;
size_t typeSize;
ncclDevRedOp_t typeRedOp;
ncclDataType_t typeDataType;
} payload;
};
@@ -88,6 +90,7 @@ typedef roctxPayloadInfo* roctxPayloadInfo_t;
extern const char* roctxEntryTypeStr[ROCTX_PAYLOAD_NUM_ENTRY_TYPES];
extern const char* ncclRedOpStr[ncclNumDevRedOps];
extern const char* ncclDataTypeStr[ncclNumTypes];
/**
* \brief Maps nvtx types to roctx types.
+13 -6
مشاهده پرونده
@@ -11,10 +11,12 @@
std::map<uint64_t, roctxPayloadEntryType> nvtxToRoctx {
{NVTX_PAYLOAD_ENTRY_TYPE_INT, ROCTX_PAYLOAD_ENTRY_TYPE_INT},
{NVTX_PAYLOAD_ENTRY_TYPE_SIZE, ROCTX_PAYLOAD_ENTRY_TYPE_SIZE},
{NVTX_PAYLOAD_ENTRY_TYPE_REDOP, ROCTX_PAYLOAD_ENTRY_TYPE_REDOP}};
{NVTX_PAYLOAD_ENTRY_TYPE_REDOP, ROCTX_PAYLOAD_ENTRY_TYPE_REDOP},
{NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, ROCTX_PAYLOAD_ENTRY_TYPE_DATATYPE}};
// Printable names for roctxPayloadEntryType, indexed by enumerator value.
// There must be exactly one string per enumerator before
// ROCTX_PAYLOAD_NUM_ENTRY_TYPES: a missing initializer leaves a null pointer
// in the table, which is unsafe to hand to a "%s" format.
const char* roctxEntryTypeStr[ROCTX_PAYLOAD_NUM_ENTRY_TYPES] = {"ROCTX_PAYLOAD_ENTRY_TYPE_INT", "ROCTX_PAYLOAD_ENTRY_TYPE_SIZE", "ROCTX_PAYLOAD_ENTRY_TYPE_REDOP", "ROCTX_PAYLOAD_ENTRY_TYPE_DATATYPE"};
const char* ncclRedOpStr[ncclNumDevRedOps] = { "Sum", "Prod", "MinMax", "PreMulSum", "SumPostDiv" };
const char* ncclRedOpStr[ncclNumDevRedOps] = {"Sum", "Prod", "MinMax", "PreMulSum", "SumPostDiv"};
// Short display names for ncclDataType_t, indexed by the enum's numeric value.
// stringify() only indexes this table when the value is below ncclNumTypes and
// prints "unknown" otherwise.
const char* ncclDataTypeStr[ncclNumTypes] = {"i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64", "b16", "f8", "b8"};
void roctxAlloc(roctxPayloadInfo_t payloadInfo, const size_t numEntries) {
// Allocate enough memory for numEntries in payloadEntries
@@ -50,10 +52,11 @@ void extractPayloadInfo(const nvtxPayloadSchemaEntry_t* schema, const nvtxPayloa
// Populate payload union based on the roctx type
switch (payloadInfo->payloadEntries[i].type) {
case ROCTX_PAYLOAD_ENTRY_TYPE_INT: payloadInfo->payloadEntries[i].payload.typeInt = *reinterpret_cast<const int*>(entryData); break;
case ROCTX_PAYLOAD_ENTRY_TYPE_SIZE: payloadInfo->payloadEntries[i].payload.typeSize = *reinterpret_cast<const size_t*>(entryData); break;
case ROCTX_PAYLOAD_ENTRY_TYPE_REDOP: payloadInfo->payloadEntries[i].payload.typeRedOp = *reinterpret_cast<const ncclDevRedOp_t*>(entryData); break;
default: break;
case ROCTX_PAYLOAD_ENTRY_TYPE_INT: payloadInfo->payloadEntries[i].payload.typeInt = *reinterpret_cast<const int*>(entryData); break;
case ROCTX_PAYLOAD_ENTRY_TYPE_SIZE: payloadInfo->payloadEntries[i].payload.typeSize = *reinterpret_cast<const size_t*>(entryData); break;
case ROCTX_PAYLOAD_ENTRY_TYPE_REDOP: payloadInfo->payloadEntries[i].payload.typeRedOp = *reinterpret_cast<const ncclDevRedOp_t*>(entryData); break;
case ROCTX_PAYLOAD_ENTRY_TYPE_DATATYPE: payloadInfo->payloadEntries[i].payload.typeDataType = *reinterpret_cast<const ncclDataType_t*>(entryData); break;
default: break;
}
}
@@ -84,6 +87,10 @@ void stringify(roctxPayloadInfo_t payloadInfo) {
offset += snprintf(payloadInfo->message + offset, MAX_MESSAGE_LENGTH - offset, "%s",
entry.payload.typeRedOp < ncclNumDevRedOps ? ncclRedOpStr[entry.payload.typeRedOp] : "unknown");
break;
case ROCTX_PAYLOAD_ENTRY_TYPE_DATATYPE:
offset += snprintf(payloadInfo->message + offset, MAX_MESSAGE_LENGTH - offset, "%s",
entry.payload.typeDataType < ncclNumTypes ? ncclDataTypeStr[entry.payload.typeDataType] : "unknown");
break;
default:
offset += snprintf(payloadInfo->message + offset, MAX_MESSAGE_LENGTH - offset, "unknown roctx payload type");
break;
+5 -2
مشاهده پرونده
@@ -48,14 +48,17 @@ ncclResult_t mscclRunAlgo_impl(
// Payload attached to the MSCCL NVTX range. Members are filled positionally by
// the aggregate initializer and located via offsetof() in MscclSchema, so
// their names and order must stay in sync with both.
struct NvtxParamsMsccl {
size_t bytes;  // count * ncclTypeSize(dataType)
ncclRedOp_t op;  // reduction operation passed by the caller
ncclDataType_t dataType;  // element type passed by the caller
};
// Just pass the size of one send/recv messages and not the total bytes sent/received.
constexpr nvtxPayloadSchemaEntry_t MscclSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsMsccl, op)}
offsetof(NvtxParamsMsccl, op)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsMsccl, dataType)}
};
NvtxParamsMsccl payload{count * ncclTypeSize(dataType), op};
NvtxParamsMsccl payload{count * ncclTypeSize(dataType), op, dataType};
NVTX3_FUNC_WITH_PARAMS(MSCCL, MscclSchema, payload)
mscclStatus& status = mscclGetStatus(comm->rank);