From ecf31da14f2ef796c200bde67e33d2d2779b14e6 Mon Sep 17 00:00:00 2001 From: Bertan Dogancay <111835151+BertanDogancay@users.noreply.github.com> Date: Thu, 30 Jan 2025 13:46:48 -0500 Subject: [PATCH] Add ncclDataType_t as type to ROCTX (#1512) --- src/collectives.cc | 84 +++++++++++++++++++-------- src/include/nvtx3/nvToolsExtPayload.h | 5 ++ src/include/roctx.h | 3 + src/misc/roctx.cc | 19 ++++-- src/msccl.cc | 7 ++- 5 files changed, 86 insertions(+), 32 deletions(-) diff --git a/src/collectives.cc b/src/collectives.cc index 2d8ffcb008..8e840b4a1d 100644 --- a/src/collectives.cc +++ b/src/collectives.cc @@ -85,12 +85,18 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + struct NvtxParamsAllGather { + size_t bytes; + ncclDataType_t datatype; + }; // Just pass the size of one message and not the total bytes sent/received. constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"} + {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, + offsetof(NvtxParamsAllGather, datatype)} }; - size_t msgsize = sendcount * ncclTypeSize(datatype); - NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize) + NvtxParamsAllGather payload{sendcount * ncclTypeSize(datatype), datatype}; + NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, payload) if (mscclAvailable(comm->rank) && !mscclIsCaller()) { return mscclEnqueueCheck( @@ -114,14 +120,16 @@ ncclResult_t ncclAllReduce_impl(const void* sendbuff, void* recvbuff, size_t cou struct NvtxParamsAllReduce { size_t bytes; ncclRedOp_t op; + ncclDataType_t datatype; }; // Just pass the size of one message and not the total bytes sent/received. 
static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsAllReduce, op)} + {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, offsetof(NvtxParamsAllReduce, op)}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, + offsetof(NvtxParamsAllReduce, datatype)} }; - NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op}; + NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op, datatype}; NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload) if (mscclAvailable(comm->rank) && !mscclIsCaller()) { @@ -145,12 +153,18 @@ NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_ ncclResult_t ncclAllToAll_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) { + struct NvtxParamsAllToAll { + size_t bytes; + ncclDataType_t datatype; + }; // Just pass the size of one message and not the total bytes sent/received. 
constexpr nvtxPayloadSchemaEntry_t AllToAllSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"} + {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, + offsetof(NvtxParamsAllToAll, datatype)} }; - size_t msgsize = count * ncclTypeSize(datatype); - NVTX3_FUNC_WITH_PARAMS(AllToAll, AllToAllSchema, msgsize) + NvtxParamsAllToAll payload{count * ncclTypeSize(datatype), datatype}; + NVTX3_FUNC_WITH_PARAMS(AllToAll, AllToAllSchema, payload) if (mscclAvailable(comm->rank) && !mscclIsCaller()) { return mscclEnqueueCheck( @@ -192,13 +206,17 @@ ncclResult_t ncclAllToAllv_impl(const void *sendbuff, const size_t sendcounts[], struct NvtxParamsAllToAllv { size_t sendbytes; size_t recvbytes; + ncclDataType_t datatype; }; // Just pass the size of one send/recv messages and not the total bytes sent/received. constexpr nvtxPayloadSchemaEntry_t AllToAllvSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes] (Send)"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes] (Recv)"} + {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes] (Recv)", nullptr, 0, + offsetof(NvtxParamsAllToAllv, recvbytes)}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, + offsetof(NvtxParamsAllToAllv, datatype)} }; - NvtxParamsAllToAllv payload{sendcounts[comm->rank] * ncclTypeSize(datatype), recvcounts[comm->rank] * ncclTypeSize(datatype)}; + NvtxParamsAllToAllv payload{sendcounts[comm->rank] * ncclTypeSize(datatype), recvcounts[comm->rank] * ncclTypeSize(datatype), datatype}; NVTX3_FUNC_WITH_PARAMS(AllToAllv, AllToAllvSchema, payload) if (mscclAvailable(comm->rank) && !mscclIsCaller()) { @@ -238,12 +256,15 @@ ncclResult_t ncclBroadcast_impl(const void* sendbuff, void* recvbuff, size_t cou struct NvtxParamsBroadcast { size_t bytes; int root; + ncclDataType_t datatype; }; constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, - {0, 
NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)} + {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, + offsetof(NvtxParamsBroadcast, datatype)} }; - NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root}; + NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root, datatype}; NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload) if (mscclAvailable(comm->rank) && !mscclIsCaller()) { @@ -275,12 +296,15 @@ ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t sendco struct NvtxParamsGather { size_t bytes; int root; + ncclDataType_t datatype; }; constexpr nvtxPayloadSchemaEntry_t GatherSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsGather, root)} + {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsGather, root)}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, + offsetof(NvtxParamsGather, datatype)} }; - NvtxParamsGather payload{sendcount * ncclTypeSize(datatype), root}; + NvtxParamsGather payload{sendcount * ncclTypeSize(datatype), root, datatype}; NVTX3_FUNC_WITH_PARAMS(Gather, GatherSchema, payload) if (mscclAvailable(comm->rank) && !mscclIsCaller()) { @@ -314,14 +338,17 @@ ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count, size_t bytes; int root; ncclRedOp_t op; + ncclDataType_t datatype; }; constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)}, {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsReduce, op)} + offsetof(NvtxParamsReduce, op)}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, + offsetof(NvtxParamsReduce, datatype)} }; 
- NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op}; + NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op, datatype}; NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload) if (mscclAvailable(comm->rank) && !mscclIsCaller()) { @@ -346,13 +373,16 @@ ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t struct NvtxParamsReduceScatter { size_t bytes; ncclRedOp_t op; + ncclDataType_t datatype; }; constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsReduceScatter, op)} + offsetof(NvtxParamsReduceScatter, op)}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, + offsetof(NvtxParamsReduceScatter, datatype)} }; - NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op}; + NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op, datatype}; NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload) if (mscclAvailable(comm->rank) && !mscclIsCaller()) { @@ -377,12 +407,15 @@ ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvc struct NvtxParamsScatter { size_t bytes; int root; + ncclDataType_t datatype; }; constexpr nvtxPayloadSchemaEntry_t ScatterSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsScatter, root)} + {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsScatter, root)}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, + offsetof(NvtxParamsScatter, datatype)} }; - NvtxParamsScatter payload{recvcount * ncclTypeSize(datatype), root}; + NvtxParamsScatter payload{recvcount * ncclTypeSize(datatype), root, datatype}; NVTX3_FUNC_WITH_PARAMS(Scatter, ScatterSchema, payload) if (mscclAvailable(comm->rank) && !mscclIsCaller()) { @@ -410,10 +443,13 @@ ncclResult_t 
ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvc struct NvtxParamsSendRecv { size_t bytes; int peer; + ncclDataType_t datatype; }; constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)} + {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, + offsetof(NvtxParamsSendRecv, datatype)} }; NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, @@ -422,7 +458,7 @@ NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataTyp ncclResult_t ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) { - NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; + NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer, datatype}; NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload) if (mscclAvailable(comm->rank) && !mscclIsCaller()) { @@ -447,7 +483,7 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da ncclResult_t ncclRecv_impl(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) { - NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; + NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer, datatype}; NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload) if (mscclAvailable(comm->rank) && !mscclIsCaller()) { diff --git a/src/include/nvtx3/nvToolsExtPayload.h b/src/include/nvtx3/nvToolsExtPayload.h index c775738b18..3fd40badfb 100644 --- a/src/include/nvtx3/nvToolsExtPayload.h +++ b/src/include/nvtx3/nvToolsExtPayload.h @@ -254,6 +254,11 @@ #define NVTX_PAYLOAD_ENTRY_TYPE_BF16 50 #define NVTX_PAYLOAD_ENTRY_TYPE_TF32 52 +/** + * Generic data type + */ +#define 
NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE 53 + /** * Data types are as defined by NVTXv3 core. */ diff --git a/src/include/roctx.h b/src/include/roctx.h index 2ffafcb9ef..fc5858f9fb 100644 --- a/src/include/roctx.h +++ b/src/include/roctx.h @@ -32,6 +32,7 @@ enum roctxPayloadEntryType { ROCTX_PAYLOAD_ENTRY_TYPE_INT, ROCTX_PAYLOAD_ENTRY_TYPE_SIZE, ROCTX_PAYLOAD_ENTRY_TYPE_REDOP, + ROCTX_PAYLOAD_ENTRY_TYPE_DATATYPE, ROCTX_PAYLOAD_NUM_ENTRY_TYPES }; @@ -58,6 +59,7 @@ struct roctxPayloadSchemaEntryInfo { int typeInt; size_t typeSize; ncclDevRedOp_t typeRedOp; + ncclDataType_t typeDataType; } payload; }; @@ -88,6 +90,7 @@ typedef roctxPayloadInfo* roctxPayloadInfo_t; extern const char* roctxEntryTypeStr[ROCTX_PAYLOAD_NUM_ENTRY_TYPES]; extern const char* ncclRedOpStr[ncclNumDevRedOps]; +extern const char* ncclDataTypeStr[ncclNumTypes]; /** * \brief Maps nvtx types to roctx types. diff --git a/src/misc/roctx.cc b/src/misc/roctx.cc index 9c34b4ccf5..16d68bb47d 100644 --- a/src/misc/roctx.cc +++ b/src/misc/roctx.cc @@ -11,10 +11,12 @@ std::map nvtxToRoctx { {NVTX_PAYLOAD_ENTRY_TYPE_INT, ROCTX_PAYLOAD_ENTRY_TYPE_INT}, {NVTX_PAYLOAD_ENTRY_TYPE_SIZE, ROCTX_PAYLOAD_ENTRY_TYPE_SIZE}, - {NVTX_PAYLOAD_ENTRY_TYPE_REDOP, ROCTX_PAYLOAD_ENTRY_TYPE_REDOP}}; + {NVTX_PAYLOAD_ENTRY_TYPE_REDOP, ROCTX_PAYLOAD_ENTRY_TYPE_REDOP}, + {NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, ROCTX_PAYLOAD_ENTRY_TYPE_DATATYPE}}; -const char* roctxEntryTypeStr[ROCTX_PAYLOAD_NUM_ENTRY_TYPES] = {"ROCTX_PAYLOAD_ENTRY_TYPE_INT", "ROCTX_PAYLOAD_ENTRY_TYPE_SIZE", "ROCTX_PAYLOAD_ENTRY_TYPE_REDOP"}; +const char* roctxEntryTypeStr[ROCTX_PAYLOAD_NUM_ENTRY_TYPES] = {"ROCTX_PAYLOAD_ENTRY_TYPE_INT", "ROCTX_PAYLOAD_ENTRY_TYPE_SIZE", "ROCTX_PAYLOAD_ENTRY_TYPE_REDOP", "ROCTX_PAYLOAD_ENTRY_TYPE_DATATYPE"}; /* sized by ROCTX_PAYLOAD_NUM_ENTRY_TYPES: keep in step with enum roctxPayloadEntryType, otherwise trailing slots are null pointers */ -const char* ncclRedOpStr[ncclNumDevRedOps] = { "Sum", "Prod", "MinMax", "PreMulSum", "SumPostDiv" }; +const char* ncclRedOpStr[ncclNumDevRedOps] = {"Sum", "Prod", "MinMax", "PreMulSum", "SumPostDiv"}; +const char* ncclDataTypeStr[ncclNumTypes] = {"i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64", "b16", "f8", "b8"}; void roctxAlloc(roctxPayloadInfo_t payloadInfo, const size_t numEntries) { // Allocate enough memory for numEntries 
in payloadEntries @@ -50,10 +52,11 @@ void extractPayloadInfo(const nvtxPayloadSchemaEntry_t* schema, const nvtxPayloa // Populate payload union based on the roctx type switch (payloadInfo->payloadEntries[i].type) { - case ROCTX_PAYLOAD_ENTRY_TYPE_INT: payloadInfo->payloadEntries[i].payload.typeInt = *reinterpret_cast(entryData); break; - case ROCTX_PAYLOAD_ENTRY_TYPE_SIZE: payloadInfo->payloadEntries[i].payload.typeSize = *reinterpret_cast(entryData); break; - case ROCTX_PAYLOAD_ENTRY_TYPE_REDOP: payloadInfo->payloadEntries[i].payload.typeRedOp = *reinterpret_cast(entryData); break; - default: break; + case ROCTX_PAYLOAD_ENTRY_TYPE_INT: payloadInfo->payloadEntries[i].payload.typeInt = *reinterpret_cast(entryData); break; + case ROCTX_PAYLOAD_ENTRY_TYPE_SIZE: payloadInfo->payloadEntries[i].payload.typeSize = *reinterpret_cast(entryData); break; + case ROCTX_PAYLOAD_ENTRY_TYPE_REDOP: payloadInfo->payloadEntries[i].payload.typeRedOp = *reinterpret_cast(entryData); break; + case ROCTX_PAYLOAD_ENTRY_TYPE_DATATYPE: payloadInfo->payloadEntries[i].payload.typeDataType = *reinterpret_cast(entryData); break; + default: break; } } @@ -84,6 +87,10 @@ void stringify(roctxPayloadInfo_t payloadInfo) { offset += snprintf(payloadInfo->message + offset, MAX_MESSAGE_LENGTH - offset, "%s", entry.payload.typeRedOp < ncclNumDevRedOps ? ncclRedOpStr[entry.payload.typeRedOp] : "unknown"); break; + case ROCTX_PAYLOAD_ENTRY_TYPE_DATATYPE: + offset += snprintf(payloadInfo->message + offset, MAX_MESSAGE_LENGTH - offset, "%s", + entry.payload.typeDataType < ncclNumTypes ? 
ncclDataTypeStr[entry.payload.typeDataType] : "unknown"); + break; default: offset += snprintf(payloadInfo->message + offset, MAX_MESSAGE_LENGTH - offset, "unknown roctx payload type"); break; diff --git a/src/msccl.cc b/src/msccl.cc index 19b98a8af5..2fca78f23c 100644 --- a/src/msccl.cc +++ b/src/msccl.cc @@ -48,14 +48,17 @@ ncclResult_t mscclRunAlgo_impl( struct NvtxParamsMsccl { size_t bytes; ncclRedOp_t op; + ncclDataType_t dataType; }; // Just pass the size of one send/recv messages and not the total bytes sent/received. constexpr nvtxPayloadSchemaEntry_t MscclSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsMsccl, op)} + offsetof(NvtxParamsMsccl, op)}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, + offsetof(NvtxParamsMsccl, dataType)} }; - NvtxParamsMsccl payload{count * ncclTypeSize(dataType), op}; + NvtxParamsMsccl payload{count * ncclTypeSize(dataType), op, dataType}; NVTX3_FUNC_WITH_PARAMS(MSCCL, MscclSchema, payload) mscclStatus& status = mscclGetStatus(comm->rank);