diff --git a/CMakeLists.txt b/CMakeLists.txt index 1c8ee6e26a..789d558ab6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -420,6 +420,7 @@ set(SRC_FILES src/group.cc src/init.cc src/init_nvtx.cc + src/mnnvl.cc src/net.cc src/msccl.cc src/proxy.cc @@ -486,6 +487,7 @@ set(SRC_FILES src/include/ibvwrap.h src/include/info.h src/include/ipcsocket.h + src/include/mnnvl.h src/include/nccl_common.h src/include/nccl_net.h src/include/nccl_profiler.h @@ -494,6 +496,7 @@ set(SRC_FILES src/include/net.h src/include/nvmlwrap.h src/include/nvtx.h + src/include/nvtx_payload_schemas.h src/include/nvtx_stub.h src/include/p2p.h src/include/param.h diff --git a/ext-profiler/README.md b/ext-profiler/README.md new file mode 100644 index 0000000000..7ef44b2fa4 --- /dev/null +++ b/ext-profiler/README.md @@ -0,0 +1,318 @@ +# NCCL Profiler Plugin Documentation + +This page describes the NCCL Profiler plugin API and how to implement a profiler plugin for NCCL. + +# Overview + +To allow NCCL to better integrate with DL frameworks, NCCL v2.23 introduced a profiler plugin +interface. Any NCCL user can write profiler plugins to extract performance data from NCCL and +use it for debugging and analysis. + +Similarly to other plugins (e.g., network plugin), the profiler plugins come as a shared library +called `libnccl-profiler.so`. That shared library contains one or more implementations of the +NCCL PROFILER API, in the form of versioned structs, filled with pointers to all required +functions. + +# Plugin architecture + +## Plugin name and supporting multiple profiler plugins + +When NCCL is initialized, it will look for a `libnccl-profiler.so` library and dynamically load +it, then look for symbols inside the library. + +The `NCCL_PROFILER_PLUGIN` environment variable allows multiple plugins to coexist. If set, NCCL +will look for a library with a name of `libnccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. 
It is therefore +advised to name the library following that pattern, with a symlink pointing `libnccl-profiler.so` +to `libnccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. That way, if there are multiple plugins in the +path, setting `NCCL_PROFILER_PLUGIN` will allow users to select the right plugin. Alternatively, +the user can also set `NCCL_PROFILER_PLUGIN` to the pathname of the `libnccl-profiler.so` library. + +## Struct versioning + +Once a library is found, NCCL will look for a symbol named `ncclProfiler_vX`, with `X` increasing +over time. The versioning ensures that the plugin and the NCCL core are compatible. + +Plugins are encouraged to provide multiple of those symbols, implementing multiple versions of the +NCCL PROFILER API, so that the same plugin can be compiled and support a wide range of NCCL versions. + +Conversely, and to ease transition, NCCL can choose to support different plugin versions, looking +for the latest ncclProfiler struct version, but also looking for older ones so that older plugins +would still work. + +## Headers management + +To help users build plugins effortlessly, plugins should copy the `ncclProfiler_vX` definitions +they support to their internal includes. An example is shown in `ext-profiler/example` where we +keep all headers in the `nccl/` directory and provide thin layers to implement old version on top +of newer ones. + +The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions +from old API versions. It also provides error codes in `err.h`. + +# API (v2) + +Below is the main `ncclProfiler_v2` struct. Each function is explained in later sections. 
+ +``` +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); + + // stopEvent - stop/finalize an event inside an event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v2_t; +``` + +## Error codes + +As a rule of thumb, profiler generated errors should not be propagated to NCCL and alter its normal +functioning. Nevertheless, the profiler interface returns NCCL error codes, in case any need for +them arises in the future. For now, any profiler interface call should only return `ncclSuccess`. +The only exception is `init` that can return an error so that NCCL can disable the plugin. 
+ +## Operation overview + +NCCL will call the `init` function first for every new communicator that is initialized. The profiler +returns an opaque context handle that is used to isolate profiler instances across communicators. +Similarly, NCCL will call `finalize` to destroy the profiler context, thus freeing resources. + +The NCCL core code is instrumented with calls to `startEvent`, `stopEvent` and `recordEventState`. +These are used to start, stop and update events in the profiler, respectively. + +## API Functions + +### Initialization + +#### name + +The `name` field should point to a character string with the name of the profiler plugin. This will +be used for all logging, especially when `NCCL_DEBUG=INFO` is set. + +#### init + +As soon as NCCL finds the plugin and the correct ncclProfiler symbol, it calls its `init` function. +This allows the plugin to initialize its internal context, used during profiling of NCCL events. +If the `init` function does not return `ncclSuccess`, NCCL disables the plugin. + +#### finalize + +When the profiler is no longer needed, a call to `finalize` destroys the profiler context and frees +up resources. + +### Profiling + +#### startEvent + +When NCCL needs to start profiling a new event it calls `startEvent`. `startEvent` takes the profiler +context, previously created by `init`, an event descriptor of type `ncclProfilerEventDescr_t` and +returns an opaque profiler event handle that can be passed to other profiler functions, as discussed +later in the document. + + +The event descriptor contains all the event metadata. Every event type has its own descriptor. Below +is the `ncclProfilerEventDescr_t` struct. + +``` +typedef struct { + uint8_t type; // event type (e.g., ncclProfileGroup, ncclProfileColl, ...) 
+ void* parentObj; // pointer to parent event used to expose the event hierarchy to the profiler + int rank; // rank that generated the event + union { + struct { // collective events metadata + const char* name; // string containing name of the communicator + uint64_t commHash; // unique hash/id for the communicator + uint64_t seqNumber; // sequence number of this collective operation in the communicator + const char* func; // string containing name of the collective + void const* sendBuff; // address of send buffer + void* recvBuff; // address of recv buffer + size_t count; // data count + int root; // root rank + const char* datatype; // string containing the name of the datatype + size_t trafficBytes; // number of transfer bytes + uint8_t nMaxChannels; // max number of channels for this collective + uint8_t nWarps; // number of GPU warps for this collective + const char* algo; // string containing name of the algorithm for this collective + const char* proto; // string containing name of the protocol for this collective + } coll; + + struct { // point-to-point events metadata + const char* name; + uint64_t commHash; + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; // peer rank for this point-to-point + } p2p; + + struct { // proxyOp events metadata + pid_t pid; // process id that generated the associated `ncclProxyOp` object + uint8_t channelId; // id of the channel used by the associated `ncclProxyOp` object + int peer; // peer rank + int nSteps; // number of network transfers/steps required by the `ncclProxyOp` + int chunkSize; // chunk size for this `ncclProxyOp` + int isSend; // set to 1 for sends and 0 for recvs + } proxyOp; + + struct { // proxyStep events metadata + int step; // individual step in `ncclProxyOp` + } proxyStep; + }; +} ncclProfilerEventDescr_v2_t; +``` + +NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`, +`ncclProfileProxyOp`, `ncclProfileProxyStep`, and 
`ncclProfileProxyCtrl`. + +#### stopEvent + +`stopEvent` takes the event handle returned by `startEvent` to stop the event. After the event +has been stopped the handle can no longer be used with other profiler calls. Using the event +handle after `stopEvent` is undefined behavior. + +#### recordEventState + +Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl`, +`ncclProfileP2p`, cannot be updated through calls to `recordEventState`. + +`ncclProfileProxyOp`, `ncclProfileProxyStep` and `ncclProfileProxyCtrl` can be updated through +calls to `recordEventState`. + +The state of proxy generated events can be updated, along with event attributes, using +`recordEventState`. These events can go through several states during their lifecycle. +The list of supported states for the proxy-defined events is reported below. + +``` +typedef enum { + // ncclProfileProxyOp event states + ncclProfilerProxyOpSendPosted, // state marks the posting of send buffer to GPU for given network transfer/step + ncclProfilerProxyOpSendRemFifoWait, // state marks the waiting of CTS credits from peer rank + ncclProfilerProxyOpSendTransmitted, // state marks the sending of network transfer/step to peer rank + ncclProfilerProxyOpSendDone, // state marks the ending of network transfer/step + ncclProfilerProxyOpRecvPosted, // state marks the posting of recv to network for given network transfer/step + ncclProfilerProxyOpRecvReceived, // state marks the recving of network transfer/step from peer rank + ncclProfilerProxyOpRecvTransmitted, // state marks the ending of the network transfer/step + ncclProfilerProxyOpRecvDone, // state marks the consuming of data from GPU + + // ncclProfileProxyStep event states + ncclProfilerProxyStepSendGPUWait, // state marks the waiting of send data from GPU for given network transfer/step + ncclProfilerProxyStepSendWait, // state marks the waiting of send data from network for given network transfer/step + 
ncclProfilerProxyStepRecvWait, // state marks the waiting of recv data from network for given network transfer/step + ncclProfilerProxyStepRecvFlushWait, // state marks the waiting of recv data flush to GPU for given network transfer/step + ncclProfilerProxyStepRecvGPUWait, // state marks the waiting of recv data consumption from GPU for given network transfer/step + + // ncclProfileProxyCtrl event states + ncclProfilerProxyCtrlIdle, // state marks proxy progress thread idle + ncclProfilerProxyCtrlActive, // state marks proxy progress thread active + ncclProfilerProxyCtrlSleep, // state marks proxy progress thread sleeping + ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up + ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin + ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end +} ncclProfilerEventState_v2_t; +``` + +`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing +network requests for the GPU kernel. ProxyOp events are generated for every active channel and +provide a summary of the activity of the proxy progress thread for that channel. + +`ncclProfileProxyStep` events are generated by the proxy progress thread while it is processing +network requests for the GPU kernel. ProxyStep events describe individual network transfer in +the channel. Thus, they provide a more fine-grained view w.r.t. ProxyOp events. + +`ncclProfileProxyCtrl` events are generated by the proxy progress thread while it is not processing +network requests for the GPU kernel. This includes everything else that the proxy thread might be +doing, including appending new `ncclProxyOp` objects to the list of work elements to process. + +State transitions for the events described can also come with event attribute updates. For this +reason the profiler defines the `ncclProfilerEventStateArgs_t` struct, reported below. 
+ +``` +typedef union { + struct { // attributes to update for ncclProfileProxyOp events + size_t transSize; // data transferred thus far + int steps; // network transfer/steps processed thus far + } proxyOp; + + struct { // attributes to update for ncclProfileProxyCtrl + int appendedProxyOps; // number of appended proxy ops thus far + } proxyCtrl; +} ncclProfilerEventStateArgs_v2_t; +``` + +The example profiler in `ext-profiler/example` contains details on how to capture and use the events above. + +### Event hierarchy + +NCCL core events (reported above) are organized into a hierarchy as reported below: + +``` +Group event + | + +- Collective event + | | + | +- ProxyOp event + | | + | +- ProxyStep event + | + +- Point-to-point event + | + +- ProxyOp event + | + +- ProxyStep event + +ProxyCtrl event +``` + +# Profiler instrumentation and logging + +## Profiling of collective and p2p operations + +The NCCL code is instrumented with profiler callbacks at different levels to capture start/stop of groups, +collective and point-to-point operations, as well as proxy progress activity. Due to the asynchronous nature +of NCCL operations, events associated to collective and point-to-point operations are not easy to delimit +precisely. For example, without both proxy and/or kernel activity it is impossible for the profiler to +figure out when a collective operation completes. Therefore, `stopEvent` for collectives simply indicates to +the profiler that the collective has been enqueued. The profiler can leverage proxy event information, if +these are enabled, to estimate when the collective ends. In this case, the profiler can look at the `stopEvent` +call of the last `ncclProfileProxyOp` event to mark the completion of the associated collective event. This +can be achieved by reference counting the collective event and letting calls to `startEvent` and `stopEvent` +increment and decrement the reference counter, respectively. 
+ +## PXN + +PXN causes some proxy operations to be processed in a remote proxy thread that differs from the one that +generated the operation. When this happens, the event hierarchy reported above breaks. Because the +profiler can use the hierarchy information, provided by NCCL in the event descriptor, to dereference the +parent event during `startEvent`, the remote proxy thread must be in the same address space as the proxy +thread originating the operation. To prevent the profiler instance in the remote proxy address space from +dereferencing a pointer from another address space, the event descriptor includes the PID of the originator. +The profiler plugin needs to check that the originator PID matches the local PID before dereferencing the +parent event. diff --git a/ext-profiler/example/README.md b/ext-profiler/example/README.md new file mode 100644 index 0000000000..d98e58f157 --- /dev/null +++ b/ext-profiler/example/README.md @@ -0,0 +1,239 @@ +# NCCL Example Profiler Plugin Usage + +This page describes how to use the NCCL example profiler plugin. + +# Overview + +The example profiler plugin implements the NCCL profiler plugin API introduced in NCCL v2.23. The API +defines a set of events and data structures that NCCL uses to share event information with profiler +plugins. The user can control what events are instrumented by NCCL and when traces collected by the +profiler should be dumped through environment variables, as described in the rest of the document. +The user can also control other profiler parameters that alter its behavior. For example, users can +change the size of the event window the profiler keeps track of. + +## Building the profiler plugin + +To use the example plugin, just type `make`. You will need a NCCL build's include directory present. +You can override `NCCL_HOME` to where the NCCL installation is on your system. + +## Using the profiler plugin + +1. 
Add the directory of this profiler plugin to your `LD_LIBRARY_PATH` or set the `NCCL_PROFILER_PLUGIN`, + as documented in `ext-profiler/README.md`. + +2. Set `NCCL_PROFILE_EVENT_MASK` bitmask to specify the NCCL events you want to instrument. By + default, all collectives and send/recv operations will be traced. For more details about the event + representation used by the profiler refer to `ext-profiler/README.md`. + + As an example, setting: + + `NCCL_PROFILE_EVENT_MASK` to 1 (`ncclProfileGroup`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`) + + enables the profiling of the group, the collective and the proxy op events. The same events can be + expressed more concisely by setting `NCCL_PROFILE_EVENT_MASK` to 8 (`ncclProfileProxyOp`). Indeed, + in NCCL all the events above (in the event hierarchy) the one requested are also captured. The advantage + is that the profiler can easily correlate events that belong to the same NCCL operation and present + them accordingly. + +3. Set `NCCL_PROFILE_DUMP_FILE` to the name of the dump file for the collected traces. A file named + ${NCCL_PROFILE_DUMP_FILE}-hostname-tid.txt is created. Profiler traces are saved using the chrome + event format (more precisely, using asynchronous events). + +4. If you set the dump file variable, type chrome://tracing on your chromium browser search bar and + open the created dump file to visualize the traces. + +# Changing the profiler memory pool sizes + +The example profiler uses separate memory pools for different types of events. The size of these memory +pools (i.e., the # events) determines the number of events that the profiler can keep track of at the +same time. When NCCL requests a new event (e.g., collective event) to profile a `ncclAllReduce` +operation, by calling `startEvent`, the profiler searches in the collective pool for a free event. If it +finds one, it marks it as in use and returns the handle to NCCL. 
If the pool is completely used the +profiler returns `NULL` to NCCL and ignores all the following NCCL profiler calls for the `NULL` event +handle. When the `ncclAllReduce` has been processed, NCCL calls `stopEvent` with the previously returned +event handle. The profiler has a total of 5 memory pools. + +The group, collective and p2p pools contain objects for the corresponding events. The `ProxyCtrl` pool +contains objects for `ProxyCtrl` events and the `ProxyDetach` pool contains objects for `ProxyOp` events +generated by remote proxies. A list of pools and their size is reported below: + +- `NCCL_PROFILE_GROUP_POOL_SIZE` (16) +- `NCCL_PROFILE_COLL_POOL_SIZE` (16) +- `NCCL_PROFILE_P2P_POOL_SIZE` (1024) +- `NCCL_PROFILE_PROXY_CTRL_POOL_SIZE` (16) +- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (128) + +Remote proxy operations are generated when PXN is in use. Refer to this article for more information +about PXN and how it works: +https://developer.nvidia.com/blog/doubling-all2all-performance-with-nvidia-collective-communication-library-2-12/ + +# Reported events + +The example profiler generates traces using the json format. 
An example of trace is reported below: + +``` +[ +{"name": "Group", "cat": "GROUP", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764234.611328, "args": {"groupId": 0}}, +{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764237.294922, "args": {"SeqNum": 0, "CommHash": 673864846479792718, "Rank": 1, "Count": 32768, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "LL", "nMaxChannels": 2}}, +{"name": "Recv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768464.936523, "args": {"Channel": 0, "Peer": 0, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 772020.300781}, "RECEIVED": {"step": 14, "ts": 772196.049805}, "TRANSMITTED": {"step": 14, "ts": 772197.326172}, "DONE": {"step": 14, "ts": 772201.538086}}}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768465.158203, "args": {"Step": 0}}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805, "args": {"Step": 0}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266, "args": {"Step": 0}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805, "args": {"Step": 0}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768568.276367}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768503.604492, "args": {"Step": 1}}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 
768504.549805}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805, "args": {"Step": 1}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234, "args": {"Step": 1}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695, "args": {"Step": 1}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 770006.914062}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768506.941406, "args": {"Step": 2}}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547, "args": {"Step": 2}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133, "args": {"Step": 2}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547, "args": {"Step": 2}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771468.458008}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768509.484375, "args": {"Step": 3}}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 
768510.250000, "args": {"Step": 3}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023, "args": {"Step": 3}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211, "args": {"Step": 3}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771910.500000}, +{"name": "Send", "cat": "PROXY", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768482.878906, "args": {"Channel": 0, "Peer": 2, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 771995.675781}, "REM_FIFO_WAIT": {"step": 14, "ts": 772190.692383}, "TRANSMITTED": {"step": 14, "ts": 772191.516602}, "DONE": {"step": 14, "ts": 772208.473633}}}, +{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.019531, "args": {"Step": 0}}, +{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781}, +{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781, "args": {"Step": 0}}, +{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234}, +{"name": "SendWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234, "args": {"Step": 0}}, +{"name": "SendWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769618.889648}, +{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.083008, "args": {"Step": 1}}, +{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086}, +{"name": "SendGpuWait", "cat": "NET", "ph": "b", 
"id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086, "args": {"Step": 1}}, +{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664}, +{"name": "SendWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664, "args": {"Step": 1}}, +{"name": "SendWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769622.517578}, +{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768507.937500, "args": {"Step": 2}}, +{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578}, +{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578, "args": {"Step": 2}}, +{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883}, +{"name": "SendWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883, "args": {"Step": 2}}, +{"name": "SendWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770013.848633}, +{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.742188, "args": {"Step": 3}}, +{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266}, +{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266, "args": {"Step": 3}}, +{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477}, +{"name": "SendWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477, "args": {"Step": 3}}, +{"name": "SendWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771469.171875}, + ... 
[ trace truncated for brevity ] +{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.317383}, +{"name": "Group", "cat": "GROUP", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.418945}, +{}] +``` + +Details about the fields used in the trace can be found at this link: +https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw + +The trace above is obtained by running a `ncclAllReduce` operation on 8 GPUs, communicating with each other through +the network interface. The `Group` event encloses all traces that are related to the single `ncclAllReduce` call. +(Note that for single collective invocations, where there are no explicit group calls, NCCL creates a group with only +one collective and this is what is presented in the traces above). + + +The `AllReduce` event encloses traces for the proxy operation associated to the `ncclAllReduce` operation. The `args` +field in the traces contains NCCL specific information (aside from the chrome trace event format). + +## AllReduce trace + +The `AllReduce` entry presents information about the `ncclAllReduce` operation. It contains the following info in the args field: + +- seqNum : sequential number of the collective in the communicator (every collective type has its own sequence number in the communicator) +- commHash : communicator unique identifier +- rank : NCCL rank for the ncclAllReduce +- datatype : NCCL datatype +- algorithm : algorithm used to process the ncclAllReduce +- protocol : protocol used to process the ncclAllReduce +- nMaxChannels: max number of channels used to process the ncclAllReduce + +If the proxy events are not active (e.g., the `ncclAllReduce` is intranode) the end timestamp will match the time +consumed by the CPU to launch the collective. For more details refer to `ext-profiler/README.md`, section `Profiling +of collective and p2p operations`. 
+ +### Proxy Send +The `Send` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following +info in the args field: + +- Channel : id of the channel used by this proxy operation to send data to the peer +- Peer : peer rank +- Steps : number of network steps required to transfer transSize bytes to the peer +- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread +- transSize : bytes transferred across the channel by this proxy operation +- POSTED : struct containing the number of buffer posts to the GPU and the time stamp for the last post +- REM_FIFO_WAIT: struct containing the number of remote buffer waits and the time stamp for the last wait +- TRANSMITTED : struct containing the number of network sends and the time stamp of the last send +- DONE : struct containing the number of network sends completed and the time stamp of the last send completed + +In case of a network problem the POSTED, REM_FIFO_WAIT, TRANSMITTED and DONE might all have partially updated steps, +which could help identify at which point the network problem occurred. + +The Proxy send trace gives a summary of the proxy progress thread activity for the channel. If more details are +needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace +entries below are also reported by the profiler. + +#### Proxy SendBufferWait + +Presents, for every network step, the time the CPU proxy spends waiting for the channel staging buffer to become available. + +#### Proxy SendGPUWait + +Presents, for every network step, the time the CPU proxy spends waiting for the GPU to provide the data in the staging +buffer. + +#### Proxy SendWait + +Presents, for every network step, the time the CPU proxy spends waiting for the `isend` to complete + +### Proxy Recv + +The `Recv` entry presents information about the `ProxyOp` processing in the progress thread. 
It contains the following +info in the args field: + +- Channel : id of the channel used by this proxy operation to recv data from the peer +- Peer : peer rank +- Steps : number of network steps required to transfer transSize bytes from the peer +- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread +- transSize : bytes transferred across the channel by this proxy operation +- POSTED : struct containing the number of recvs posted and the time stamp for the last recv posted +- RECEIVED : struct containing the number of recvs completed and the time stamp for the last recv completed +- TRANSMITTED: struct containing the number of recvs flushed to the GPU memory and the time stamp for the last recv flushed +- DONE : struct containing the number of flush completed and the time stamp for the last flush completed + +The Proxy Recv trace gives a summary of the proxy progress thread activity for the channel. If more details are +needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace +entries below are also reported by the profiler. + + +#### Proxy RecvBufferWait + +Presents, for every network step, the time the CPU proxy spends waiting for the staging buffer for the channel to +become available. 
+ +#### Proxy RecvWait + +Presents, for every network step, the time the CPU proxy spends waiting for a posted `irecv` to complete. + +#### Proxy RecvFlushWait + +Presents, for every network step, the time the CPU proxy spends waiting for the recv data to be flushed to the GPU. + +#### Proxy RecvGPUWait + +Presents, for every network step, the time the CPU proxy spends waiting for the GPU to consume the recv data. diff --git a/makefiles/common.mk b/makefiles/common.mk index 82164ab5c0..1b1bb8674d 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -39,14 +39,20 @@ endif CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80 CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90 +CUDA13_GENCODE = -gencode=arch=compute_100,code=sm_100 \ + -gencode=arch=compute_120,code=sm_120 CUDA8_PTX = -gencode=arch=compute_61,code=compute_61 CUDA9_PTX = -gencode=arch=compute_70,code=compute_70 CUDA11_PTX = -gencode=arch=compute_80,code=compute_80 CUDA12_PTX = -gencode=arch=compute_90,code=compute_90 +CUDA13_PTX = -gencode=arch=compute_120,code=compute_120 -ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0) +ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 12; echo $$?),0) +# Include Blackwell support if we're using CUDA12.8 or above + NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) +else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0) # Include Hopper support if we're using CUDA11.8 or above NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX) else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) diff --git a/makefiles/version.mk b/makefiles/version.mk index 2523009340..b02cf909cf 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@
-1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 24 -NCCL_PATCH := 3 +NCCL_MINOR := 25 +NCCL_PATCH := 1 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/pkg/debian/Makefile b/pkg/debian/Makefile index 0494f3e032..650ca42705 100644 --- a/pkg/debian/Makefile +++ b/pkg/debian/Makefile @@ -25,7 +25,7 @@ prep : $(DEBTARGETS) build : prep $(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR) @printf "Building Debian package\n" - (cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b) + (cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b -Zxz) mkdir -p $(PKGDIR) mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/ diff --git a/pkg/debian/libnccl-dev.install.in b/pkg/debian/libnccl-dev.install.in index 13eca26c64..45120e6de4 100644 --- a/pkg/debian/libnccl-dev.install.in +++ b/pkg/debian/libnccl-dev.install.in @@ -1,4 +1,4 @@ +bin/ncclras /usr/bin include/nccl.h /usr/include -include/nccl_net.h /usr/include lib/libnccl.so /usr/lib/${pkg:MultiArch} lib/libnccl_static.a /usr/lib/${pkg:MultiArch} diff --git a/pkg/debian/rules b/pkg/debian/rules index 23b90a9e01..8005d30201 100755 --- a/pkg/debian/rules +++ b/pkg/debian/rules @@ -11,3 +11,6 @@ override_dh_auto_test: override_dh_auto_clean: # Do not make clean + +override_dh_builddeb: + dh_builddeb -- -Zxz diff --git a/pkg/redhat/nccl.spec.in b/pkg/redhat/nccl.spec.in index 8e5aed6f3d..d629555924 100644 --- a/pkg/redhat/nccl.spec.in +++ b/pkg/redhat/nccl.spec.in @@ -20,6 +20,7 @@ sockets. 
%package devel Summary: NVIDIA Collective Communication Library (NCCL) Runtime Group: Development/Libraries +Requires: libnccl >= ${nccl:Major}.${nccl:Minor}.${nccl:Patch} %description devel NCCL development files @@ -44,9 +45,10 @@ install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUI ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major} # devel +install -m 755 -d $RPM_BUILD_ROOT/%{_bindir} install -m 755 -d $RPM_BUILD_ROOT/%{_includedir} +install -m 755 bin/ncclras $RPM_BUILD_ROOT/%{_bindir} install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir} -install -m 644 include/nccl_net.h $RPM_BUILD_ROOT/%{_includedir} ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so # static @@ -64,8 +66,8 @@ rm -rf $RPM_BUILD_ROOT %files devel %doc LICENSE.txt %defattr(-,root,root,-) +%{_bindir}/ncclras %{_includedir}/nccl.h -%{_includedir}/nccl_net.h %{_libdir}/libnccl.so %files static diff --git a/pkg/txz/create_txz.sh.in b/pkg/txz/create_txz.sh.in index deae854830..88f961325c 100644 --- a/pkg/txz/create_txz.sh.in +++ b/pkg/txz/create_txz.sh.in @@ -21,4 +21,4 @@ PKG_ARCH=${pkg:Arch} NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}" -tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt +tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/bin $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt diff --git a/src/Makefile b/src/Makefile index 2c5d9e863e..b66ebefa2c 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,7 +10,7 @@ include ../makefiles/version.mk INCEXPORTS := nccl.h LIBSRCFILES := \ bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \ - init.cc init_nvtx.cc net.cc proxy.cc transport.cc \ + init.cc init_nvtx.cc net.cc proxy.cc transport.cc mnnvl.cc \ $(wildcard 
graph/*.cc) \ $(wildcard misc/*.cc) \ $(wildcard transport/*.cc) \ diff --git a/src/collectives.cc b/src/collectives.cc index f005488a2d..82e81983e4 100644 --- a/src/collectives.cc +++ b/src/collectives.cc @@ -10,7 +10,7 @@ #include "graph/topo.h" #include "nccl.h" #include "api_trace.h" - +#include "nvtx_payload_schemas.h" #include "msccl/msccl_lifecycle.h" using namespace rccl; @@ -84,18 +84,8 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { - struct NvtxParamsAllGather { - size_t bytes; - ncclDataType_t datatype; - }; - // Just pass the size of one message and not the total bytes sent/received. - constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, - offsetof(NvtxParamsAllGather, datatype)} - }; - NvtxParamsAllGather payload{sendcount * ncclTypeSize(datatype), datatype}; - NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, payload) + NVTX3_FUNC_WITH_PARAMS(AllGather, NcclNvtxParamsAllGather, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcount * ncclTypeSize(datatype), datatype)); struct ncclInfo info = { ncclFuncAllGather, "AllGather", sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */ @@ -121,20 +111,8 @@ NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size ncclResult_t ncclAllReduce_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { - struct NvtxParamsAllReduce { - size_t bytes; - ncclRedOp_t op; - ncclDataType_t datatype; - }; - // Just pass the size of one message and not the total bytes sent/received. 
- static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, offsetof(NvtxParamsAllReduce, op)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, - offsetof(NvtxParamsAllReduce, datatype)} - }; - NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op, datatype}; - NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload) + NVTX3_FUNC_WITH_PARAMS(AllReduce, NcclNvtxParamsAllReduce, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), op, datatype)); struct ncclInfo info = { ncclFuncAllReduce, "AllReduce", sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ @@ -162,25 +140,14 @@ NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_ ncclResult_t ncclAllToAll_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) { + NVTX3_FUNC_WITH_PARAMS(AllToAll, NcclNvtxParamsAllToAll, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), datatype)); if (!mscclIsCaller()) // when msccl falls back to { NCCLCHECK(Recorder::instance().record(rrAllToAll, sendbuff, recvbuff, count, datatype, comm, stream)); } - struct NvtxParamsAllToAll { - size_t bytes; - ncclDataType_t datatype; - }; - // Just pass the size of one message and not the total bytes sent/received. 
- constexpr nvtxPayloadSchemaEntry_t AllToAllSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, - offsetof(NvtxParamsAllToAll, datatype)} - }; - NvtxParamsAllToAll payload{count * ncclTypeSize(datatype), datatype}; - NVTX3_FUNC_WITH_PARAMS(AllToAll, AllToAllSchema, payload) - if (mscclAvailable(comm->rank) && !mscclIsCaller()) { return mscclEnqueueCheck( sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr, @@ -218,28 +185,15 @@ NCCL_API(ncclResult_t, ncclAllToAllv, const void *sendbuff, const size_t sendcou ncclResult_t ncclAllToAllv_impl(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[], void *recvbuff, const size_t recvcounts[], const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) { + NVTX3_FUNC_WITH_PARAMS(AllToAllv, NcclNvtxParamsAllToAllv, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcounts[comm->rank] * ncclTypeSize(datatype), + recvcounts[comm->rank] * ncclTypeSize(datatype), datatype)); if (!mscclIsCaller()) // when msccl falls back to { NCCLCHECK(Recorder::instance().record(rrAllToAllv, sendbuff, recvbuff, 0, datatype, comm, stream, -1, sendcounts, sdispls, recvcounts, rdispls)); } - struct NvtxParamsAllToAllv { - size_t sendbytes; - size_t recvbytes; - ncclDataType_t datatype; - }; - // Just pass the size of one send/recv messages and not the total bytes sent/received. 
- constexpr nvtxPayloadSchemaEntry_t AllToAllvSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes] (Send)"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes] (Recv)", nullptr, 0, - offsetof(NvtxParamsAllToAllv, recvbytes)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, - offsetof(NvtxParamsAllToAllv, datatype)} - }; - NvtxParamsAllToAllv payload{sendcounts[comm->rank] * ncclTypeSize(datatype), recvcounts[comm->rank] * ncclTypeSize(datatype), datatype}; - NVTX3_FUNC_WITH_PARAMS(AllToAllv, AllToAllvSchema, payload) - if (mscclAvailable(comm->rank) && !mscclIsCaller()) { return mscclEnqueueCheck( sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls, @@ -274,19 +228,8 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size ncclResult_t ncclBroadcast_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream) { - struct NvtxParamsBroadcast { - size_t bytes; - int root; - ncclDataType_t datatype; - }; - constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, - offsetof(NvtxParamsBroadcast, datatype)} - }; - NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root, datatype}; - NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload) + NVTX3_FUNC_WITH_PARAMS(Broadcast, NcclNvtxParamsBroadcast, + NVTX3_PAYLOAD(comm ? 
comm->commHash : 0, count * ncclTypeSize(datatype), root, datatype)); struct ncclInfo info = { ncclFuncBroadcast, "Broadcast", sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ @@ -319,45 +262,34 @@ NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) { - struct NvtxParamsGather { - size_t bytes; - int root; - ncclDataType_t datatype; - }; - constexpr nvtxPayloadSchemaEntry_t GatherSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsGather, root)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, - offsetof(NvtxParamsGather, datatype)} - }; - NvtxParamsGather payload{sendcount * ncclTypeSize(datatype), root, datatype}; - NVTX3_FUNC_WITH_PARAMS(Gather, GatherSchema, payload) + NVTX3_FUNC_WITH_PARAMS(Gather, NcclNvtxParamsGather, + NVTX3_PAYLOAD(comm ? 
comm->commHash : 0, sendcount * ncclTypeSize(datatype), root, datatype)); - if (!mscclIsCaller()) // when msccl falls back to - { - NCCLCHECK(Recorder::instance().record(rrGather, sendbuff, recvbuff, sendcount, datatype, comm, stream, root)); - } + if (!mscclIsCaller()) // when msccl falls back to + { + NCCLCHECK(Recorder::instance().record(rrGather, sendbuff, recvbuff, sendcount, datatype, comm, stream, root)); + } - if (mscclAvailable(comm->rank) && !mscclIsCaller()) { - return mscclEnqueueCheck( - sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr, - sendcount, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream); - } + if (mscclAvailable(comm->rank) && !mscclIsCaller()) { + return mscclEnqueueCheck( + sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr, + sendcount, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream); + } - int nRanks; - NCCLCHECK(ncclCommCount(comm, &nRanks)); - size_t rankOffset = sendcount * ncclTypeSize(datatype); - if (sendcount == 0) return ncclSuccess; - int rank; - NCCLCHECK(ncclCommUserRank(comm, &rank)); - NCCLCHECK(ncclGroupStart()); - if (rank == root) { - for (int r=0; rcommHash : 0, count * ncclTypeSize(datatype), root, op, datatype)); struct ncclInfo info = { ncclFuncReduce, "Reduce", sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ @@ -406,20 +324,8 @@ NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { - struct NvtxParamsReduceScatter { - size_t bytes; - ncclRedOp_t op; - ncclDataType_t datatype; - }; - constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsReduceScatter, op)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", 
nullptr, 0, - offsetof(NvtxParamsReduceScatter, datatype)} - }; - NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op, datatype}; - NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload) + NVTX3_FUNC_WITH_PARAMS(ReduceScatter, NcclNvtxParamsReduceScatter, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, recvcount * ncclTypeSize(datatype), op, datatype)); struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ @@ -445,67 +351,44 @@ NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) { - struct NvtxParamsScatter { - size_t bytes; - int root; - ncclDataType_t datatype; - }; - constexpr nvtxPayloadSchemaEntry_t ScatterSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsScatter, root)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, - offsetof(NvtxParamsScatter, datatype)} - }; - NvtxParamsScatter payload{recvcount * ncclTypeSize(datatype), root, datatype}; - NVTX3_FUNC_WITH_PARAMS(Scatter, ScatterSchema, payload) + NVTX3_FUNC_WITH_PARAMS(Scatter, NcclNvtxParamsScatter, + NVTX3_PAYLOAD(comm ? 
comm->commHash : 0, recvcount * ncclTypeSize(datatype), root, datatype)); - if (!mscclIsCaller()) // when msccl falls back to - { - NCCLCHECK(Recorder::instance().record(rrScatter, sendbuff, recvbuff, recvcount, datatype, comm, stream, root)); - } + if (!mscclIsCaller()) // when msccl falls back to + { + NCCLCHECK(Recorder::instance().record(rrScatter, sendbuff, recvbuff, recvcount, datatype, comm, stream, root)); + } - if (mscclAvailable(comm->rank) && !mscclIsCaller()) { - return mscclEnqueueCheck( - sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr, - recvcount, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream); - } + if (mscclAvailable(comm->rank) && !mscclIsCaller()) { + return mscclEnqueueCheck( + sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr, + recvcount, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream); + } - int nRanks; - NCCLCHECK(ncclCommCount(comm, &nRanks)); - size_t rankOffset = recvcount * ncclTypeSize(datatype); - if (recvcount == 0) return ncclSuccess; - int rank; - NCCLCHECK(ncclCommUserRank(comm, &rank)); - NCCLCHECK(ncclGroupStart()); - if (rank == root) { - for (int r=0; rcommHash : 0, count * ncclTypeSize(datatype), peer, datatype)); struct ncclInfo info = { ncclFuncSend, "Send", NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ @@ -530,8 +413,8 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da ncclResult_t ncclRecv_impl(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) { - NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer, datatype}; - NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload) + NVTX3_FUNC_WITH_PARAMS(Recv, NcclNvtxParamsSendRecv, + NVTX3_PAYLOAD(comm ? 
comm->commHash : 0, count * ncclTypeSize(datatype), peer, datatype)); struct ncclInfo info = { ncclFuncRecv, "Recv", NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index 835ab708d4..6c58d72ddb 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -767,7 +767,7 @@ struct RunWorkCollregUsed ? 0 : min(loopCount, channelCount - elemOffset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } - } else if (tid < tidEndReduce) { + } else if (tid < tidEndReduce && nvls->headRank != -1) { // Reduce, broadcast through NVLS using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>; Primitives, /*Direct=*/1, Proto, 0> diff --git a/src/enqueue.cc b/src/enqueue.cc index 1500a610b7..bf325c58f3 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -52,25 +52,22 @@ static ncclKernelMatch const ncclKerns[2] = { NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0); // Returns maximum kernel stack size of all CUDA kernels -ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) { +ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize) { constexpr int KernelCount = sizeof(ncclKerns)/sizeof(ncclKerns[0]); ncclResult_t result = ncclSuccess; + int print = 0; if (maxStackSize) *maxStackSize = 0; int carveout = ncclParamL1SharedMemoryCarveout(); + int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch); - // Keep track if we already visited a function pointer. 
- void* lru[2] = {nullptr, nullptr}; - for (int i=0; i < KernelCount; i++) { - void* fn = ncclKerns[i].kernelFn; - if (fn == lru[0] || fn == lru[1]) goto next_kernel; - lru[1] = lru[0]; - lru[0] = fn; + for (int k=0; k < KernelCount; k++) { + void* fn = ncclKerns[k].kernelFn; + cudaFuncAttributes attr = {0}; + if (fn == nullptr) continue; + CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0); if (maxStackSize) { - cudaFuncAttributes attr = {0}; - if (cudaFuncGetAttributes(&attr, fn) != cudaSuccess) - WARN("Failed to get kernel attributes"); if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes; ignore0:; } @@ -81,10 +78,17 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) { result, ignore1); ignore1:; } - - if (ncclShmemDynamicSize(cudaArch) != 0) { + if (ncclMaxSharedMem != 0) { + int sharedMemSize = ncclMaxSharedMem; + if (sharedMemSize > (maxSharedMem-attr.sharedSizeBytes)) { + if (print++ == 0) + INFO(NCCL_INIT, "ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu", + sharedMemSize, maxSharedMem-attr.sharedSizeBytes); + // Reduce requested MaxDynamicSharedMemorySize attribute + sharedMemSize = maxSharedMem - attr.sharedSizeBytes; + } CUDACHECKGOTO(cudaFuncSetAttribute(fn, - cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)), + cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize), result, next_kernel); } next_kernel:; @@ -1520,7 +1524,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan NCCLCHECK(ncclCudaDriverVersion(&driverVersion)); if (driverVersion >= 11080) { int compCap = comm->compCap; - unsigned int clusterSize = (compCap == 90) ? comm->config.cgaClusterSize : 0; + unsigned int clusterSize = (compCap >= 90) ? 
comm->config.cgaClusterSize : 0; CUlaunchConfig launchConfig = {0}; CUlaunchAttribute launchAttrs[3]; @@ -1674,7 +1678,7 @@ static ncclResult_t updateCollCostTable( if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1 && info->func != ncclFuncAllGather) continue; if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue; /* now we only support single-node NVLS allgather and reducescatter */ - if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && comm->nNodes > 1) continue; + if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && (comm->nNodes > 1 || comm->nRanks > NCCL_MAX_NVLS_ARITY)) continue; /* Tree reduceScatter doesn't support scaling yet */ if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter && (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue; diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 33aa8b3556..2cbc039906 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -22,7 +22,6 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) { int rank = comm->rank; int localRanks = comm->topo->nodes[GPU].count; - int nvlsRanks = comm->MNNVL ? comm->clique.size : localRanks; int nChannels = comm->nChannels; topoRanks->nvlsHeadNum = 0; @@ -77,7 +76,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs // Get nvls heads and the number of heads. Duplicate head is not allowed. 
for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) { bool addHead = true; - int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * nvlsRanks; + int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks; for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) { if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) { @@ -457,8 +456,6 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead channel->nvls.out = -1; // NVLS+SHARP not yet implemented. channel->nvls.headRank = headRank; channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1; - channel->nvls.node = comm->node; - channel->nvls.nNodes = comm->nNodes; if (comm->collNetSupport && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks; } if (comm->nNodes == 1) return ncclSuccess; @@ -853,7 +850,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa } // Use 4 compute channels per search channel to reach peak BW on <8 PPN - if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && 2*nChannels <= maxChannels) { + if (comm->minCompCap >= 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && 2*nChannels <= maxChannels) { nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext); } diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 46ccca7f3e..cbfd2d8698 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -980,14 +980,37 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr return ncclSuccess; } -int ncclTopoPathAllNVLink(struct ncclTopoSystem* system) { - int minPath = PATH_DIS; +ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min) { + int minPath = PATH_SYS; for (int i=0; inodes[GPU].count; i++) { - struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU]; - for (int j=0; jnodes[GPU].count; j++) { - if (i == j) 
continue; + struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[type]; + if (paths == NULL) continue; + for (int j=0; jnodes[type].count; j++) { + if (type == GPU && i == j) continue; minPath = std::min(minPath, paths[j].type); } } - return minPath >= PATH_PIX ? 0 : 1; + *min = minPath; + return ncclSuccess; +} + +ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max) { + int maxPath = PATH_LOC; + for (int i=0; inodes[GPU].count; i++) { + struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[type]; + if (paths == NULL) continue; + for (int j=0; jnodes[type].count; j++) { + if (type == GPU && i == j) continue; + maxPath = std::max(maxPath, paths[j].type); + } + } + *max = maxPath; + return ncclSuccess; +} + +ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink) { + int maxPath; + NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxPath)); + *allNvLink = maxPath >= PATH_PIX ? 0 : 1; + return ncclSuccess; } diff --git a/src/graph/search.cc b/src/graph/search.cc index 5cd44e9f28..ce56566b98 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -988,15 +988,20 @@ float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float)) #define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float)) +#endif float sm90SpeedArrayIntra[] = { 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 11.0, 6.0, 3.0 }; float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float)) #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float)) -#endif RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0); +float sm100SpeedArrayIntra[] = { 90.0, 80.0, 
70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0 }; +float sm100SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +#define NSPEEDSINTRA_SM100 (sizeof(sm100SpeedArrayIntra)/sizeof(float)) +#define NSPEEDSINTER_SM100 (sizeof(sm100SpeedArrayInter)/sizeof(float)) + ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; int crossNic = (system->nodes[NET].count > 1) && @@ -1006,8 +1011,20 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph graph->crossNic = crossNic == 1 ? 1 : 0; graph->bwIntra = graph->bwInter = 0; graph->latencyInter = 0; - graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL; - graph->typeInter = PATH_PIX; + int minTypeIntra = PATH_LOC, minTypeInter = PATH_PIX; + int maxTypeIntra = PATH_SYS, maxTypeInter = PATH_SYS; + if (ngpus > 1) { + NCCLCHECK(ncclTopoGetGpuMinPath(system, GPU, &minTypeIntra)); + NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxTypeIntra)); + } + if (system->nodes[NET].count > 0) { + NCCLCHECK(ncclTopoGetGpuMinPath(system, NET, &minTypeInter)); + NCCLCHECK(ncclTopoGetGpuMaxPath(system, NET, &maxTypeInter)); + maxTypeIntra = maxTypeInter; + } + + graph->typeIntra = minTypeIntra; + graph->typeInter = minTypeInter; graph->nChannels = 0; graph->nIntraChannels = 0; memset(graph->intraNets, 0, MAXCHANNELS*NCCL_TOPO_MAX_NODES*2*sizeof(int)); @@ -1080,14 +1097,14 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL)); if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess; // NVLS and COLLNET_DIRECT search must have ngpus heads at most. 
- if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) - graph->maxChannels = system->nodes[GPU].count; + if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) graph->maxChannels = std::min(NCCL_MAX_NVLS_ARITY, system->nodes[GPU].count); + if (graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) graph->maxChannels = std::min(NCCL_MAX_DIRECT_ARITY+1, system->nodes[GPU].count); if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // Force intra-node NVLS algorithm to pull evenly from all GPUs. - graph->minChannels = graph->maxChannels = system->nodes[GPU].count; + graph->minChannels = graph->maxChannels; } struct ncclTopoGraph tmpGraph; @@ -1097,11 +1114,11 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph int nspeeds = 0; float* speedArray = NULL; if (system->nodes[NET].count == 0) { - nspeeds = NSPEEDSINTRA; - speedArray = speedArrayIntra; + nspeeds = ccMin >= 100 ? NSPEEDSINTRA_SM100 : (ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA); + speedArray = ccMin >= 100 ? sm100SpeedArrayIntra : (ccMin >= 90 ? sm90SpeedArrayIntra : speedArrayIntra); } else { - nspeeds = NSPEEDSINTER; - speedArray = speedArrayInter; + nspeeds = ccMin >= 100 ? NSPEEDSINTER_SM100 : (ccMin >= 90 ? NSPEEDSINTER_SM90 : NSPEEDSINTER); + speedArray = ccMin >= 100 ? sm100SpeedArrayInter : (ccMin >= 90 ? sm90SpeedArrayInter : speedArrayInter); } int pass = 1; int speedIndex = 0; @@ -1156,18 +1173,18 @@ search: } tmpGraph.pattern = graph->pattern; - int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS; - if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) { + int maxIntra = system->nodes[NET].count > 0 ? 
tmpGraph.typeInter : maxTypeIntra; + if (tmpGraph.typeIntra < maxIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) { tmpGraph.typeIntra += 1; goto search; } - tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL; + tmpGraph.typeIntra = minTypeIntra; - if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { + if (system->nodes[NET].count > 0 && tmpGraph.typeInter < maxTypeInter && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { tmpGraph.typeInter += 1; goto search; } - tmpGraph.typeInter = PATH_PIX; + tmpGraph.typeInter = minTypeInter; if (crossNic == 2 && tmpGraph.crossNic == 0 && (graph->pattern == NCCL_TOPO_PATTERN_RING || graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE)) { diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 54266dddf6..2cf18ca4de 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -1445,11 +1445,11 @@ fail: goto exit; } -ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) { +static ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, + int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType) { int minType = PATH_DIS; float maxBw = 0; int count = 0; - NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count)); struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType]; if (paths == NULL) { *localCount = 0; return ncclSuccess; } for (int i=0; inodes[resultType].count; i++) { @@ -1459,7 +1459,15 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index if (pathType) *pathType = minType; count = 0; } - if (paths[i].bw == maxBw && paths[i].type == minType) (*locals)[count++] = i; + if (paths[i].bw == maxBw && paths[i].type == minType) { + if (count == NCCL_TOPO_MAX_NODES) { + WARN("Error : ran out of room to store found nodes in ncclTopoGetLocal." 
+ " Filled %d of type %d, starting from index %d of type %d.", + NCCL_TOPO_MAX_NODES, resultType, index, type); + return ncclInternalError; + } + locals[count++] = i; + } } *localCount = count; return ncclSuccess; @@ -1467,7 +1475,7 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count) { int localNetCount = 0, netCountByBw = 0; - int* localNets; + int localNets[NCCL_TOPO_MAX_NODES]; float totalNetBw = 0, gpuBw = 0; for (int l=0; lnodes[GPU].nodes[gpu].nlinks; l++) { @@ -1479,69 +1487,55 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c } } - NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL)); + NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL)); for (int l=0; (l < localNetCount) && (totalNetBw < gpuBw); l++, netCountByBw++) { totalNetBw += system->nodes[GPU].nodes[gpu].paths[NET][localNets[l]].bw; } *count = netCountByBw; - free(localNets); return ncclSuccess; } ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) { - ncclResult_t ret = ncclSuccess; int gpu; NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu)); - int* localNets = NULL; + + int localNets[NCCL_TOPO_MAX_NODES]; int localNetCount; - NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL)); - int* localGpus = NULL; + NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL)); + if (localNetCount==0) { + WARN("Could not find any local path from gpu %d to net.", gpu); + return ncclInternalError; + } + + int localGpus[NCCL_TOPO_MAX_NODES]; int localGpuCount; - int net = 0; - if (localNetCount == 0) { - *id = -1; - free(localNets); - return ncclSuccess; - } - NCCLCHECKGOTO(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL), ret, fail); - for (int i = 0; i 
< localGpuCount; i++) { - if (gpu == localGpus[i]) { - net = i; - break; - } - } + NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL)); + + int net = system->nodes[GPU].nodes[gpu].gpu.dev; if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount); - if (localNetCount == 0) { - if (id) *id = -1; - if (dev) *dev = -1; - } else { - net += channelId%(DIVUP(localNetCount,localGpuCount)); - if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id; - if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev; - } -exit: - free(localNets); - if (localGpus) free(localGpus); - return ret; -fail: - goto exit; + net += channelId%(DIVUP(localNetCount,localGpuCount)); + if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id; + if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev; + return ncclSuccess; } ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex) { ncclResult_t ret = ncclSuccess; int netIndex; NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &netIndex)); - int* localGpus = NULL; + + int localGpus[NCCL_TOPO_MAX_NODES]; int localGpuCount; + NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, localGpus, &localGpuCount, NULL)); + int foundGpu = -1; - NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL)); for (int c=0; cnodes[GPU].nodes+g; int64_t id; - NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL), ret, fail); + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL)); if (netId == id) { foundGpu = g; goto exit; @@ -1550,8 +1544,6 @@ ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, i } exit: *gpuIndex = foundGpu; -fail: - free(localGpus); return ret; } diff --git a/src/graph/topo.h b/src/graph/topo.h index fdfca50d75..2a5738dcea 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -19,6 +19,7 @@ #define 
SM80_NVLINK_BW 20.0 #define SM90_NVLINK_BW 20.6 #define SM86_NVLINK_BW 12.0 +#define SM100_NVLINK_BW 40.0 #define PCI_BW 12.0 // PCI Gen3 x16 #define QPI_BW 6.0 #define AMD_BW 16.0 @@ -98,8 +99,8 @@ struct ncclTopoLink { float bw; struct ncclTopoNode* remNode; }; -#define NCCL_TOPO_MAX_LINKS 128 - +// Allows for up to 32 NICs per node on GB200-NVL72 +#define NCCL_TOPO_MAX_LINKS 576 #define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES) struct ncclTopoLinkList { @@ -209,6 +210,8 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system); ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system); ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank); +ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min); +ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max); #define NCCL_TOPO_XML_MAX_NODES 8192 #define NCCL_GRAPH_XML_MAX_NODES 8192 @@ -279,6 +282,7 @@ static float ncclTopoXGMISpeed(const char* gcn) { // Returns NVLink bw in GB/s static float ncclTopoNVLinkBw(int cudaCompCap) { return + cudaCompCap >= 100 ? SM100_NVLINK_BW : cudaCompCap >= 90 ? SM90_NVLINK_BW : cudaCompCap == 86 ? SM86_NVLINK_BW : cudaCompCap >= 80 ? 
SM80_NVLINK_BW : diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index c86fe56a3e..3ec9f24fd9 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -350,28 +350,33 @@ static struct tuningModel rcclTuningModel[] = { #define VOLTA_COMPCAP_IDX 0 #define AMPERE_COMPCAP_IDX 1 #define HOPPER_COMPCAP_IDX 2 +#define BLACKWELL_COMPCAP_IDX 3 // LL128 max BW per channel -static const double llMaxBws[3][3] = { +static const double llMaxBws[][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}, - /* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0} + /* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0}, + /* Blackwell-N1/AMD-N2/AMD-N4) */ {2*141.0, 2*45.0 /*avg of ring & tree*/, 2*35.0}, }; -static const double perChMaxRingLL128Bws[3][3] = { +static const double perChMaxRingLL128Bws[][3] = { /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7}, + /* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*36.7}, }; -static const double perChMaxTreeLL128Bws[3][3] = { +static const double perChMaxTreeLL128Bws[][3] = { /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0}, + /* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*29.0}, }; -static const double perChMaxTreeBws[3][3] = { +static const double perChMaxTreeBws[][3] = { /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8}, /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0}, + /* Blackwell (N1/N2/N4) */ {2*38.7, 2*41.4, 2*36.0}, }; NCCL_PARAM(PatEnable, "PAT_ENABLE", 2); @@ -422,7 +427,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom int nRanks = comm->nRanks; if (nRanks <= 1) return ncclSuccess; - int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? 
AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX; + int compCapIndex = minCompCap >= 100 ? BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX); int index2 = nNodes <= 2 ? nNodes-1 : 2; // LL: for single node, we look at GPU type; for multi-node, we look at CPU type int index1 = nNodes == 1 ? compCapIndex : @@ -666,6 +671,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom case 70: pEnable &= 1; break; case 80: pEnable &= 1; break; case 90: pEnable &= !(CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2); break; + case 100: pEnable &= 1; break; + case 120: pEnable &= 1; break; default: pEnable &= 0; break; } #endif diff --git a/src/include/alloc.h b/src/include/alloc.h index 76f859d1e2..3bac16603b 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -228,14 +228,13 @@ static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) { return result; } -static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) { +static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, CUmemAllocationHandleType type, size_t size) { ncclResult_t result = ncclSuccess; size_t granularity = 0; CUdevice currentDev; CUmemAllocationProp prop = {}; CUmemAccessDesc accessDesc = {}; CUmemGenericAllocationHandle handle; - CUmemAllocationHandleType type = ncclCuMemHandleType; int cudaDev; int flag = 0; CUDACHECK(cudaGetDevice(&cudaDev)); @@ -284,7 +283,7 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) { extern int ncclCuMemEnable(); -static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) { +static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, int type, size_t size) { WARN("CUMEM not supported prior to CUDA 11.3"); return ncclInternalError; } diff --git a/src/include/device.h b/src/include/device.h index 5f745f1e82..fe6f94c1ee 
100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -70,7 +70,7 @@ union ncclLLFifoLine { #define WARP_SIZE warpSize #define MAXCHANNELS 128 #define CHANNEL_LIMIT 16 -#define NCCL_MAX_LOCAL_RANKS 64 +#define NCCL_MAX_LOCAL_RANKS 72 #define NCCL_MAX_NTHREADS 256 #define NCCL_MIN_NTHREADS (4*WARP_SIZE) #define NCCL_SIMPLE_MAX_NTHREADS NCCL_MAX_NTHREADS @@ -204,8 +204,6 @@ struct ncclNvls { int down; int treeUp; int treeDown[NCCL_MAX_NVLS_TREE_ARITY]; - int node; - int nNodes; }; #if __CUDA_ARCH__ >= 900 diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 03d44c51ff..a381846ad0 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -17,7 +17,7 @@ #define NCCL_SIMPLE_ALIGNMENT (WARP_SIZE * 8LL * 16LL) #define NCCL_BYTES_ALIGNMENT 16 -ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize); +ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize); ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); diff --git a/src/include/graph.h b/src/include/graph.h index 31b1221ba7..4c3759cb90 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -29,7 +29,8 @@ void ncclTopoFree(struct ncclTopoSystem* system); ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks); -int ncclTopoPathAllNVLink(struct ncclTopoSystem* system); +ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink); + ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); // Query topology diff --git a/src/include/mnnvl.h b/src/include/mnnvl.h new file mode 100644 index 0000000000..dedbefe43d --- /dev/null +++ b/src/include/mnnvl.h @@ -0,0 
+1,15 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_MNNVL_H_ +#define NCCL_MNNVL_H_ + +#include "nccl.h" +#include "comm.h" + +ncclResult_t ncclMnnvlCheck(struct ncclComm* comm); + +#endif diff --git a/src/include/nvtx.h b/src/include/nvtx.h index b7057f19df..33826db794 100644 --- a/src/include/nvtx.h +++ b/src/include/nvtx.h @@ -36,19 +36,22 @@ #define NVTX_SID_CommInitRankConfig 16 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_CommInitRankScalable 17 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_CommSplit 18 +#define NVTX_SID_CommFinalize 19 // Define static schema ID for the reduction operation. -#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START +#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 19 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START extern const nvtxDomainHandle_t ncclNvtxDomainHandle; struct nccl_domain{static constexpr char const* name{"NCCL"};}; +/// @brief Register an NVTX payload schema for static-size payloads. 
class payload_schema { -public: - explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept + public: + explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, + const uint64_t schemaId, const size_t size) noexcept { - schema_attr.name = schemaName; + schema_attr.payloadStaticSize = size; schema_attr.entries = entries; schema_attr.numEntries = numEntries; schema_attr.schemaId = schemaId; @@ -69,33 +72,105 @@ private: NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, - nullptr, + nullptr, /* schema name is not needed */ NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, nullptr, 0, 0, 0, 0, nullptr}; }; +// Convenience macro to give the payload parameters a scope. +#define NVTX3_PAYLOAD(...) __VA_ARGS__ + // Create NVTX push/pop range with parameters -// @param name of the operation (see `NVTX_SID_*`) -// @param N schema name -// @param S schema (entries) -// @param P payload (struct) +// @param N NCCL API name without the `nccl` prefix. +// @param T name of the used NVTX payload schema without "Schema" suffix. 
+// @param P payload parameters/entries #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) -#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \ - nvtxPayloadData_t nvtx3_bpl__[] = { \ - {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \ - roctx_scoped_range_in const roctx_range__{S, nvtx3_bpl__, std::extent::value, "RCCL_" #ID}; +#define NVTX3_FUNC_WITH_PARAMS(N, T, P) \ + constexpr uint64_t schemaId = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \ + static const payload_schema schema{T##Schema, std::extent::value - 1, \ + schemaId, sizeof(T)}; \ + const T _payload = {P}; \ + nvtxPayloadData_t nvtx3_bpl__[] = {{schemaId, sizeof(_payload), &_payload}}; \ + roctx_scoped_range_in const roctx_range__{T##Schema, nvtx3_bpl__, std::extent::value - 1, "RCCL_" #N}; #else -#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \ - static const payload_schema schema{S, std::extent::value, \ - NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \ +#define NVTX3_FUNC_WITH_PARAMS(N, T, P) \ + constexpr uint64_t schemaId = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \ + static const payload_schema schema{T##Schema, std::extent::value - 1, \ + schemaId, sizeof(T)}; \ static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ - nvtxPayloadData_t nvtx3_bpl__[] = { \ - {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \ + const T _payload = {P}; \ + nvtxPayloadData_t nvtx3_bpl__[] = {{schemaId, sizeof(_payload), &_payload}}; \ ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; #endif +/// @brief Creates an NVTX range with extended payload using the RAII pattern. +/// @tparam PayloadType Data type of the payload. 
+template +class ncclNvtxRange { + public: + explicit ncclNvtxRange(const nvtxEventAttributes_t* evtAttr) noexcept { + nvtxDomainRangePushEx(nvtx3::domain::get(), evtAttr); + } + + ~ncclNvtxRange() noexcept { + if (payloadData.payload) { + nvtxRangePopPayload(nvtx3::domain::get(), &payloadData, 1); + } else { + nvtxDomainRangePop(nvtx3::domain::get()); + } + } + + void setPayloadData(const uint64_t schemaId) noexcept + { + payloadData = {schemaId, sizeof(PayloadType), &payload}; + } + + ncclNvtxRange() = delete; + ncclNvtxRange(ncclNvtxRange const&) = default; + ncclNvtxRange& operator=(ncclNvtxRange const&) = default; + ncclNvtxRange(ncclNvtxRange&&) = default; + ncclNvtxRange& operator=(ncclNvtxRange&&) = default; + + // Holds the payload data. + PayloadType payload{}; + + nvtxPayloadData_t payloadData = {NVTX_PAYLOAD_ENTRY_TYPE_INVALID, 0, NULL}; +}; + +// Create an NVTX range with the function name as the range name. Use RAII pattern. +// @param T Type ID of the NVTX payload (pointer for variable-size payloads). +#define NVTX3_RANGE(T) \ + static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ + ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ + ncclNvtxRange nvtx3_range__{nvtx3_func_attr__.get()}; + +// Add static-size payload to the NVTX range created with `NVTX3_RANGE()`, +// which must be in this or an outer scope. +// @param N NCCL API name without the `nccl` prefix. +// @param S name of the used NVTX payload schema. 
+// @param P payload parameters/entries +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) +#define NVTX3_RANGE_ADD_PAYLOAD(N, S, P) do { \ + constexpr uint64_t schema_id = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \ + static const payload_schema schema{S, std::extent::value - 1, schema_id, \ + sizeof(nvtx3_range__.payload)}; \ + nvtx3_range__.payload = {P}; \ + nvtx3_range__.setPayloadData(schema_id); \ + nvtxPayloadData_t nvtx3_bpl__[] = {{schema_id, sizeof(nvtx3_range__.payloadData), &nvtx3_range__.payloadData}}; \ + roctx_scoped_range_in const roctx_range__{S, nvtx3_bpl__, std::extent::value - 1, "RCCL_" #N}; \ +} while (0) +#else +#define NVTX3_RANGE_ADD_PAYLOAD(N, S, P) do { \ + constexpr uint64_t schema_id = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \ + static const payload_schema schema{S, std::extent::value - 1, schema_id, \ + sizeof(nvtx3_range__.payload)}; \ + nvtx3_range__.payload = {P}; \ + nvtx3_range__.setPayloadData(schema_id); \ +} while (0) +#endif + extern void initNvtxRegisteredEnums(); #endif diff --git a/src/include/nvtx3/nvToolsExtPayloadHelper.h b/src/include/nvtx3/nvToolsExtPayloadHelper.h index 304d5d6a5f..0f0c87d6ae 100644 --- a/src/include/nvtx3/nvToolsExtPayloadHelper.h +++ b/src/include/nvtx3/nvToolsExtPayloadHelper.h @@ -11,7 +11,7 @@ /* This is just an empty marker (for readability), which can be omitted. */ /* TODO: Fix issue with trailing comma at end of entry list. 
*/ -#define NVTX_PAYLOAD_ENTRIES +#define NCCL_NVTX_PAYLOAD_ENTRIES /** @@ -32,7 +32,7 @@ * * Example: * NVTX_DEFINE_SCHEMA_FOR_STRUCT(your_struct, "SchemaName", - * NVTX_PAYLOAD_ENTRIES( + * NCCL_NVTX_PAYLOAD_ENTRIES( * (index, TYPE_INT, "integer value"), * (dpfloat, TYPE_DOUBLE, "fp64 value"), * (text, TYPE_CSTRING, "text", NULL, 24) @@ -80,7 +80,7 @@ * * Example: * NVTX_DEFINE_STRUCT_WITH_SCHEMA(your_struct_name, "Your schema name", - * NVTX_PAYLOAD_ENTRIES( + * NCCL_NVTX_PAYLOAD_ENTRIES( * (int, index, TYPE_INT, "integer value"), * (double, dpfloat, TYPE_DOUBLE, "fp64 value"), * (const char, (text, 24), TYPE_CSTRING, "text", NULL, 24) diff --git a/src/include/nvtx_payload_schemas.h b/src/include/nvtx_payload_schemas.h new file mode 100644 index 0000000000..476f2a8612 --- /dev/null +++ b/src/include/nvtx_payload_schemas.h @@ -0,0 +1,178 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +/// Definitions of NVTX payload types and schemas used for the NVTX +/// instrumentation in init.cc and collectives.cc. + +#ifndef NVTX_PAYLOAD_SCHEMAS_H_ +#define NVTX_PAYLOAD_SCHEMAS_H_ + + +#include "nccl.h" +#include "nvtx3/nvToolsExtPayload.h" +#include "nvtx3/nvToolsExtPayloadHelper.h" + +/** + * \brief Define a C struct together with the matching schema entries. + * + * Does the same as `NVTX_DEFINE_STRUCT_WITH_SCHEMA`, but without creating the + * schema attributes. (Remove this helper when it is available in the NVTX headers.) + */ +#define NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(struct_id, prefix, entries) \ + _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) + +// C strings used as NVTX payload entry names. 
+static constexpr char const* nccl_nvtxCommStr = "NCCL communicator ID"; +static constexpr char const* nccl_nvtxCudaDevStr = "CUDA device"; +static constexpr char const* nccl_nvtxRankStr = "Rank"; +static constexpr char const* nccl_nvtxNranksStr = "No. of ranks"; +static constexpr char const* nccl_nvtxMsgSizeStr = "Message size [bytes]"; +static constexpr char const* nccl_nvtxMsgSizeSendStr = "Message size [bytes] (Send)"; +static constexpr char const* nccl_nvtxMsgSizeRecvStr = "Message size [bytes] (Recv)"; +static constexpr char const* nccl_nvtxReductionOpStrpStr = "Reduction operation"; +static constexpr char const* nccl_nvtxDataTypeStr = "Data type"; + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommInitAll, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, commhash, TYPE_UINT64, nccl_nvtxCommStr), + (int, ndev, TYPE_INT, "No. of devices") + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommInitRank, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr), + (int, nranks, TYPE_INT, nccl_nvtxNranksStr), + (int, myrank, TYPE_INT, nccl_nvtxRankStr), + (int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr) + ) +) +// The typedef and payload schema for ncclCommInitRank is also used for +// ncclCommInitRankConfig, ncclCommInitRankScalable, ncclCommDestroy, and ncclCommAbort. 
+typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommInitRankConfig; +typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommInitRankScalable; +typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommAbort; +typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommDestroy; + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommSplit, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr), + (uint64_t, parentcomm, TYPE_UINT64, "Parent NCCL communicator ID"), + (int, nranks, TYPE_INT, nccl_nvtxNranksStr), + (int, myrank, TYPE_INT, nccl_nvtxRankStr), + (int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr), + (int, color, TYPE_INT, "Color"), + (int, key, TYPE_INT, "Key") + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommFinalize, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllGather, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllReduce, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr), + (ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllToAll, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllToAllv, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, 
nccl_nvtxCommStr), + (size_t, sendBytes, TYPE_SIZE, nccl_nvtxMsgSizeSendStr), + (size_t, recvBytes, TYPE_SIZE, nccl_nvtxMsgSizeRecvStr), + (ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsBroadcast, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, root, TYPE_INT, "Root"), + (ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsGather, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, root, TYPE_INT, "Root"), + (ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsReduce, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, root, TYPE_INT, "Root"), + (ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr), + (ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsReduceScatter, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr), + (ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsScatter, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, root, TYPE_INT, "Root"), + (ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr) + ) +) + +// Used in NCCL APIs `ncclSend` and `ncclRecv`. 
+NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsSendRecv, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, peer, TYPE_INT, "Peer rank"), + (ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsMSCCL, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr), + (ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr) + ) +) + +#endif // end include guard diff --git a/src/include/nvtx_stub.h b/src/include/nvtx_stub.h index 73538cab3f..442d1cdd72 100755 --- a/src/include/nvtx_stub.h +++ b/src/include/nvtx_stub.h @@ -13,7 +13,10 @@ struct nccl_domain{static constexpr char const* name{"NCCL"};}; #define NVTX3_FUNC_RANGE_IN(domain) #define nvtxNameOsThreadA(syscall, thread) -#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) +#define NVTX3_FUNC_WITH_PARAMS(N, T, P) +#define NVTX3_PAYLOAD(...) 
__VA_ARGS__ +#define NVTX3_RANGE(T) +#define NVTX3_RANGE_ADD_PAYLOAD(N, S, P) #define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11 diff --git a/src/include/proxy.h b/src/include/proxy.h index 52b46b8557..37d4daf337 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -380,6 +380,8 @@ ncclResult_t ncclProxyStart(struct ncclComm* comm); ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS); ncclResult_t ncclProxyCreate(struct ncclComm* comm); ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn); + +// NB: ncclProxyMsgTypeStr[] in proxy.cc needs to match enum ncclProxyMsgType { ncclProxyMsgInit = 1, ncclProxyMsgSharedInit = 2, diff --git a/src/include/rocmwrap.h b/src/include/rocmwrap.h index e171a2d60e..03c51b8402 100644 --- a/src/include/rocmwrap.h +++ b/src/include/rocmwrap.h @@ -70,6 +70,9 @@ DECLARE_ROCM_PFN_EXTERN(hsa_status_string); extern int ncclCuMemEnable(); extern int ncclCuMemHostEnable(); +// Handle type used for cuMemCreate() +extern CUmemAllocationHandleType ncclCuMemHandleType; + ncclResult_t rocmLibraryInit(void); extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit() diff --git a/src/init.cc b/src/init.cc index 41d81a2cc8..44d63ecaef 100644 --- a/src/init.cc +++ b/src/init.cc @@ -25,6 +25,7 @@ #endif #include "tuner.h" #include "ras.h" +#include "mnnvl.h" #include #include #include @@ -39,6 +40,7 @@ #include "graph/xml.h" #include "archinfo.h" #include "param.h" +#include "nvtx_payload_schemas.h" // [RCCL] #include "git_version.h" @@ -436,6 +438,7 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->rankToNode); free(comm->rankToLocalRank); free(comm->collNetHeads); + free(comm->clique.ranks); if (comm->bootstrap) NCCLCHECK(bootstrapClose(comm->bootstrap)); @@ -860,6 +863,7 @@ static void showVersion() { } } +NCCL_PARAM(MNNVLUUID, "MNNVL_UUID", -1); 
NCCL_PARAM(MNNVLCliqueId, "MNNVL_CLIQUE_ID", -1); static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) { @@ -915,12 +919,16 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED; (void) ncclNvmlDeviceGetGpuFabricInfoV(nvmlDev, &info->fabricInfo); if (info->fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED) { + if (ncclParamMNNVLUUID() != -1) { + ((long*)&info->fabricInfo.clusterUuid)[0] = ncclParamMNNVLUUID(); + ((long*)&info->fabricInfo.clusterUuid)[1] = ncclParamMNNVLUUID(); + } + if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId(); INFO(NCCL_INIT, "MNNVL busId 0x%lx fabric UUID %lx.%lx cliqueId 0x%x state %d healthMask 0x%x", info->busId, ((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1], info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask); } - if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId(); } #endif @@ -990,71 +998,6 @@ NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0); // MNNVL: Flag to indicate whether to enable Multi-Node NVLink NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2); -#if CUDART_VERSION >= 11030 - -#include -#include "cudawrap.h" - -// Determine if MNNVL support is available -static int checkMNNVL(struct ncclComm* comm) { - ncclResult_t ret = ncclSuccess; - - // MNNVL requires cuMem to be enabled - if (!ncclCuMemEnable()) return 0; - - // MNNVL also requires FABRIC handle support - int cudaDev; - int flag = 0; - CUdevice currentDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - CUCHECK(cuDeviceGet(¤tDev, cudaDev)); - // Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported - (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));; - if (!flag) return 0; - // Check that all ranks have 
initialized the fabric fully - for (int i = 0; i < comm->nRanks; i++) { - if (comm->peerInfo[i].fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED) return 0; - } - - // Determine our MNNVL domain/clique - NCCLCHECKGOTO(ncclCalloc(&comm->clique.ranks, comm->nRanks), ret, fail); - comm->clique.id = comm->peerInfo[comm->rank].fabricInfo.cliqueId; - for (int i = 0; i < comm->nRanks; i++) { - nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[comm->rank].fabricInfo; - nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo; - // Check if the cluster UUID and cliqueId match - // A zero UUID means we don't have MNNVL fabric info - disable MNNVL - if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) goto fail; - if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) && - (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) { - if (i == comm->rank) { - comm->cliqueRank = comm->clique.size; - } - comm->clique.ranks[comm->clique.size++] = i; - } - } - // Determine whether to enable MNNVL or not - comm->MNNVL = ncclParamMNNVLEnable() == 2 ? 
comm->clique.size > 1 : ncclParamMNNVLEnable(); - INFO(NCCL_INIT, "MNNVL %d cliqueId %x cliqueSize %d cliqueRank %d ", comm->MNNVL, comm->clique.id, comm->clique.size, comm->cliqueRank); - - if (comm->MNNVL) { - // Force the CUMEM handle type to be FABRIC for MNNVL - ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC; - } - - return comm->MNNVL; - -fail: - if (comm->clique.ranks) free(comm->clique.ranks); - return 0; -} - -#else -static int checkMNNVL(struct ncclComm* comm) { - return 0; -} -#endif - #define TIMER_INIT_TOTAL 0 #define TIMER_INIT_KERNELS 1 #define TIMER_INIT_BOOTSTRAP 2 @@ -1142,12 +1085,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // AllGather1 - end timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER]; - // MNNVL support - if (nNodes > 1 && !checkMNNVL(comm) && ncclParamMNNVLEnable() == 1) { - // Return an error if the user specifically requested MNNVL support - WARN("MNNVL is not supported on this system"); - ret = ncclSystemError; - goto fail; + // Check for MNNVL support + if ((nNodes > 1 && ncclParamMNNVLEnable() != 0) || ncclParamMNNVLEnable() == 1) { + NCCLCHECKGOTO(ncclMnnvlCheck(comm), ret, fail); } do { @@ -1558,7 +1498,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p comm->collNetSupport = 0; } } - comm->isAllNvlink = ncclTopoPathAllNVLink(comm->topo); + NCCLCHECK(ncclTopoPathAllNVLink(comm->topo, &comm->isAllNvlink)); comm->isOneRPN = (comm->maxLocalRanks == 1); NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail); @@ -1916,6 +1856,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { int cudaDev = job->cudaDev; int* parentRanks = NULL; int cudaArch; + int maxSharedMem = 0; double sum_timers = 0; uint64_t timers[TIMERS_INIT_COUNT] = {0}; unsigned long long commIdHash; @@ -1924,12 +1865,13 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { timers[TIMER_INIT_TOTAL] = clockNano(); 
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail); + CUDACHECKGOTO(cudaDeviceGetAttribute(&maxSharedMem, cudaDevAttrMaxSharedMemoryPerBlockOptin, cudaDev), res, fail); CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev), res, fail); CUDACHECKGOTO(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev), res, fail); cudaArch = 100*archMajor + 10*archMinor; timers[TIMER_INIT_KERNELS] = clockNano(); - NCCLCHECK(ncclInitKernelsForDevice(cudaArch, &maxLocalSizeBytes)); + NCCLCHECK(ncclInitKernelsForDevice(cudaArch, maxSharedMem, &maxLocalSizeBytes)); // Set the maximum kernel stack size of all kernels to avoid // a CUDA memory reconfig on load (c.f. NVSHMEM issue) #ifdef USE_INDIRECT_FUNCTION_CALL @@ -2118,18 +2060,24 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) { comm->config.cgaClusterSize = cgaClusterSizeEnv; } else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) { - WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE); + INFO(NCCL_ENV, "NCCL_CGA_CLUSTER_SIZE value %d is too big. 
Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE); comm->config.cgaClusterSize = NCCL_MAX_CGA_CLUSTER_SIZE; } minCTAsEnv = ncclParamMinCTAs(); if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) { - comm->config.minCTAs = minCTAsEnv; + if (minCTAsEnv <= 0) + INFO(NCCL_ENV, "NCCL_MIN_CTAS %d is too low, leaving it set at %d", minCTAsEnv, comm->config.minCTAs); + else + comm->config.minCTAs = minCTAsEnv; } maxCTAsEnv = ncclParamMaxCTAs(); if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) { - comm->config.maxCTAs = maxCTAsEnv; + if (maxCTAsEnv <= 0) + INFO(NCCL_ENV, "NCCL_MAX_CTAS %d is too low, leaving it set at %d", maxCTAsEnv, comm->config.maxCTAs); + else + comm->config.maxCTAs = maxCTAsEnv; } envNetName = ncclGetEnv("NCCL_NET"); @@ -2150,22 +2098,22 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { /* cap channels if needed */ if (comm->config.minCTAs > MAXCHANNELS) { - WARN("minCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.minCTAs, MAXCHANNELS, MAXCHANNELS); + INFO(NCCL_ENV, "minCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.minCTAs, MAXCHANNELS, MAXCHANNELS); comm->config.minCTAs = MAXCHANNELS; } if (comm->config.maxCTAs > MAXCHANNELS) { - WARN("maxCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.maxCTAs, MAXCHANNELS, MAXCHANNELS); + INFO(NCCL_ENV, "maxCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.maxCTAs, MAXCHANNELS, MAXCHANNELS); comm->config.maxCTAs = MAXCHANNELS; } if (comm->config.minCTAs > comm->config.maxCTAs) { - WARN("minCTAs %d is larger than maxCTAs %d, set both to %d", comm->config.minCTAs, comm->config.maxCTAs, comm->config.maxCTAs); + INFO(NCCL_ENV, "minCTAs %d is larger than maxCTAs %d, set both to %d", comm->config.minCTAs, comm->config.maxCTAs, comm->config.maxCTAs); comm->config.minCTAs = comm->config.maxCTAs; } if (comm->config.splitShare != 1 && comm->config.splitShare != 0) { - WARN("splitShare %d is not a valid 
value 0/1, set it to 0", comm->config.splitShare); + INFO(NCCL_ENV, "splitShare %d is not a valid value 0/1, set it to 0", comm->config.splitShare); comm->config.splitShare = 0; } @@ -2351,21 +2299,10 @@ fail: goto exit; } -struct NvtxParamsCommInitRank -{ - int rank; - int nranks; - int cudaDev; -}; -constexpr nvtxPayloadSchemaEntry_t CommInitRankSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of ranks", nullptr, 0, offsetof(NvtxParamsCommInitRank, nranks)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommInitRank, cudaDev)}, -}; - NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); ncclResult_t ncclCommInitRank_impl(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { NCCLCHECK(Recorder::instance().record(rrCommInitRank, nranks, myrank, &commId)); + NVTX3_RANGE(NcclNvtxParamsCommInitRank) // Load the CUDA driver and dlsym hooks (can fail on old drivers) rocmLibraryInit(); @@ -2373,10 +2310,11 @@ ncclResult_t ncclCommInitRank_impl(ncclComm_t* newcomm, int nranks, ncclUniqueId ncclConfig_t config = NCCL_CONFIG_INITIALIZER; CUDACHECK(cudaGetDevice(&cudaDev)); - NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload) - NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, 1, &commId, myrank, cudaDev, &config, __func__)); + + NVTX3_RANGE_ADD_PAYLOAD(CommInitRank, NcclNvtxParamsCommInitRankSchema, + NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev)); + return ncclSuccess; } @@ -2389,10 +2327,7 @@ ncclResult_t ncclCommInitAll_impl(ncclComm_t* comms, int ndev, const int* devlis ncclConfig_t config = NCCL_CONFIG_INITIALIZER; int oldDev = 0; - constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. 
of devices"} - }; - NVTX3_FUNC_WITH_PARAMS(CommInitAll, CommInitAllSchema, ndev) + NVTX3_RANGE(NcclNvtxParamsCommInitAll); // Load the CUDA driver and dlsym hooks (can fail on old drivers) rocmLibraryInit(); @@ -2430,14 +2365,17 @@ ncclResult_t ncclCommInitAll_impl(ncclComm_t* comms, int ndev, const int* devlis ncclUniqueId uniqueId; NCCLCHECKGOTO(ncclGetUniqueId(&uniqueId), ret, fail); - NCCLCHECKGOTO(ncclGroupStart(), ret, fail); + NCCLCHECKGOTO(ncclGroupStartInternal(), ret, fail); for (int i=0; icommHash, ndev)); exit: (void)cudaSetDevice(oldDev); @@ -2464,14 +2402,14 @@ ncclResult_t ncclCommInitRankConfig_impl(ncclComm_t *newcomm, int nranks, ncclUn ncclResult_t ret = ncclSuccess; ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; ncclConfig_t *internalConfigPtr = NULL; + + NVTX3_RANGE(NcclNvtxParamsCommInitRankConfig); + NCCLCHECK(ncclGroupStartInternal()); rocmLibraryInit(); CUDACHECK(cudaGetDevice(&cudaDev)); - NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommInitRankConfig, CommInitRankSchema, payload) - if (config == NULL) internalConfigPtr = &internalConfig; else @@ -2481,7 +2419,13 @@ ncclResult_t ncclCommInitRankConfig_impl(ncclComm_t *newcomm, int nranks, ncclUn exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); - if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret); + if (newcomm && *newcomm) { + if (!(*newcomm)->config.blocking) { + (void) ncclCommGetAsyncError(*newcomm, &ret); + } + NVTX3_RANGE_ADD_PAYLOAD(CommInitRankConfig, NcclNvtxParamsCommInitRankSchema, + NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev)); + } return ret; fail: if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret); @@ -2490,6 +2434,8 @@ fail: NCCL_API(ncclResult_t, ncclCommInitRankScalable, ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config); ncclResult_t 
ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config) { + NVTX3_RANGE(NcclNvtxParamsCommInitRankScalable); + int cudaDev; ncclResult_t ret = ncclSuccess; ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; @@ -2499,9 +2445,6 @@ ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myran rocmLibraryInit(); CUDACHECK(cudaGetDevice(&cudaDev)); - NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommInitRankScalable, CommInitRankSchema, payload) - if (config == NULL) internalConfigPtr = &internalConfig; else @@ -2511,7 +2454,13 @@ ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myran exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); - if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret); + if (newcomm && *newcomm) { + if (!(*newcomm)->config.blocking) { + (void) ncclCommGetAsyncError(*newcomm, &ret); + } + NVTX3_RANGE_ADD_PAYLOAD(CommInitRankScalable, NcclNvtxParamsCommInitRankSchema, + NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev)); + } return ret; fail: if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret); @@ -2589,7 +2538,8 @@ static ncclResult_t commCleanup(ncclComm_t comm) { NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm); ncclResult_t ncclCommFinalize_impl(ncclComm_t comm) { NCCLCHECK(Recorder::instance().record(rrCommFinalize, comm)); - NVTX3_FUNC_RANGE_IN(nccl_domain); + NVTX3_RANGE(NcclNvtxParamsCommFinalize); + ncclResult_t ret = ncclSuccess; struct ncclCommFinalizeAsyncJob *job = NULL; @@ -2614,7 +2564,13 @@ ncclResult_t ncclCommFinalize_impl(ncclComm_t comm) { exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); - if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); } + if (comm) { + if (!comm->config.blocking) { + 
NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); + } + NVTX3_RANGE_ADD_PAYLOAD(CommFinalize, NcclNvtxParamsCommFinalizeSchema, + NVTX3_PAYLOAD(comm->commHash)); + } return ret; fail: free(job); @@ -2710,8 +2666,8 @@ ncclResult_t ncclCommDestroy_impl(ncclComm_t comm) { struct ncclCommFinalizeAsyncJob *job = NULL; ncclResult_t res = ncclSuccess; - NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommDestroy, CommInitRankSchema, payload) + NVTX3_FUNC_WITH_PARAMS(CommDestroy, NcclNvtxParamsCommInitRank, + NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev)); TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); NCCLCHECK(ncclGroupStartInternal()); @@ -2739,8 +2695,9 @@ fail: NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); ncclResult_t ncclCommAbort_impl(ncclComm_t comm) { NCCLCHECK(Recorder::instance().record(rrCommAbort, comm)); + NVTX3_RANGE(NcclNvtxParamsCommAbort); + if (comm == NULL) { - NVTX3_FUNC_RANGE_IN(nccl_domain); return ncclSuccess; } NCCLCHECK(ncclGroupStartInternal()); @@ -2761,8 +2718,8 @@ ncclResult_t ncclCommAbort_impl(ncclComm_t comm) { struct ncclCommFinalizeAsyncJob *job = NULL; ncclResult_t res = ncclSuccess; - NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload) + NVTX3_RANGE_ADD_PAYLOAD(CommAbort, NcclNvtxParamsCommInitRankSchema, + NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev)); TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); @@ -2778,29 +2735,13 @@ fail: goto exit; } -struct NvtxParamsCommSplit { - int rank; - int nranks; - int cudaDev; - int color; - int key; -}; -constexpr nvtxPayloadSchemaEntry_t CommSplitSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. 
of ranks", nullptr, 0, offsetof(NvtxParamsCommSplit, nranks)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommSplit, cudaDev)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "color", nullptr, 0, offsetof(NvtxParamsCommSplit, color)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "key", nullptr, 0, offsetof(NvtxParamsCommSplit, key)}, -}; - NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config); ncclResult_t ncclCommSplit_impl(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) { struct ncclCommInitRankAsyncJob *job = NULL; struct ncclComm* childComm = NCCL_COMM_NULL; ncclResult_t res = ncclSuccess; - NvtxParamsCommSplit payload{comm->rank, comm->nRanks, comm->cudaDev, color, key}; - NVTX3_FUNC_WITH_PARAMS(CommSplit, CommSplitSchema, payload) + NVTX3_RANGE(NcclNvtxParamsCommSplit) int oldDev; CUDACHECK(cudaGetDevice(&oldDev)); @@ -2862,6 +2803,12 @@ exit: (void)cudaSetDevice(oldDev); (void)ncclGroupErrCheck(res); NCCLCHECK(ncclGroupEndInternal()); + + if (res == ncclSuccess && *newcomm) { + NVTX3_RANGE_ADD_PAYLOAD(CommSplit, NcclNvtxParamsCommSplitSchema, + NVTX3_PAYLOAD((*newcomm)->commHash, comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, color, key)); + } + return res; fail: if (childComm) { diff --git a/src/misc/rocmwrap.cc b/src/misc/rocmwrap.cc index 2102641bff..9095fac8a4 100644 --- a/src/misc/rocmwrap.cc +++ b/src/misc/rocmwrap.cc @@ -24,6 +24,8 @@ DECLARE_ROCM_PFN(hsa_init); DECLARE_ROCM_PFN(hsa_system_get_info); DECLARE_ROCM_PFN(hsa_status_string); +// Handle type used for cuMemCreate() +CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; static void *hsaLib; static uint16_t version_major, version_minor; diff --git a/src/mnnvl.cc b/src/mnnvl.cc new file mode 100644 index 0000000000..cff4a49565 --- /dev/null +++ b/src/mnnvl.cc @@ -0,0 +1,82 @@ 
+/************************************************************************* + * Copyright (c) 2015-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "mnnvl.h" +#include "transport.h" +#include +#include "rocmwrap.h" + +// Determine if MNNVL support is available +ncclResult_t ncclMnnvlCheck(struct ncclComm* comm) { + // MNNVL requires cuMem to be enabled + if (!ncclCuMemEnable()) return ncclSuccess; + + // MNNVL also requires FABRIC handle support + int cudaDev; + int flag = 0; + CUdevice currentDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + CUDACHECK(cuDeviceGet(&currentDev, cudaDev)); + // Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported + (void) cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev); + if (!flag) return ncclSuccess; + // Check that all ranks have initialized the fabric fully + for (int i = 0; i < comm->nRanks; i++) { + if (comm->peerInfo[i].fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED) return ncclSuccess; + } + + // Determine our MNNVL domain/clique + NCCLCHECK(ncclCalloc(&comm->clique.ranks, comm->nRanks)); + comm->clique.id = comm->peerInfo[comm->rank].fabricInfo.cliqueId; + for (int i = 0; i < comm->nRanks; i++) { + nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[comm->rank].fabricInfo; + nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo; + // Check if the cluster UUID and cliqueId match + // A zero UUID means we don't have MNNVL fabric info - disable MNNVL + if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess; + if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) && + (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) { + if (i == comm->rank) { + comm->cliqueRank = comm->clique.size; + } + comm->clique.ranks[comm->clique.size++]
= i; + } + } + + // No MNNVL clique found + if (comm->clique.size <= 1) return ncclSuccess; + + // Check that FABRIC handles can be exported & imported by IMEX + { + void *ptr = NULL; + CUmemGenericAllocationHandle handle; + ncclCuDesc cuDesc; + CUresult err; + + // Allocate FABRIC handle compatible memory + ncclResult_t ret = ncclCuMemAlloc(&ptr, &handle, CU_MEM_HANDLE_TYPE_FABRIC, CUDA_IPC_MIN); + if (ret != ncclSuccess) return ncclSuccess; + err = cuMemExportToShareableHandle(&cuDesc, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0); + if (err != CUDA_SUCCESS || + (err = cuMemImportFromShareableHandle(&handle, &cuDesc, CU_MEM_HANDLE_TYPE_FABRIC)) != CUDA_SUCCESS) { + const char *errStr; + (void) cuGetErrorString(err, &errStr); + NCCLCHECK(ncclCuMemFree(ptr)); + // Return an error if this is a MNNVL capable system but it's not working + WARN("MNNVL (cliqueSize %d) is available but not supported on this system. Check the IMEX configuration.", + comm->clique.size); + return ncclSystemError; + } + NCCLCHECK(ncclCuMemFree(ptr)); + + // Force the CUMEM handle type to be FABRIC for MNNVL + ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC; + comm->MNNVL = 1; + INFO(NCCL_INIT, "MNNVL %d cliqueId %x cliqueSize %d cliqueRank %d", + comm->MNNVL, comm->clique.id, comm->clique.size, comm->cliqueRank); + } + return ncclSuccess; +} diff --git a/src/msccl.cc b/src/msccl.cc index 312b8b2ce4..f19c6f863f 100644 --- a/src/msccl.cc +++ b/src/msccl.cc @@ -8,6 +8,7 @@ #include "msccl/msccl_setup.h" #include "msccl/msccl_status.h" #include "api_trace.h" +#include "nvtx_payload_schemas.h" #include #include @@ -49,21 +50,8 @@ ncclResult_t mscclRunAlgo_impl( size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream) { Recorder::instance().record("mscclRunAlgo"); - struct NvtxParamsMsccl { - size_t bytes; - ncclRedOp_t op; - ncclDataType_t dataType; - }; - // Just pass the size of one send/recv messages and not the 
total bytes sent/received. - constexpr nvtxPayloadSchemaEntry_t MscclSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsMsccl, op)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0, - offsetof(NvtxParamsMsccl, dataType)} - }; - NvtxParamsMsccl payload{count * ncclTypeSize(dataType), op, dataType}; - NVTX3_FUNC_WITH_PARAMS(MSCCL, MscclSchema, payload) + NVTX3_FUNC_WITH_PARAMS(MSCCL, NcclNvtxParamsMSCCL, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(dataType), op, dataType)); mscclStatus& status = mscclGetStatus(comm->rank); struct mscclAlgo* hostAlgo = status.hostAlgos[mscclAlgoHandle]; diff --git a/src/proxy.cc b/src/proxy.cc index 67cf6cfd1b..c6ff51a031 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -23,6 +23,8 @@ #include #include +#define NCCL_MAX_PROXY_CONNECTIONS (NCCL_MAX_LOCAL_RANKS+1) + void* ncclProxyServiceUDS(void* _args); static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) { @@ -799,8 +801,8 @@ process_nextops: ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle); ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlAppend); TIME_START(2); - int freeOp[NCCL_MAX_LOCAL_RANKS]; - int freeOpEnd[NCCL_MAX_LOCAL_RANKS]; + int freeOp[NCCL_MAX_PROXY_CONNECTIONS]; + int freeOpEnd[NCCL_MAX_PROXY_CONNECTIONS]; for (int i = 0; i < proxyState->tpLocalnRanks; i++) freeOp[i] = -1; uint64_t lastOpCount = 0; @@ -1090,7 +1092,8 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in struct ncclProxyState* sharedProxyState = comm->proxyState; int tpProxyRank = comm->topParentRanks[proxyRank]; - proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 
1 : 0; + proxyConn->sameProcess = ((comm->peerInfo[proxyRank].hostHash == comm->peerInfo[comm->rank].hostHash) && + (comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash)) ? 1 : 0; // Keep one connection per local rank proxyConn->connection = NULL; proxyConn->tpRank = tpProxyRank; @@ -1223,7 +1226,7 @@ fail: goto exit; } -const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd" }; +const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd", "QueryFd", "Register", "Deregister" }; ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) { struct ncclSocket* sock; ncclResult_t ret = ncclSuccess; @@ -1583,18 +1586,18 @@ void* ncclProxyService(void* _args) { connectionPool.banks = 0; connectionPool.offset = NCCL_PROXY_CONN_POOL_SIZE; - struct pollfd pollfds[NCCL_MAX_LOCAL_RANKS+1]; - struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS]; - memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS); - for (int s=0; slistenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) { + if (ncclSocketGetFd(proxyState->listenSock, &pollfds[NCCL_MAX_PROXY_CONNECTIONS].fd) != ncclSuccess) { WARN("[Proxy Service] Get listenSock fd fails"); return NULL; }; - pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN; + pollfds[NCCL_MAX_PROXY_CONNECTIONS].events = POLLIN; int maxnpeers = 0; int npeers = 0; @@ -1608,17 +1611,19 @@ void* ncclProxyService(void* _args) { /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */ int ret; do { - ret = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : 500); + // poll all fds including the listenSock + ret = poll(pollfds, NCCL_MAX_PROXY_CONNECTIONS+1, asyncOpCount ? 
0 : 500); } while (ret < 0 && errno == EINTR); if (ret < 0) { WARN("[Proxy Service] Poll failed: %s", strerror(errno)); return NULL; } - if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) { + if (pollfds[NCCL_MAX_PROXY_CONNECTIONS].revents) { + // We got an event on the listenSock int s = 0; - while (s < NCCL_MAX_LOCAL_RANKS && pollfds[s].fd >= 0) s++; - if (s == NCCL_MAX_LOCAL_RANKS) { - WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_LOCAL_RANKS); + while (s < NCCL_MAX_PROXY_CONNECTIONS && pollfds[s].fd >= 0) s++; + if (s == NCCL_MAX_PROXY_CONNECTIONS) { + WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_PROXY_CONNECTIONS); return NULL; } if (maxnpeers < s+1) maxnpeers = s+1; @@ -1851,6 +1856,7 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) { if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) { if (*comm->abortFlag == 0 && sharedProxyState->peerAddresses) { + // We need to send a ncclProxyMsgStop message to our own proxy struct ncclSocket sock; int type = ncclProxyMsgStop; NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag)); diff --git a/src/ras/client_support.cc b/src/ras/client_support.cc index 414a1ed94f..3e4e9a5042 100644 --- a/src/ras/client_support.cc +++ b/src/ras/client_support.cc @@ -80,7 +80,7 @@ static int rasOutBufferSize = 0; // We use them all over the place; no point in wasting the stack... static char lineBuf[1024]; // Temporary buffer used for printing at most 10 (RAS_CLIENT_DETAIL_THRESHOLD) rank numbers - // or for printing the local GPU devices, which can't be more than 64 (NCCL_MAX_LOCAL_RANKS) + // or for printing the local GPU devices, which can't be more than 64 // small numbers (times two if the NVML mask is different than the CUDA mask). 
// Still, 1024 should normally be plenty (verbose output may make things more difficult, // but we do check for overflows, so it will just be trimmed). @@ -1687,7 +1687,7 @@ static int rasCommRanksCollOpCompare(const void* p1, const void* p2) { const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size) { bool first = true; buf[0] = '\0'; - for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++) + for (int i = 0; i < sizeof(cudaDevs)*8; i++) if (cudaDevs & (1UL << i)) { snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i); first = false; @@ -1695,7 +1695,7 @@ const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, if (cudaDevs != nvmlDevs) { snprintf(buf+strlen(buf), size-strlen(buf), " (NVML "); first = true; - for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++) + for (int i = 0; i < sizeof(nvmlDevs)*8; i++) if (nvmlDevs & (1UL << i)) { snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i); first = false; diff --git a/src/ras/ras_internal.h b/src/ras/ras_internal.h index 68cac0b44b..715fff4a46 100644 --- a/src/ras/ras_internal.h +++ b/src/ras/ras_internal.h @@ -78,7 +78,7 @@ struct rasCollResponse { struct rasPeerInfo { union ncclSocketAddress addr; pid_t pid; - uint64_t cudaDevs; // Bitmask. Conveniently, NCCL_MAX_LOCAL_RANKS == 64. + uint64_t cudaDevs; // Bitmask. This is for local devices so 64 bits is enough. uint64_t nvmlDevs; // Same, but not affected by CUDA_VISIBLE_DEVICES. }; diff --git a/src/register/coll_reg.cc b/src/register/coll_reg.cc index ef59514287..1f1c62da2d 100644 --- a/src/register/coll_reg.cc +++ b/src/register/coll_reg.cc @@ -73,15 +73,19 @@ ncclResult_t ncclRegisterCollNvlsBuffers( if (nvlsReged) { *regNeedConnect = 0; - /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to - * saturate bandwidth. */ + /* tweak NVLS channels usage; for registered NVLS buffer to saturate bandwidth. 
*/ if (comm->nNodes == 1) { - if (info->func == ncclFuncReduceScatter) - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5)); - else - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4)); + if (info->func == ncclFuncReduceScatter) { + // RS: Further tweaks for Blackwell with NVLS registered buffers + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 6 : 5)); + } + else { + // AR/AG: Further tweaks for Blackwell with NVLS registered buffers + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 8 : 4)); + } } else { - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6)); + // Further tweaks for Blackwell with NVLS registered buffers + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 7 : 6)); } info->regBufType |= NCCL_NVLS_REG_BUFFER; } diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index 582c30a353..3fe25a324c 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -141,9 +141,11 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr, #include "channel.h" #define NVLS_MEM_ALIGN_SIZE (1 << 21) +#define NVLS_NCHANNELS_SM90 16 +#define NVLS_NCHANNELS_SM100 32 NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 2); -NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16); +NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", -2); NCCL_PARAM(NvlsChunkSize, "NVLS_CHUNKSIZE", 128*1024); ncclResult_t ncclNvlsInit(struct ncclComm* comm) { @@ -152,7 +154,7 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) { int gpuCount; NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount)); - if (!ncclParamNvlsEnable() || ((!comm->MNNVL && gpuCount <= 2) || (comm->MNNVL && comm->clique.size <= 2))) return ncclSuccess; + if (!ncclParamNvlsEnable() || gpuCount <= 2) return ncclSuccess; CUdevice dev; int 
driverVersion; @@ -170,7 +172,11 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) { } INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev); - if (comm->nvlsSupport == 1) comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (int)ncclParamNvlsChannels())); + if (comm->nvlsSupport) { + int channels = (comm->compCap >= 100) ? NVLS_NCHANNELS_SM100 : NVLS_NCHANNELS_SM90; + if (ncclParamNvlsChannels() >= 0) channels = ncclParamNvlsChannels(); + comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, channels)); + } return ncclSuccess; } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 7d561582e7..12f4f0e1f3 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -228,7 +228,7 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int refcount, ncclIpcDe // cuMem API support CUmemGenericAllocationHandle handle; - NCCLCHECK(ncclCuMemAlloc(ptr, &handle, size)); + NCCLCHECK(ncclCuMemAlloc(ptr, &handle, type, size)); if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { // Return the native cuMem handle for later Export/Import via UDS memcpy(&ipcDesc->cuDesc.data, &handle, sizeof(handle)); @@ -852,7 +852,7 @@ ncclResult_t ret = ncclSuccess; if (isLegacyIpc) *isLegacyIpc = false; if (regRecord) { // buffer was registered by by users, we need to start to register or reuse it - int peerLocalRank; + int peerLocalRank = -1; for (int p = 0; p < nPeers; p++) { int peerRank = peerRanks[p]; peerLocalRank = comm->rankToLocalRank[peerRank]; @@ -924,8 +924,10 @@ ncclResult_t ret = ncclSuccess; ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr; // Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side // and get the remote register address back. 
- if (proxyConn) + if (proxyConn) { + INFO(NCCL_REG, "rank %d - IPC registering buffer %p size %ld (baseAddr %p size %ld) to peer %d", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank); NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail); + } if (rmtRegAddr) { NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail); assert(regRecord->ipcInfos[peerLocalRank] == NULL); @@ -943,7 +945,7 @@ ncclResult_t ret = ncclSuccess; regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr; needUpdate = true; *regBufFlag = 1; - INFO(NCCL_REG, "rank %d - IPC register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr); + INFO(NCCL_REG, "rank %d - IPC registered buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr); } } } @@ -1077,6 +1079,8 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str assert(sizeof(struct p2pIpcExpInfo) == reqSize); assert(sizeof(void*) == respSize); + INFO(NCCL_REG, "Proxy rank %d register reqBuff %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, reqBuff, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess); + // request peer passes all necessary buffer info to import. 
The proxy thread would register // the buffer locally and return register addr back if (ipcExpInfo->legacyIpcCap) { @@ -1110,7 +1114,7 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset); #endif } - INFO(NCCL_REG, "Proxy rank %d register succeeds, regAddr %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, regAddr, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess); + INFO(NCCL_REG, "Proxy rank %d register success regAddr %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, regAddr, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess); exit: memcpy(respBuff, (void*)&regAddr, sizeof(void*));