Merge remote-tracking branch 'nccl/master' into develop

Este commit está contenido en:
BertanDogancay
2025-04-30 13:30:11 -05:00
Se han modificado 41 ficheros con 1315 adiciones y 504 borrados
+3
Ver fichero
@@ -419,6 +419,7 @@ set(SRC_FILES
src/group.cc
src/init.cc
src/init_nvtx.cc
src/mnnvl.cc
src/net.cc
src/msccl.cc
src/proxy.cc
@@ -485,6 +486,7 @@ set(SRC_FILES
src/include/ibvwrap.h
src/include/info.h
src/include/ipcsocket.h
src/include/mnnvl.h
src/include/nccl_common.h
src/include/nccl_net.h
src/include/nccl_profiler.h
@@ -493,6 +495,7 @@ set(SRC_FILES
src/include/net.h
src/include/nvmlwrap.h
src/include/nvtx.h
src/include/nvtx_payload_schemas.h
src/include/nvtx_stub.h
src/include/p2p.h
src/include/param.h
+318
Ver fichero
@@ -0,0 +1,318 @@
# NCCL Profiler Plugin Documentation
This page describes the NCCL Profiler plugin API and how to implement a profiler plugin for NCCL.
# Overview
To allow NCCL to better integrate with DL frameworks, NCCL v2.23 introduced a profiler plugin
interface. Any NCCL user can write profiler plugins to extract performance data from NCCL and
use it for debugging and analysis.
Similarly to other plugins (e.g., network plugin), the profiler plugins come as a shared library
called `libnccl-profiler.so`. That shared library contains one or more implementations of the
NCCL PROFILER API, in the form of versioned structs, filled with pointers to all required
functions.
# Plugin architecture
## Plugin name and supporting multiple profiler plugins
When NCCL is initialized, it will look for a `libnccl-profiler.so` library and dynamically load
it, then look for symbols inside the library.
The `NCCL_PROFILER_PLUGIN` environment variable allows multiple plugins to coexist. If set, NCCL
will look for a library with a name of `libnccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. It is therefore
advised to name the library following that pattern, with a symlink pointing `libnccl-profiler.so`
to `libnccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. That way, if there are multiple plugins in the
path, setting `NCCL_PROFILER_PLUGIN` will allow users to select the right plugin. Alternatively,
the user can also set `NCCL_PROFILER_PLUGIN` to the pathname of the `libnccl-profiler.so` library.
## Struct versioning
Once a library is found, NCCL will look for a symbol named `ncclProfiler_vX`, with `X` increasing
over time. The versioning ensures that the plugin and the NCCL core are compatible.
Plugins are encouraged to provide multiple of those symbols, implementing multiple versions of the
NCCL PROFILER API, so that the same plugin can be compiled and support a wide range of NCCL versions.
Conversely, and to ease transition, NCCL can choose to support different plugin versions, looking
for the latest ncclProfiler struct version, but also looking for older ones so that older plugins
would still work.
## Headers management
To help users build plugins effortlessly, plugins should copy the `ncclProfiler_vX` definitions
they support to their internal includes. An example is shown in `ext-profiler/example` where we
keep all headers in the `nccl/` directory and provide thin layers to implement old versions on top
of newer ones.
The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
from old API versions. It also provides error codes in `err.h`.
# API (v2)
Below is the main `ncclProfiler_v2` struct. Each function is explained in later sections.
```
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
// - eState : event state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v2_t;
```
## Error codes
As a rule of thumb, profiler generated errors should not be propagated to NCCL and alter its normal
functioning. Nevertheless, the profiler interface returns NCCL error codes, in case any need for
them arises in the future. For now, any profiler interface call should only return `ncclSuccess`.
The only exception is `init` that can return an error so that NCCL can disable the plugin.
## Operation overview
NCCL will call the `init` function first for every new communicator that is initialized. The profiler
returns an opaque context handle that is used to isolate profiler instances across communicators.
Similarly, NCCL will call `finalize` to destroy the profiler context, thus freeing resources.
The NCCL core code is instrumented with calls to `startEvent`, `stopEvent` and `recordEventState`.
These are used to start, stop and update events in the profiler, respectively.
## API Functions
### Initialization
#### name
The `name` field should point to a character string with the name of the profiler plugin. This will
be used for all logging, especially when `NCCL_DEBUG=INFO` is set.
#### init
As soon as NCCL finds the plugin and the correct ncclProfiler symbol, it calls its `init` function.
This allows the plugin to initialize its internal context, used during profiling of NCCL events.
If the `init` function does not return `ncclSuccess`, NCCL disables the plugin.
#### finalize
When the profiler is no longer needed, a call to `finalize` destroys the profiler context and frees
up resources.
### Profiling
#### startEvent
When NCCL needs to start profiling a new event it calls `startEvent`. `startEvent` takes the profiler
context, previously created by `init`, an event descriptor of type `ncclProfilerEventDescr_t` and
returns an opaque profiler event handle that can be passed to other profiler functions, as discussed
later in the document.
The event descriptor contains all the event metadata. Every event type has its own descriptor. Below
is the `ncclProfilerEventDescr_v2_t` struct.
```
typedef struct {
uint8_t type; // event type (e.g., ncclProfileGroup, ncclProfileColl, ...)
void* parentObj; // pointer to parent event used to expose the event hierarchy to the profiler
int rank; // rank that generated the event
union {
struct { // collective events metadata
const char* name; // string containing name of the communicator
uint64_t commHash; // unique hash/id for the communicator
uint64_t seqNumber; // sequence number of this collective operation in the communicator
const char* func; // string containing name of the collective
void const* sendBuff; // address of send buffer
void* recvBuff; // address of recv buffer
size_t count; // data count
int root; // root rank
const char* datatype; // string containing the name of the datatype
size_t trafficBytes; // number of transfer bytes
uint8_t nMaxChannels; // max number of channels for this collective
uint8_t nWarps; // number of GPU warps for this collective
const char* algo; // string containing name of the algorithm for this collective
const char* proto; // string containing name of the protocol for this collective
} coll;
struct { // point-to-point events metadata
const char* name;
uint64_t commHash;
const char* func;
void* buff;
const char* datatype;
size_t count;
int peer; // peer rank for this point-to-point
} p2p;
struct { // proxyOp events metadata
pid_t pid; // process id that generated the associated `ncclProxyOp` object
uint8_t channelId; // id of the channel used by the associated `ncclProxyOp` object
int peer; // peer rank
int nSteps; // number of network transfers/steps required by the `ncclProxyOp`
int chunkSize; // chunk size for this `ncclProxyOp`
int isSend; // set to 1 for sends and 0 for recvs
} proxyOp;
struct { // proxyStep events metadata
int step; // individual step in `ncclProxyOp`
} proxyStep;
};
} ncclProfilerEventDescr_v2_t;
```
NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
`ncclProfileProxyOp`, `ncclProfileProxyStep`, and `ncclProfileProxyCtrl`.
#### stopEvent
`stopEvent` takes the event handle returned by `startEvent` to stop the event. After the event
has been stopped the handle can no longer be used with other profiler calls. Using the event
handle after `stopEvent` is undefined behavior.
#### recordEventState
Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl`,
and `ncclProfileP2p` cannot be updated through calls to `recordEventState`.
`ncclProfileProxyOp`, `ncclProfileProxyStep` and `ncclProfileProxyCtrl` can be updated through
calls to `recordEventState`.
The state of proxy generated events can be updated, along with event attributes, using
`recordEventState`. These events can go through several states during their lifecycle.
The list of supported states for the proxy-defined events is reported below.
```
typedef enum {
// ncclProfileProxyOp event states
ncclProfilerProxyOpSendPosted, // state marks the posting of send buffer to GPU for given network transfer/step
ncclProfilerProxyOpSendRemFifoWait, // state marks the waiting of CTS credits from peer rank
ncclProfilerProxyOpSendTransmitted, // state marks the sending of network transfer/step to peer rank
ncclProfilerProxyOpSendDone, // state marks the ending of network transfer/step
ncclProfilerProxyOpRecvPosted, // state marks the posting of recv to network for given network transfer/step
ncclProfilerProxyOpRecvReceived, // state marks the recving of network transfer/step from peer rank
ncclProfilerProxyOpRecvTransmitted, // state marks the ending of the network transfer/step
ncclProfilerProxyOpRecvDone, // state marks the consuming of data from GPU
// ncclProfileProxyStep event states
ncclProfilerProxyStepSendGPUWait, // state marks the waiting of send data from GPU for given network transfer/step
ncclProfilerProxyStepSendWait, // state marks the waiting of send data from network for given network transfer/step
ncclProfilerProxyStepRecvWait, // state marks the waiting of recv data from network for given network transfer/step
ncclProfilerProxyStepRecvFlushWait, // state marks the waiting of recv data flush to GPU for given network transfer/step
ncclProfilerProxyStepRecvGPUWait, // state marks the waiting of recv data consumption from GPU for given network transfer/step
// ncclProfileProxyCtrl event states
ncclProfilerProxyCtrlIdle, // state marks proxy progress thread idle
ncclProfilerProxyCtrlActive, // state marks proxy progress thread active
ncclProfilerProxyCtrlSleep, // state marks proxy progress thread sleeping
ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up
ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin
ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end
} ncclProfilerEventState_v2_t;
```
`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
network requests for the GPU kernel. ProxyOp events are generated for every active channel and
provide a summary of the activity of the proxy progress thread for that channel.
`ncclProfileProxyStep` events are generated by the proxy progress thread while it is processing
network requests for the GPU kernel. ProxyStep events describe individual network transfer in
the channel. Thus, they provide a more fine-grained view w.r.t. ProxyOp events.
`ncclProfileProxyCtrl` events are generated by the proxy progress thread while it is not processing
network requests for the GPU kernel. This includes everything else that the proxy thread might be
doing, including appending new `ncclProxyOp` objects to the list of work elements to process.
State transitions for the events described can also come with event attribute updates. For this
reason the profiler defines the `ncclProfilerEventStateArgs_v2_t` union, reported below.
```
typedef union {
struct { // attributes to update for ncclProfileProxyOp events
size_t transSize; // data transferred thus far
int steps; // network transfer/steps processed thus far
} proxyOp;
struct { // attributes to update for ncclProfileProxyCtrl
int appendedProxyOps; // number of appended proxy ops thus far
} proxyCtrl;
} ncclProfilerEventStateArgs_v2_t;
```
The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
### Event hierarchy
NCCL core events (reported above) are organized into a hierarchy as reported below:
```
Group event
|
+- Collective event
| |
| +- ProxyOp event
| |
| +- ProxyStep event
|
+- Point-to-point event
|
+- ProxyOp event
|
+- ProxyStep event
ProxyCtrl event
```
# Profiler instrumentation and logging
## Profiling of collective and p2p operations
The NCCL code is instrumented with profiler callbacks at different levels to capture start/stop of groups,
collective and point-to-point operations, as well as proxy progress activity. Due to the asynchronous nature
of NCCL operations, events associated to collective and point-to-point operations are not easy to delimit
precisely. For example, without proxy and/or kernel activity it is impossible for the profiler to
figure out when a collective operation completes. Therefore, `stopEvent` for collectives simply indicates to
the profiler that the collective has been enqueued. The profiler can leverage proxy event information, if
these are enabled, to estimate when the collective ends. In this case, the profiler can look at the `stopEvent`
call of the last `ncclProfileProxyOp` event to mark the completion of the associated collective event. This
can be achieved by reference counting the collective event and letting calls to `startEvent` and `stopEvent`
increment and decrement the reference counter, respectively.
## PXN
PXN causes some proxy operations to be processed in a remote proxy thread that differs from the one that
generated the operation. When this happens, the event hierarchy reported above breaks. Because the
profiler can use the hierarchy information, provided by NCCL in the event descriptor, to dereference the
parent event during `startEvent`, the remote proxy thread must be in the same address space as the proxy
thread originating the operation. To prevent the profiler instance in the remote proxy address space from
dereferencing a pointer from another address space, the event descriptor includes the PID of the originator.
The profiler plugin needs to check that the originator PID matches the local PID before dereferencing the
parent event.
+239
Ver fichero
@@ -0,0 +1,239 @@
# NCCL Example Profiler Plugin Usage
This page describes how to use the NCCL example profiler plugin.
# Overview
The example profiler plugin implements the NCCL profiler plugin API introduced in NCCL v2.23. The API
defines a set of events and data structures that NCCL uses to share event information with profiler
plugins. The user can control what events are instrumented by NCCL and when traces collected by the
profiler should be dumped through environment variables, as described in the rest of the document.
The user can also control other profiler parameters that alter its behavior. For example, users can
change the size of the event window the profiler keeps track of.
## Building the profiler plugin
To use the example plugin, just type `make`. You will need a NCCL build's include directory present.
You can override `NCCL_HOME` to where the NCCL installation is on your system.
## Using the profiler plugin
1. Add the directory of this profiler plugin to your `LD_LIBRARY_PATH` or set the `NCCL_PROFILER_PLUGIN`,
as documented in `ext-profiler/README.md`.
2. Set `NCCL_PROFILE_EVENT_MASK` bitmask to specify the NCCL events you want to instrument. By
default, all collectives and send/recv operations will be traced. For more details about the event
representation used by the profiler refer to `ext-profiler/README.md`.
As an example, setting:
`NCCL_PROFILE_EVENT_MASK` to 1 (`ncclProfileGroup`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`)
enables the profiling of the group, the collective and the proxy op events. The same events can be
expressed more concisely by setting `NCCL_PROFILE_EVENT_MASK` to 8 (`ncclProfileProxyOp`). Indeed,
in NCCL all the events above (in the event hierarchy) the one requested are also captured. The advantage
is that the profiler can easily correlate events that belong to the same NCCL operation and present
them accordingly.
3. Set `NCCL_PROFILE_DUMP_FILE` to the name of the dump file for the collected traces. A file named
${NCCL_PROFILE_DUMP_FILE}-hostname-tid.txt is created. Profiler traces are saved using the chrome
event format (more precisely, using asynchronous events).
4. If you set the dump file variable, type chrome://tracing on your chromium browser search bar and
open the created dump file to visualize the traces.
# Changing the profiler memory pool sizes
The example profiler uses separate memory pools for different types of events. The size of these memory
pools (i.e., the # events) determines the number of events that the profiler can keep track of at the
same time. When NCCL requests a new event (e.g., collective event) to profile a `ncclAllReduce`
operation, by calling `startEvent`, the profiler searches in the collective pool for a free event. If it
finds one, it marks it as in use and returns the handle to NCCL. If the pool is completely used the
profiler returns `NULL` to NCCL and ignores all the following NCCL profiler calls for the `NULL` event
handle. When the `ncclAllReduce` has been processed, NCCL calls `stopEvent` with the previously returned
event handle. The profiler has a total of 5 memory pools.
The group, collective and p2p pools contain objects for the corresponding events. The `ProxyCtrl` pool
contains objects for `ProxyCtrl` events and the `ProxyDetach` pool contains objects for `ProxyOp` events
generated by remote proxies. A list of pools and their size is reported below:
- `NCCL_PROFILE_GROUP_POOL_SIZE` (16)
- `NCCL_PROFILE_COLL_POOL_SIZE` (16)
- `NCCL_PROFILE_P2P_POOL_SIZE` (1024)
- `NCCL_PROFILE_PROXY_CTRL_POOL_SIZE` (16)
- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (128)
Remote proxy operations are generated when PXN is in use. Refer to this article for more information
about PXN and how it works:
https://developer.nvidia.com/blog/doubling-all2all-performance-with-nvidia-collective-communication-library-2-12/
# Reported events
The example profiler generates traces using the JSON format. An example trace is reported below:
```
[
{"name": "Group", "cat": "GROUP", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764234.611328, "args": {"groupId": 0}},
{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764237.294922, "args": {"SeqNum": 0, "CommHash": 673864846479792718, "Rank": 1, "Count": 32768, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "LL", "nMaxChannels": 2}},
{"name": "Recv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768464.936523, "args": {"Channel": 0, "Peer": 0, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 772020.300781}, "RECEIVED": {"step": 14, "ts": 772196.049805}, "TRANSMITTED": {"step": 14, "ts": 772197.326172}, "DONE": {"step": 14, "ts": 772201.538086}}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768465.158203, "args": {"Step": 0}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805, "args": {"Step": 0}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266, "args": {"Step": 0}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805, "args": {"Step": 0}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768568.276367},
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768503.604492, "args": {"Step": 1}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805, "args": {"Step": 1}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234, "args": {"Step": 1}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695, "args": {"Step": 1}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 770006.914062},
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768506.941406, "args": {"Step": 2}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547, "args": {"Step": 2}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133, "args": {"Step": 2}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547, "args": {"Step": 2}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771468.458008},
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768509.484375, "args": {"Step": 3}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000, "args": {"Step": 3}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023, "args": {"Step": 3}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211, "args": {"Step": 3}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771910.500000},
{"name": "Send", "cat": "PROXY", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768482.878906, "args": {"Channel": 0, "Peer": 2, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 771995.675781}, "REM_FIFO_WAIT": {"step": 14, "ts": 772190.692383}, "TRANSMITTED": {"step": 14, "ts": 772191.516602}, "DONE": {"step": 14, "ts": 772208.473633}}},
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.019531, "args": {"Step": 0}},
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781, "args": {"Step": 0}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234},
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234, "args": {"Step": 0}},
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769618.889648},
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.083008, "args": {"Step": 1}},
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086, "args": {"Step": 1}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664},
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664, "args": {"Step": 1}},
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769622.517578},
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768507.937500, "args": {"Step": 2}},
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578, "args": {"Step": 2}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883},
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883, "args": {"Step": 2}},
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770013.848633},
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.742188, "args": {"Step": 3}},
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266, "args": {"Step": 3}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477},
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477, "args": {"Step": 3}},
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771469.171875},
... [ trace truncated for brevity ]
{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.317383},
{"name": "Group", "cat": "GROUP", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.418945},
{}]
```
Details about the fields used in the trace can be found at this link:
https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw
The trace above is obtained by running a `ncclAllReduce` operation on 8 GPUs, communicating with each other through
the network interface. The `Group` event encloses all traces that are related to the single `ncclAllReduce` call.
(Note that for single collective invocations, where there are no explicit group calls, NCCL creates a group with only
one collective and this is what is presented in the traces above).
The `AllReduce` event encloses traces for the proxy operation associated to the `ncclAllReduce` operation. The `args`
field in the traces contains NCCL specific information (aside from the chrome trace event format).
## AllReduce trace
The `AllReduce` entry presents information about the `ncclAllReduce` operation. It contains the following info in the args field:
- seqNum : sequential number of the collective in the communicator (every collective type has its own sequence number in the communicator)
- commHash : communicator unique identifier
- rank : NCCL rank for the ncclAllReduce
- datatype : NCCL datatype
- algorithm : algorithm used to process the ncclAllReduce
- protocol : protocol used to process the ncclAllReduce
- nMaxChannels: max number of channels used to process the ncclAllReduce
If the proxy events are not active (e.g., the `ncclAllReduce` is intranode) the end timestamp will match the time
consumed by the CPU to launch the collective. For more details refer to `ext-profiler/README.md`, section `Profiling
of collective and p2p operations`.
### Proxy Send
The `Send` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
info in the args field:
- Channel : id of the channel used by this proxy operation to send data to the peer
- Peer : peer rank
- Steps : number of network steps required to transfer transSize bytes to the peer
- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread
- transSize : bytes transferred across the channel by this proxy operation
- POSTED : struct containing the number of buffer posts to the GPU and the time stamp for the last post
- REM_FIFO_WAIT: struct containing the number of remote buffer waits and the time stamp for the last wait
- TRANSMITTED : struct containing the number of network sends and the time stamp of the last send
- DONE : struct containing the number of network sends completed and the time stamp of the last send completed
In case of a network problem the POSTED, REM_FIFO_WAIT, TRANSMITTED and DONE might all have partially updated steps,
which could help identify at which point the network problem occurred.
The Proxy send trace gives a summary of the proxy progress thread activity for the channel. If more details are
needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace
entries below are also reported by the profiler.
#### Proxy SendBufferWait
Presents, for every network step, the time the CPU proxy spends waiting for the channel staging buffer to become available.
#### Proxy SendGPUWait
Presents, for every network step, the time the CPU proxy spends waiting for the GPU to provide the data in the staging
buffer.
#### Proxy SendWait
Presents, for every network step, the time the CPU proxy spends waiting for the `isend` to complete
### Proxy Recv
The `Recv` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
info in the args field:
- Channel : id of the channel used by this proxy operation to recv data from the peer
- Peer : peer rank
- Steps : number of network steps required to transfer transSize bytes from the peer
- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread
- transSize : bytes transferred across the channel by this proxy operation
- POSTED : struct containing the number of recvs posted and the time stamp for the last recv posted
- RECEIVED : struct containing the number of recvs completed and the time stamp for the last recv completed
- TRANSMITTED: struct containing the number of recvs flushed to the GPU memory and the time stamp for the last recv flushed
- DONE : struct containing the number of flush completed and the time stamp for the last flush completed
The Proxy Recv trace gives a summary of the proxy progress thread activity for the channel. If more details are
needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace
entries below are also reported by the profiler.
#### Proxy RecvBufferWait
Presents, for every network step, the time the CPU proxy spends waiting for the staging buffer for the channel to
become available.
#### Proxy RecvWait
Presents, for every network step, the time the CPU proxy spends waiting for a posted `irecv` to complete
#### Proxy RecvFlushWait
Presents, for every network step, the time the CPU proxy spends waiting for the recv data to be flushed to the GPU
#### Proxy RecvGPUWait
Presents, for every network step, the time the CPU proxy spends waiting for the GPU to consume the recv data
+7 -1
Ver fichero
@@ -39,14 +39,20 @@ endif
CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90
CUDA13_GENCODE = -gencode=arch=compute_100,code=sm_100 \
-gencode=arch=compute_120,code=sm_120
CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
CUDA11_PTX = -gencode=arch=compute_80,code=compute_80
CUDA12_PTX = -gencode=arch=compute_90,code=compute_90
CUDA13_PTX = -gencode=arch=compute_120,code=compute_120
ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 12; echo $$?),0)
# Include Blackwell support if we're using CUDA12.8 or above
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX)
else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
# Include Hopper support if we're using CUDA11.8 or above
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX)
else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+2 -2
Ver fichero
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 24
NCCL_PATCH := 3
NCCL_MINOR := 25
NCCL_PATCH := 1
NCCL_SUFFIX :=
PKG_REVISION := 1
+1 -1
Ver fichero
@@ -25,7 +25,7 @@ prep : $(DEBTARGETS)
build : prep
$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
@printf "Building Debian package\n"
(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b)
(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b -Zxz)
mkdir -p $(PKGDIR)
mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/
+1 -1
Ver fichero
@@ -1,4 +1,4 @@
bin/ncclras /usr/bin
include/nccl.h /usr/include
include/nccl_net.h /usr/include
lib/libnccl.so /usr/lib/${pkg:MultiArch}
lib/libnccl_static.a /usr/lib/${pkg:MultiArch}
+3
Ver fichero
@@ -11,3 +11,6 @@ override_dh_auto_test:
override_dh_auto_clean:
# Do not make clean
override_dh_builddeb:
dh_builddeb -- -Zxz
+4 -2
Ver fichero
@@ -20,6 +20,7 @@ sockets.
%package devel
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
Group: Development/Libraries
Requires: libnccl >= ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
%description devel
NCCL development files
@@ -44,9 +45,10 @@ install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUI
ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major}
# devel
install -m 755 -d $RPM_BUILD_ROOT/%{_bindir}
install -m 755 -d $RPM_BUILD_ROOT/%{_includedir}
install -m 755 bin/ncclras $RPM_BUILD_ROOT/%{_bindir}
install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir}
install -m 644 include/nccl_net.h $RPM_BUILD_ROOT/%{_includedir}
ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so
# static
@@ -64,8 +66,8 @@ rm -rf $RPM_BUILD_ROOT
%files devel
%doc LICENSE.txt
%defattr(-,root,root,-)
%{_bindir}/ncclras
%{_includedir}/nccl.h
%{_includedir}/nccl_net.h
%{_libdir}/libnccl.so
%files static
+1 -1
Ver fichero
@@ -21,4 +21,4 @@ PKG_ARCH=${pkg:Arch}
NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}"
tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt
tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/bin $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt
+1 -1
Ver fichero
@@ -10,7 +10,7 @@ include ../makefiles/version.mk
INCEXPORTS := nccl.h
LIBSRCFILES := \
bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
init.cc init_nvtx.cc net.cc proxy.cc transport.cc \
init.cc init_nvtx.cc net.cc proxy.cc transport.cc mnnvl.cc \
$(wildcard graph/*.cc) \
$(wildcard misc/*.cc) \
$(wildcard transport/*.cc) \
+70 -187
Ver fichero
@@ -10,7 +10,7 @@
#include "graph/topo.h"
#include "nccl.h"
#include "api_trace.h"
#include "nvtx_payload_schemas.h"
#include "msccl/msccl_lifecycle.h"
using namespace rccl;
@@ -84,18 +84,8 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size
ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
struct NvtxParamsAllGather {
size_t bytes;
ncclDataType_t datatype;
};
// Just pass the size of one message and not the total bytes sent/received.
constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsAllGather, datatype)}
};
NvtxParamsAllGather payload{sendcount * ncclTypeSize(datatype), datatype};
NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, payload)
NVTX3_FUNC_WITH_PARAMS(AllGather, NcclNvtxParamsAllGather,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcount * ncclTypeSize(datatype), datatype));
struct ncclInfo info = { ncclFuncAllGather, "AllGather",
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
@@ -121,20 +111,8 @@ NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size
ncclResult_t ncclAllReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
struct NvtxParamsAllReduce {
size_t bytes;
ncclRedOp_t op;
ncclDataType_t datatype;
};
// Just pass the size of one message and not the total bytes sent/received.
static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, offsetof(NvtxParamsAllReduce, op)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsAllReduce, datatype)}
};
NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op, datatype};
NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload)
NVTX3_FUNC_WITH_PARAMS(AllReduce, NcclNvtxParamsAllReduce,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), op, datatype));
struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
@@ -162,25 +140,14 @@ NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_
ncclResult_t ncclAllToAll_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclComm_t comm, hipStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(AllToAll, NcclNvtxParamsAllToAll,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), datatype));
if (!mscclIsCaller()) // when msccl falls back to
{
NCCLCHECK(Recorder::instance().record(rrAllToAll, sendbuff, recvbuff, count, datatype, comm, stream));
}
struct NvtxParamsAllToAll {
size_t bytes;
ncclDataType_t datatype;
};
// Just pass the size of one message and not the total bytes sent/received.
constexpr nvtxPayloadSchemaEntry_t AllToAllSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsAllToAll, datatype)}
};
NvtxParamsAllToAll payload{count * ncclTypeSize(datatype), datatype};
NVTX3_FUNC_WITH_PARAMS(AllToAll, AllToAllSchema, payload)
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
@@ -218,28 +185,15 @@ NCCL_API(ncclResult_t, ncclAllToAllv, const void *sendbuff, const size_t sendcou
ncclResult_t ncclAllToAllv_impl(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(AllToAllv, NcclNvtxParamsAllToAllv,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcounts[comm->rank] * ncclTypeSize(datatype),
recvcounts[comm->rank] * ncclTypeSize(datatype), datatype));
if (!mscclIsCaller()) // when msccl falls back to
{
NCCLCHECK(Recorder::instance().record(rrAllToAllv, sendbuff, recvbuff, 0, datatype, comm, stream, -1, sendcounts, sdispls, recvcounts, rdispls));
}
struct NvtxParamsAllToAllv {
size_t sendbytes;
size_t recvbytes;
ncclDataType_t datatype;
};
// Just pass the size of one send/recv messages and not the total bytes sent/received.
constexpr nvtxPayloadSchemaEntry_t AllToAllvSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes] (Send)"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes] (Recv)", nullptr, 0,
offsetof(NvtxParamsAllToAllv, recvbytes)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsAllToAllv, datatype)}
};
NvtxParamsAllToAllv payload{sendcounts[comm->rank] * ncclTypeSize(datatype), recvcounts[comm->rank] * ncclTypeSize(datatype), datatype};
NVTX3_FUNC_WITH_PARAMS(AllToAllv, AllToAllvSchema, payload)
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls,
@@ -274,19 +228,8 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size
ncclResult_t ncclBroadcast_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
struct NvtxParamsBroadcast {
size_t bytes;
int root;
ncclDataType_t datatype;
};
constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsBroadcast, datatype)}
};
NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root, datatype};
NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload)
NVTX3_FUNC_WITH_PARAMS(Broadcast, NcclNvtxParamsBroadcast,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root, datatype));
struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
@@ -319,45 +262,34 @@ NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t
ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) {
struct NvtxParamsGather {
size_t bytes;
int root;
ncclDataType_t datatype;
};
constexpr nvtxPayloadSchemaEntry_t GatherSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsGather, root)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsGather, datatype)}
};
NvtxParamsGather payload{sendcount * ncclTypeSize(datatype), root, datatype};
NVTX3_FUNC_WITH_PARAMS(Gather, GatherSchema, payload)
NVTX3_FUNC_WITH_PARAMS(Gather, NcclNvtxParamsGather,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcount * ncclTypeSize(datatype), root, datatype));
if (!mscclIsCaller()) // when msccl falls back to
{
NCCLCHECK(Recorder::instance().record(rrGather, sendbuff, recvbuff, sendcount, datatype, comm, stream, root));
}
if (!mscclIsCaller()) // when msccl falls back to
{
NCCLCHECK(Recorder::instance().record(rrGather, sendbuff, recvbuff, sendcount, datatype, comm, stream, root));
}
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
sendcount, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream);
}
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
sendcount, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream);
}
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
size_t rankOffset = sendcount * ncclTypeSize(datatype);
if (sendcount == 0) return ncclSuccess;
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
NCCLCHECK(ncclGroupStart());
if (rank == root) {
for (int r=0; r<nRanks; r++)
NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, sendcount, datatype, r, comm, stream));
}
NCCLCHECK(ncclSend(sendbuff, sendcount, datatype, root, comm, stream));
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
size_t rankOffset = sendcount * ncclTypeSize(datatype);
if (sendcount == 0) return ncclSuccess;
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
NCCLCHECK(ncclGroupStart());
if (rank == root) {
for (int r=0; r<nRanks; r++)
NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, sendcount, datatype, r, comm, stream));
}
NCCLCHECK(ncclSend(sendbuff, sendcount, datatype, root, comm, stream));
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
@@ -365,22 +297,8 @@ NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t
ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
struct NvtxParamsReduce {
size_t bytes;
int root;
ncclRedOp_t op;
ncclDataType_t datatype;
};
constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsReduce, op)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsReduce, datatype)}
};
NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op, datatype};
NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
NVTX3_FUNC_WITH_PARAMS(Reduce, NcclNvtxParamsReduce,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root, op, datatype));
struct ncclInfo info = { ncclFuncReduce, "Reduce",
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
@@ -406,20 +324,8 @@ NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff,
ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
struct NvtxParamsReduceScatter {
size_t bytes;
ncclRedOp_t op;
ncclDataType_t datatype;
};
constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsReduceScatter, op)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsReduceScatter, datatype)}
};
NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op, datatype};
NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)
NVTX3_FUNC_WITH_PARAMS(ReduceScatter, NcclNvtxParamsReduceScatter,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, recvcount * ncclTypeSize(datatype), op, datatype));
struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
@@ -445,67 +351,44 @@ NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t
ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream) {
struct NvtxParamsScatter {
size_t bytes;
int root;
ncclDataType_t datatype;
};
constexpr nvtxPayloadSchemaEntry_t ScatterSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsScatter, root)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsScatter, datatype)}
};
NvtxParamsScatter payload{recvcount * ncclTypeSize(datatype), root, datatype};
NVTX3_FUNC_WITH_PARAMS(Scatter, ScatterSchema, payload)
NVTX3_FUNC_WITH_PARAMS(Scatter, NcclNvtxParamsScatter,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, recvcount * ncclTypeSize(datatype), root, datatype));
if (!mscclIsCaller()) // when msccl falls back to
{
NCCLCHECK(Recorder::instance().record(rrScatter, sendbuff, recvbuff, recvcount, datatype, comm, stream, root));
}
if (!mscclIsCaller()) // when msccl falls back to
{
NCCLCHECK(Recorder::instance().record(rrScatter, sendbuff, recvbuff, recvcount, datatype, comm, stream, root));
}
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
recvcount, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream);
}
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
recvcount, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream);
}
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
size_t rankOffset = recvcount * ncclTypeSize(datatype);
if (recvcount == 0) return ncclSuccess;
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
NCCLCHECK(ncclGroupStart());
if (rank == root) {
for (int r=0; r<nRanks; r++)
NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, recvcount, datatype, r, comm, stream));
}
NCCLCHECK(ncclRecv(recvbuff, recvcount, datatype, root, comm, stream));
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
size_t rankOffset = recvcount * ncclTypeSize(datatype);
if (recvcount == 0) return ncclSuccess;
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
NCCLCHECK(ncclGroupStart());
if (rank == root) {
for (int r=0; r<nRanks; r++)
NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, recvcount, datatype, r, comm, stream));
}
NCCLCHECK(ncclRecv(recvbuff, recvcount, datatype, root, comm, stream));
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;
}
struct NvtxParamsSendRecv {
size_t bytes;
int peer;
ncclDataType_t datatype;
};
constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsSendRecv, datatype)}
};
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer, datatype};
NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
NVTX3_FUNC_WITH_PARAMS(Send, NcclNvtxParamsSendRecv,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), peer, datatype));
struct ncclInfo info = { ncclFuncSend, "Send",
NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
@@ -530,8 +413,8 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da
ncclResult_t ncclRecv_impl(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer, datatype};
NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
NVTX3_FUNC_WITH_PARAMS(Recv, NcclNvtxParamsSendRecv,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), peer, datatype));
struct ncclInfo info = { ncclFuncRecv, "Recv",
NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
+1 -1
Ver fichero
@@ -767,7 +767,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
int nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndReduce) {
} else if (tid < tidEndReduce && nvls->headRank != -1) {
// Reduce, broadcast through NVLS
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
+20 -16
Ver fichero
@@ -55,25 +55,22 @@ static ncclKernelMatch const ncclKerns[3] = {
NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
// Returns maximum kernel stack size of all CUDA kernels
ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize) {
constexpr int KernelCount = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
ncclResult_t result = ncclSuccess;
int print = 0;
if (maxStackSize) *maxStackSize = 0;
int carveout = ncclParamL1SharedMemoryCarveout();
int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch);
// Keep track if we already visited a function pointer.
void* lru[2] = {nullptr, nullptr};
for (int i=0; i < KernelCount; i++) {
void* fn = ncclKerns[i].kernelFn;
if (fn == lru[0] || fn == lru[1]) goto next_kernel;
lru[1] = lru[0];
lru[0] = fn;
for (int k=0; k < KernelCount; k++) {
void* fn = ncclKerns[k].kernelFn;
cudaFuncAttributes attr = {0};
if (fn == nullptr) continue;
CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0);
if (maxStackSize) {
cudaFuncAttributes attr = {0};
if (cudaFuncGetAttributes(&attr, fn) != cudaSuccess)
WARN("Failed to get kernel attributes");
if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes;
ignore0:;
}
@@ -84,10 +81,17 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
result, ignore1);
ignore1:;
}
if (ncclShmemDynamicSize(cudaArch) != 0) {
if (ncclMaxSharedMem != 0) {
int sharedMemSize = ncclMaxSharedMem;
if (sharedMemSize > (maxSharedMem-attr.sharedSizeBytes)) {
if (print++ == 0)
INFO(NCCL_INIT, "ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu",
sharedMemSize, maxSharedMem-attr.sharedSizeBytes);
// Reduce requested MaxDynamicSharedMemorySize attribute
sharedMemSize = maxSharedMem - attr.sharedSizeBytes;
}
CUDACHECKGOTO(cudaFuncSetAttribute(fn,
cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)),
cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize),
result, next_kernel);
}
next_kernel:;
@@ -1523,7 +1527,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
if (driverVersion >= 11080) {
int compCap = comm->compCap;
unsigned int clusterSize = (compCap == 90) ? comm->config.cgaClusterSize : 0;
unsigned int clusterSize = (compCap >= 90) ? comm->config.cgaClusterSize : 0;
CUlaunchConfig launchConfig = {0};
CUlaunchAttribute launchAttrs[3];
@@ -1677,7 +1681,7 @@ static ncclResult_t updateCollCostTable(
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1 && info->func != ncclFuncAllGather) continue;
if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue;
/* now we only support single-node NVLS allgather and reducescatter */
if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && comm->nNodes > 1) continue;
if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && (comm->nNodes > 1 || comm->nRanks > NCCL_MAX_NVLS_ARITY)) continue;
/* Tree reduceScatter doesn't support scaling yet */
if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter
&& (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue;
+2 -5
Ver fichero
@@ -22,7 +22,6 @@
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) {
int rank = comm->rank;
int localRanks = comm->topo->nodes[GPU].count;
int nvlsRanks = comm->MNNVL ? comm->clique.size : localRanks;
int nChannels = comm->nChannels;
topoRanks->nvlsHeadNum = 0;
@@ -77,7 +76,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
// Get nvls heads and the number of heads. Duplicate head is not allowed.
for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) {
bool addHead = true;
int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * nvlsRanks;
int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks;
for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) {
if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) {
@@ -457,8 +456,6 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead
channel->nvls.out = -1; // NVLS+SHARP not yet implemented.
channel->nvls.headRank = headRank;
channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
channel->nvls.node = comm->node;
channel->nvls.nNodes = comm->nNodes;
if (comm->collNetSupport && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
}
if (comm->nNodes == 1) return ncclSuccess;
@@ -853,7 +850,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
}
// Use 4 compute channels per search channel to reach peak BW on <8 PPN
if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && 2*nChannels <= maxChannels) {
if (comm->minCompCap >= 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && 2*nChannels <= maxChannels) {
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
}
+29 -6
Ver fichero
@@ -975,14 +975,37 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr
return ncclSuccess;
}
int ncclTopoPathAllNVLink(struct ncclTopoSystem* system) {
int minPath = PATH_DIS;
ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min) {
int minPath = PATH_SYS;
for (int i=0; i<system->nodes[GPU].count; i++) {
struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU];
for (int j=0; j<system->nodes[GPU].count; j++) {
if (i == j) continue;
struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[type];
if (paths == NULL) continue;
for (int j=0; j<system->nodes[type].count; j++) {
if (type == GPU && i == j) continue;
minPath = std::min(minPath, paths[j].type);
}
}
return minPath >= PATH_PIX ? 0 : 1;
*min = minPath;
return ncclSuccess;
}
// Scan the path tables of every GPU node towards nodes of kind `type` and store
// the numerically largest path type encountered in *max.
//   system : topology whose GPU nodes are scanned
//   type   : node kind indexing both the per-GPU path table and the node list
//   max    : output; stays at PATH_LOC when no entry is examined
// GPUs that have no path table for `type` are skipped, and when type == GPU a
// GPU's entry for itself is ignored. Always returns ncclSuccess.
ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max) {
  int largest = PATH_LOC;
  for (int g = 0; g < system->nodes[GPU].count; g++) {
    struct ncclTopoLinkList* links = system->nodes[GPU].nodes[g].paths[type];
    if (links != NULL) {
      for (int p = 0; p < system->nodes[type].count; p++) {
        // Skip the GPU-to-itself entry.
        bool isSelf = (type == GPU) && (g == p);
        if (!isSelf && links[p].type > largest) {
          largest = links[p].type;
        }
      }
    }
  }
  *max = largest;
  return ncclSuccess;
}
// Set *allNvLink to 1 when the largest GPU-to-GPU path type reported by
// ncclTopoGetGpuMaxPath is below PATH_PIX, and to 0 otherwise.
// NOTE(review): NCCLCHECK presumably returns early with the callee's error
// code on failure, leaving *allNvLink unwritten -- confirm against the macro.
ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink) {
  int largestGpuPath;
  NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &largestGpuPath));
  if (largestGpuPath < PATH_PIX) {
    *allNvLink = 1;
  } else {
    *allNvLink = 0;
  }
  return ncclSuccess;
}
+32 -15
Ver fichero
@@ -988,15 +988,20 @@ float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0,
float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
#endif
float sm90SpeedArrayIntra[] = { 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 11.0, 6.0, 3.0 };
float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float))
#define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float))
#endif
RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0);
float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0 };
float sm100SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#define NSPEEDSINTRA_SM100 (sizeof(sm100SpeedArrayIntra)/sizeof(float))
#define NSPEEDSINTER_SM100 (sizeof(sm100SpeedArrayInter)/sizeof(float))
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
int crossNic = (system->nodes[NET].count > 1) &&
@@ -1006,8 +1011,20 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
graph->crossNic = crossNic == 1 ? 1 : 0;
graph->bwIntra = graph->bwInter = 0;
graph->latencyInter = 0;
graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
graph->typeInter = PATH_PIX;
int minTypeIntra = PATH_LOC, minTypeInter = PATH_PIX;
int maxTypeIntra = PATH_SYS, maxTypeInter = PATH_SYS;
if (ngpus > 1) {
NCCLCHECK(ncclTopoGetGpuMinPath(system, GPU, &minTypeIntra));
NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxTypeIntra));
}
if (system->nodes[NET].count > 0) {
NCCLCHECK(ncclTopoGetGpuMinPath(system, NET, &minTypeInter));
NCCLCHECK(ncclTopoGetGpuMaxPath(system, NET, &maxTypeInter));
maxTypeIntra = maxTypeInter;
}
graph->typeIntra = minTypeIntra;
graph->typeInter = minTypeInter;
graph->nChannels = 0;
graph->nIntraChannels = 0;
memset(graph->intraNets, 0, MAXCHANNELS*NCCL_TOPO_MAX_NODES*2*sizeof(int));
@@ -1080,14 +1097,14 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess;
// NVLS and COLLNET_DIRECT search must have ngpus heads at most.
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT)
graph->maxChannels = system->nodes[GPU].count;
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) graph->maxChannels = std::min(NCCL_MAX_NVLS_ARITY, system->nodes[GPU].count);
if (graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) graph->maxChannels = std::min(NCCL_MAX_DIRECT_ARITY+1, system->nodes[GPU].count);
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
// Force intra-node NVLS algorithm to pull evenly from all GPUs.
graph->minChannels = graph->maxChannels = system->nodes[GPU].count;
graph->minChannels = graph->maxChannels;
}
struct ncclTopoGraph tmpGraph;
@@ -1097,11 +1114,11 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
int nspeeds = 0;
float* speedArray = NULL;
if (system->nodes[NET].count == 0) {
nspeeds = NSPEEDSINTRA;
speedArray = speedArrayIntra;
nspeeds = ccMin >= 100 ? NSPEEDSINTRA_SM100 : (ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA);
speedArray = ccMin >= 100 ? sm100SpeedArrayIntra : (ccMin >= 90 ? sm90SpeedArrayIntra : speedArrayIntra);
} else {
nspeeds = NSPEEDSINTER;
speedArray = speedArrayInter;
nspeeds = ccMin >= 100 ? NSPEEDSINTER_SM100 : (ccMin >= 90 ? NSPEEDSINTER_SM90 : NSPEEDSINTER);
speedArray = ccMin >= 100 ? sm100SpeedArrayInter : (ccMin >= 90 ? sm90SpeedArrayInter : speedArrayInter);
}
int pass = 1;
int speedIndex = 0;
@@ -1156,18 +1173,18 @@ search:
}
tmpGraph.pattern = graph->pattern;
int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
int maxIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : maxTypeIntra;
if (tmpGraph.typeIntra < maxIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
tmpGraph.typeIntra += 1;
goto search;
}
tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
tmpGraph.typeIntra = minTypeIntra;
if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) {
if (system->nodes[NET].count > 0 && tmpGraph.typeInter < maxTypeInter && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) {
tmpGraph.typeInter += 1;
goto search;
}
tmpGraph.typeInter = PATH_PIX;
tmpGraph.typeInter = minTypeInter;
if (crossNic == 2 && tmpGraph.crossNic == 0
&& (graph->pattern == NCCL_TOPO_PATTERN_RING || graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE)) {
+36 -44
Ver fichero
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -1445,11 +1445,11 @@ fail:
goto exit;
}
ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) {
static ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType,
int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType) {
int minType = PATH_DIS;
float maxBw = 0;
int count = 0;
NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count));
struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType];
if (paths == NULL) { *localCount = 0; return ncclSuccess; }
for (int i=0; i<system->nodes[resultType].count; i++) {
@@ -1459,7 +1459,15 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index
if (pathType) *pathType = minType;
count = 0;
}
if (paths[i].bw == maxBw && paths[i].type == minType) (*locals)[count++] = i;
if (paths[i].bw == maxBw && paths[i].type == minType) {
if (count == NCCL_TOPO_MAX_NODES) {
WARN("Error : ran out of room to store found nodes in ncclTopoGetLocal."
" Filled %d of type %d, starting from index %d of type %d.",
NCCL_TOPO_MAX_NODES, resultType, index, type);
return ncclInternalError;
}
locals[count++] = i;
}
}
*localCount = count;
return ncclSuccess;
@@ -1467,7 +1475,7 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index
ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count) {
int localNetCount = 0, netCountByBw = 0;
int* localNets;
int localNets[NCCL_TOPO_MAX_NODES];
float totalNetBw = 0, gpuBw = 0;
for (int l=0; l<system->nodes[GPU].nodes[gpu].nlinks; l++) {
@@ -1479,69 +1487,55 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c
}
}
NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL));
for (int l=0; (l < localNetCount) && (totalNetBw < gpuBw); l++, netCountByBw++) {
totalNetBw += system->nodes[GPU].nodes[gpu].paths[NET][localNets[l]].bw;
}
*count = netCountByBw;
free(localNets);
return ncclSuccess;
}
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) {
ncclResult_t ret = ncclSuccess;
int gpu;
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
int* localNets = NULL;
int localNets[NCCL_TOPO_MAX_NODES];
int localNetCount;
NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
int* localGpus = NULL;
NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL));
if (localNetCount==0) {
WARN("Could not find any local path from gpu %d to net.", gpu);
return ncclInternalError;
}
int localGpus[NCCL_TOPO_MAX_NODES];
int localGpuCount;
int net = 0;
if (localNetCount == 0) {
*id = -1;
free(localNets);
return ncclSuccess;
}
NCCLCHECKGOTO(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL), ret, fail);
for (int i = 0; i < localGpuCount; i++) {
if (gpu == localGpus[i]) {
net = i;
break;
}
}
NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL));
int net = system->nodes[GPU].nodes[gpu].gpu.dev;
if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount);
if (localNetCount == 0) {
if (id) *id = -1;
if (dev) *dev = -1;
} else {
net += channelId%(DIVUP(localNetCount,localGpuCount));
if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev;
}
exit:
free(localNets);
if (localGpus) free(localGpus);
return ret;
fail:
goto exit;
net += channelId%(DIVUP(localNetCount,localGpuCount));
if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev;
return ncclSuccess;
}
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex) {
ncclResult_t ret = ncclSuccess;
int netIndex;
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &netIndex));
int* localGpus = NULL;
int localGpus[NCCL_TOPO_MAX_NODES];
int localGpuCount;
NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, localGpus, &localGpuCount, NULL));
int foundGpu = -1;
NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL));
for (int c=0; c<MAXCHANNELS; c++) {
for (int lg=0; lg<localGpuCount; lg++) {
int g = localGpus[lg];
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
int64_t id;
NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL), ret, fail);
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL));
if (netId == id) {
foundGpu = g;
goto exit;
@@ -1550,8 +1544,6 @@ ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, i
}
exit:
*gpuIndex = foundGpu;
fail:
free(localGpus);
return ret;
}
+6 -2
Ver fichero
@@ -19,6 +19,7 @@
#define SM80_NVLINK_BW 20.0
#define SM90_NVLINK_BW 20.6
#define SM86_NVLINK_BW 12.0
#define SM100_NVLINK_BW 40.0
#define PCI_BW 12.0 // PCI Gen3 x16
#define QPI_BW 6.0
#define AMD_BW 16.0
@@ -98,8 +99,8 @@ struct ncclTopoLink {
float bw;
struct ncclTopoNode* remNode;
};
#define NCCL_TOPO_MAX_LINKS 128
// Allows for up to 32 NICs per node on GB200-NVL72
#define NCCL_TOPO_MAX_LINKS 576
#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES)
struct ncclTopoLinkList {
@@ -209,6 +210,8 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode
ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system);
ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank);
ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min);
ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max);
#define NCCL_TOPO_XML_MAX_NODES 8192
#define NCCL_GRAPH_XML_MAX_NODES 8192
@@ -279,6 +282,7 @@ static float ncclTopoXGMISpeed(const char* gcn) {
// Returns NVLink bw in GB/s
static float ncclTopoNVLinkBw(int cudaCompCap) {
return
cudaCompCap >= 100 ? SM100_NVLINK_BW :
cudaCompCap >= 90 ? SM90_NVLINK_BW :
cudaCompCap == 86 ? SM86_NVLINK_BW :
cudaCompCap >= 80 ? SM80_NVLINK_BW :
+13 -6
Ver fichero
@@ -350,28 +350,33 @@ static struct tuningModel rcclTuningModel[] = {
#define VOLTA_COMPCAP_IDX 0
#define AMPERE_COMPCAP_IDX 1
#define HOPPER_COMPCAP_IDX 2
#define BLACKWELL_COMPCAP_IDX 3
// LL128 max BW per channel
static const double llMaxBws[3][3] = {
static const double llMaxBws[][3] = {
/* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4},
/* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
/* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0}
/* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0},
/* Blackwell-N1/AMD-N2/AMD-N4) */ {2*141.0, 2*45.0 /*avg of ring & tree*/, 2*35.0},
};
static const double perChMaxRingLL128Bws[3][3] = {
static const double perChMaxRingLL128Bws[][3] = {
/* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7},
/* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*36.7},
};
static const double perChMaxTreeLL128Bws[3][3] = {
static const double perChMaxTreeLL128Bws[][3] = {
/* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0},
/* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*29.0},
};
static const double perChMaxTreeBws[3][3] = {
static const double perChMaxTreeBws[][3] = {
/* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0},
/* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8},
/* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
/* Blackwell (N1/N2/N4) */ {2*38.7, 2*41.4, 2*36.0},
};
NCCL_PARAM(PatEnable, "PAT_ENABLE", 2);
@@ -422,7 +427,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
int nRanks = comm->nRanks;
if (nRanks <= 1) return ncclSuccess;
int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX;
int compCapIndex = minCompCap >= 100 ? BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX);
int index2 = nNodes <= 2 ? nNodes-1 : 2;
// LL: for single node, we look at GPU type; for multi-node, we look at CPU type
int index1 = nNodes == 1 ? compCapIndex :
@@ -666,6 +671,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
case 70: pEnable &= 1; break;
case 80: pEnable &= 1; break;
case 90: pEnable &= !(CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2); break;
case 100: pEnable &= 1; break;
case 120: pEnable &= 1; break;
default: pEnable &= 0; break;
}
#endif
+2 -3
Ver fichero
@@ -228,14 +228,13 @@ static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) {
return result;
}
static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, CUmemAllocationHandleType type, size_t size) {
ncclResult_t result = ncclSuccess;
size_t granularity = 0;
CUdevice currentDev;
CUmemAllocationProp prop = {};
CUmemAccessDesc accessDesc = {};
CUmemGenericAllocationHandle handle;
CUmemAllocationHandleType type = ncclCuMemHandleType;
int cudaDev;
int flag = 0;
CUDACHECK(cudaGetDevice(&cudaDev));
@@ -284,7 +283,7 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) {
extern int ncclCuMemEnable();
static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) {
static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, int type, size_t size) {
WARN("CUMEM not supported prior to CUDA 11.3");
return ncclInternalError;
}
+1 -3
Ver fichero
@@ -70,7 +70,7 @@ union ncclLLFifoLine {
#define WARP_SIZE warpSize
#define MAXCHANNELS 128
#define CHANNEL_LIMIT 16
#define NCCL_MAX_LOCAL_RANKS 64
#define NCCL_MAX_LOCAL_RANKS 72
#define NCCL_MAX_NTHREADS 256
#define NCCL_MIN_NTHREADS (4*WARP_SIZE)
#define NCCL_SIMPLE_MAX_NTHREADS NCCL_MAX_NTHREADS
@@ -204,8 +204,6 @@ struct ncclNvls {
int down;
int treeUp;
int treeDown[NCCL_MAX_NVLS_TREE_ARITY];
int node;
int nNodes;
};
#if __CUDA_ARCH__ >= 900
+1 -1
Ver fichero
@@ -17,7 +17,7 @@
#define NCCL_SIMPLE_ALIGNMENT (WARP_SIZE * 8LL * 16LL)
#define NCCL_BYTES_ALIGNMENT 16
ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize);
ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize);
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclLaunchPrepare(struct ncclComm* comm);
ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
+2 -1
Ver fichero
@@ -29,7 +29,8 @@ void ncclTopoFree(struct ncclTopoSystem* system);
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
int ncclTopoPathAllNVLink(struct ncclTopoSystem* system);
ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink);
ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm);
// Query topology
+15
Ver fichero
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_MNNVL_H_
#define NCCL_MNNVL_H_
#include "nccl.h"
#include "comm.h"
// Checks for MNNVL (Multi-Node NVLink) support for the given communicator.
// Returns an ncclResult_t status code.
// NOTE(review): the definition is not visible from this header; presumably it
// fills in the communicator's MNNVL/clique state — confirm against mnnvl.cc.
ncclResult_t ncclMnnvlCheck(struct ncclComm* comm);
#endif
+93 -18
Ver fichero
@@ -36,19 +36,22 @@
#define NVTX_SID_CommInitRankConfig 16 // same schema as NVTX_SID_CommInitRank
#define NVTX_SID_CommInitRankScalable 17 // same schema as NVTX_SID_CommInitRank
#define NVTX_SID_CommSplit 18
#define NVTX_SID_CommFinalize 19
// Define static schema ID for the reduction operation.
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
// Parenthesized so the sum stays a single operand wherever the macro expands
// (e.g. `2 * NVTX_PAYLOAD_ENTRY_NCCL_REDOP`).
// NOTE(review): NVTX3_FUNC_WITH_PARAMS derives schema IDs as
// NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_<name>, and
// NVTX_SID_CommFinalize is also 19, so this ID equals the CommFinalize schema
// ID. Confirm that sharing the ID is intended.
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP (19 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START)
extern const nvtxDomainHandle_t ncclNvtxDomainHandle;
struct nccl_domain{static constexpr char const* name{"NCCL"};};
/// @brief Register an NVTX payload schema for static-size payloads.
class payload_schema {
public:
explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
public:
explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries,
const uint64_t schemaId, const size_t size) noexcept
{
schema_attr.name = schemaName;
schema_attr.payloadStaticSize = size;
schema_attr.entries = entries;
schema_attr.numEntries = numEntries;
schema_attr.schemaId = schemaId;
@@ -69,33 +72,105 @@ private:
NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES |
NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE |
NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID,
nullptr,
nullptr, /* schema name is not needed */
NVTX_PAYLOAD_SCHEMA_TYPE_STATIC,
NVTX_PAYLOAD_SCHEMA_FLAG_NONE,
nullptr, 0, 0, 0, 0, nullptr};
};
// Convenience macro to give the payload parameters a scope.
#define NVTX3_PAYLOAD(...) __VA_ARGS__
// Create NVTX push/pop range with parameters
// @param name of the operation (see `NVTX_SID_*`)
// @param N schema name
// @param S schema (entries)
// @param P payload (struct)
// @param N NCCL API name without the `nccl` prefix.
// @param T name of the used NVTX payload schema without "Schema" suffix.
// @param P payload parameters/entries
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \
nvtxPayloadData_t nvtx3_bpl__[] = { \
{NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \
roctx_scoped_range_in const roctx_range__{S, nvtx3_bpl__, std::extent<decltype(S)>::value, "RCCL_" #ID};
#define NVTX3_FUNC_WITH_PARAMS(N, T, P) \
constexpr uint64_t schemaId = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \
static const payload_schema schema{T##Schema, std::extent<decltype(T##Schema)>::value - 1, \
schemaId, sizeof(T)}; \
const T _payload = {P}; \
nvtxPayloadData_t nvtx3_bpl__[] = {{schemaId, sizeof(_payload), &_payload}}; \
roctx_scoped_range_in const roctx_range__{T##Schema, nvtx3_bpl__, std::extent<decltype(T##Schema)>::value - 1, "RCCL_" #N};
#else
#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \
static const payload_schema schema{S, std::extent<decltype(S)>::value, \
NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \
#define NVTX3_FUNC_WITH_PARAMS(N, T, P) \
constexpr uint64_t schemaId = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \
static const payload_schema schema{T##Schema, std::extent<decltype(T##Schema)>::value - 1, \
schemaId, sizeof(T)}; \
static ::nvtx3::v1::registered_string_in<nccl_domain> const nvtx3_func_name__{__func__}; \
nvtxPayloadData_t nvtx3_bpl__[] = { \
{NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \
const T _payload = {P}; \
nvtxPayloadData_t nvtx3_bpl__[] = {{schemaId, sizeof(_payload), &_payload}}; \
::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \
::nvtx3::v1::scoped_range_in<nccl_domain> const nvtx3_range__{nvtx3_func_attr__};
#endif
/// @brief Creates an NVTX range with extended payload using the RAII pattern.
///
/// The constructor pushes a range onto the `nccl_domain` NVTX domain and the
/// destructor pops it again, attaching the payload if setPayloadData() was called.
/// @tparam PayloadType Data type of the payload.
template <typename PayloadType>
class ncclNvtxRange {
public:
// Pushes a new range, described by `evtAttr`, onto the `nccl_domain` domain.
explicit ncclNvtxRange(const nvtxEventAttributes_t* evtAttr) noexcept {
nvtxDomainRangePushEx(nvtx3::domain::get<nccl_domain>(), evtAttr);
}
// Pops the range. `payloadData.payload` is non-NULL only after
// setPayloadData(); in that case the payload is handed over with the pop.
~ncclNvtxRange() noexcept {
if (payloadData.payload) {
nvtxRangePopPayload(nvtx3::domain::get<nccl_domain>(), &payloadData, 1);
} else {
nvtxDomainRangePop(nvtx3::domain::get<nccl_domain>());
}
}
// Points `payloadData` at this object's `payload` member and tags it with
// `schemaId`. The caller fills `payload` itself.
void setPayloadData(const uint64_t schemaId) noexcept
{
payloadData = {schemaId, sizeof(PayloadType), &payload};
}
ncclNvtxRange() = delete;
// NOTE(review): the defaulted copy/move operations duplicate `payloadData`,
// whose `payload` pointer (once set) refers to the source object's `payload`
// member, and every copy pops a range in its destructor. Confirm that no
// caller copies or moves a range object.
ncclNvtxRange(ncclNvtxRange const&) = default;
ncclNvtxRange& operator=(ncclNvtxRange const&) = default;
ncclNvtxRange(ncclNvtxRange&&) = default;
ncclNvtxRange& operator=(ncclNvtxRange&&) = default;
// Holds the payload data.
PayloadType payload{};
// Descriptor passed to NVTX on pop; stays invalid/NULL until setPayloadData().
nvtxPayloadData_t payloadData = {NVTX_PAYLOAD_ENTRY_TYPE_INVALID, 0, NULL};
};
// Create an NVTX range with the function name as the range name. Use RAII pattern.
// Declares a local `nvtx3_range__` of type `ncclNvtxRange<T>` in the current
// scope; the name is registered once per call site (function-local static).
// The local names `nvtx3_func_name__`, `nvtx3_func_attr__` and `nvtx3_range__`
// are fixed, so the macro can be used at most once per scope.
// @param T Type ID of the NVTX payload (pointer for variable-size payloads).
#define NVTX3_RANGE(T) \
static ::nvtx3::v1::registered_string_in<nccl_domain> const nvtx3_func_name__{__func__}; \
::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \
ncclNvtxRange<T> nvtx3_range__{nvtx3_func_attr__.get()};
// Add static-size payload to the NVTX range created with `NVTX3_RANGE()`,
// which must be in this or an outer scope.
// Both variants create a function-local static `payload_schema` for S under the
// ID NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_<N>, store {P}
// in `nvtx3_range__.payload`, and call `nvtx3_range__.setPayloadData()`.
// The entry count is `std::extent<decltype(S)>::value - 1`; presumably S ends
// with one extra (sentinel) element — confirm against the schema helper macros.
// @param N NCCL API name without the `nccl` prefix.
// @param S name of the used NVTX payload schema.
// @param P payload parameters/entries
// NOTE(review), HIP variant: `nvtx3_bpl__` is built from the size and address of
// `nvtx3_range__.payloadData` (the nvtxPayloadData_t descriptor), whereas the
// HIP variant of NVTX3_FUNC_WITH_PARAMS passes the payload struct itself.
// Also, `roctx_range__` is local to the do/while block, so it is destroyed as
// soon as the macro's statement finishes. Confirm both are intended.
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
#define NVTX3_RANGE_ADD_PAYLOAD(N, S, P) do { \
constexpr uint64_t schema_id = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \
static const payload_schema schema{S, std::extent<decltype(S)>::value - 1, schema_id, \
sizeof(nvtx3_range__.payload)}; \
nvtx3_range__.payload = {P}; \
nvtx3_range__.setPayloadData(schema_id); \
nvtxPayloadData_t nvtx3_bpl__[] = {{schema_id, sizeof(nvtx3_range__.payloadData), &nvtx3_range__.payloadData}}; \
roctx_scoped_range_in const roctx_range__{S, nvtx3_bpl__, std::extent<decltype(S)>::value - 1, "RCCL_" #N}; \
} while (0)
#else
#define NVTX3_RANGE_ADD_PAYLOAD(N, S, P) do { \
constexpr uint64_t schema_id = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \
static const payload_schema schema{S, std::extent<decltype(S)>::value - 1, schema_id, \
sizeof(nvtx3_range__.payload)}; \
nvtx3_range__.payload = {P}; \
nvtx3_range__.setPayloadData(schema_id); \
} while (0)
#endif
extern void initNvtxRegisteredEnums();
#endif
+3 -3
Ver fichero
@@ -11,7 +11,7 @@
/* This is just an empty marker (for readability), which can be omitted. */
/* TODO: Fix issue with trailing comma at end of entry list. */
#define NVTX_PAYLOAD_ENTRIES
#define NCCL_NVTX_PAYLOAD_ENTRIES
/**
@@ -32,7 +32,7 @@
*
* Example:
* NVTX_DEFINE_SCHEMA_FOR_STRUCT(your_struct, "SchemaName",
* NVTX_PAYLOAD_ENTRIES(
* NCCL_NVTX_PAYLOAD_ENTRIES(
* (index, TYPE_INT, "integer value"),
* (dpfloat, TYPE_DOUBLE, "fp64 value"),
* (text, TYPE_CSTRING, "text", NULL, 24)
@@ -80,7 +80,7 @@
*
* Example:
* NVTX_DEFINE_STRUCT_WITH_SCHEMA(your_struct_name, "Your schema name",
* NVTX_PAYLOAD_ENTRIES(
* NCCL_NVTX_PAYLOAD_ENTRIES(
* (int, index, TYPE_INT, "integer value"),
* (double, dpfloat, TYPE_DOUBLE, "fp64 value"),
* (const char, (text, 24), TYPE_CSTRING, "text", NULL, 24)
+178
Ver fichero
@@ -0,0 +1,178 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
/// Definitions of NVTX payload types and schemas used for the NVTX
/// instrumentation in init.cc and collectives.cc.
#ifndef NVTX_PAYLOAD_SCHEMAS_H_
#define NVTX_PAYLOAD_SCHEMAS_H_
#include "nccl.h"
#include "nvtx3/nvToolsExtPayload.h"
#include "nvtx3/nvToolsExtPayloadHelper.h"
/**
 * \brief Define a C struct together with the matching schema entries.
 *
 * Does the same as the NVTX helper macro `NVTX_DEFINE_STRUCT_WITH_SCHEMA`, but
 * without creating the schema attributes. (Remove this helper when it is
 * available in the NVTX headers.)
 *
 * \param struct_id Name of the struct type to define.
 * \param prefix    Specifiers placed in front of the schema-entries definition
 *                  (every use in this file passes `static constexpr`).
 * \param entries   Parenthesized entry list, e.g. `NCCL_NVTX_PAYLOAD_ENTRIES(...)`.
 *
 * NOTE(review): the NVTX3_* macros refer to the entries as `<struct_id>Schema`;
 * presumably `_NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES` produces that name — confirm
 * in the NVTX payload helper header.
 */
#define NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(struct_id, prefix, entries) \
_NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries)
// C strings used as NVTX payload entry names. They are shared by several of
// the schema definitions in this file so each label is spelled only once.
// Note: `nccl_nvtxReductionOpStrpStr` (sic) is referenced under that exact name
// by the schema definitions in this file; rename all uses together if it is
// ever corrected.
static constexpr char const* nccl_nvtxCommStr = "NCCL communicator ID";
static constexpr char const* nccl_nvtxCudaDevStr = "CUDA device";
static constexpr char const* nccl_nvtxRankStr = "Rank";
static constexpr char const* nccl_nvtxNranksStr = "No. of ranks";
static constexpr char const* nccl_nvtxMsgSizeStr = "Message size [bytes]";
static constexpr char const* nccl_nvtxMsgSizeSendStr = "Message size [bytes] (Send)";
static constexpr char const* nccl_nvtxMsgSizeRecvStr = "Message size [bytes] (Recv)";
static constexpr char const* nccl_nvtxReductionOpStrpStr = "Reduction operation";
static constexpr char const* nccl_nvtxDataTypeStr = "Data type";
// Payload struct `NcclNvtxParamsCommInitAll` and its schema entries:
// communicator ID (`commhash`) and number of devices (`ndev`).
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommInitAll, static constexpr,
NCCL_NVTX_PAYLOAD_ENTRIES(
(uint64_t, commhash, TYPE_UINT64, nccl_nvtxCommStr),
(int, ndev, TYPE_INT, "No. of devices")
)
)
// Payload struct `NcclNvtxParamsCommInitRank` and its schema entries:
// communicator ID, number of ranks, this rank, and CUDA device.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommInitRank, static constexpr,
NCCL_NVTX_PAYLOAD_ENTRIES(
(uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr),
(int, nranks, TYPE_INT, nccl_nvtxNranksStr),
(int, myrank, TYPE_INT, nccl_nvtxRankStr),
(int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr)
)
)
// The typedef and payload schema for ncclCommInitRank are also used for
// ncclCommInitRankConfig, ncclCommInitRankScalable, ncclCommDestroy, and ncclCommAbort.
// These aliases only rename the struct type; no additional schema entries are
// defined here.
typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommInitRankConfig;
typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommInitRankScalable;
typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommAbort;
typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommDestroy;
// Payload struct `NcclNvtxParamsCommSplit` and its schema entries: new and
// parent communicator IDs, number of ranks, this rank, CUDA device, and the
// `color` / `key` values.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommSplit, static constexpr,
NCCL_NVTX_PAYLOAD_ENTRIES(
(uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr),
(uint64_t, parentcomm, TYPE_UINT64, "Parent NCCL communicator ID"),
(int, nranks, TYPE_INT, nccl_nvtxNranksStr),
(int, myrank, TYPE_INT, nccl_nvtxRankStr),
(int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr),
(int, color, TYPE_INT, "Color"),
(int, key, TYPE_INT, "Key")
)
)
// Payload struct `NcclNvtxParamsCommFinalize` and its schema entries: the
// communicator ID only.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommFinalize, static constexpr,
NCCL_NVTX_PAYLOAD_ENTRIES(
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr)
)
)
// Payload struct `NcclNvtxParamsAllGather` and its schema entries:
// communicator ID, message size in bytes, and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllGather, static constexpr,
NCCL_NVTX_PAYLOAD_ENTRIES(
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
)
)
// Payload struct `NcclNvtxParamsAllReduce` and its schema entries:
// communicator ID, message size in bytes, reduction operation, and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllReduce, static constexpr,
NCCL_NVTX_PAYLOAD_ENTRIES(
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
(ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr),
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
)
)
// Payload struct `NcclNvtxParamsAllToAll` and its schema entries:
// communicator ID, message size in bytes, and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllToAll, static constexpr,
NCCL_NVTX_PAYLOAD_ENTRIES(
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
)
)
// Payload struct `NcclNvtxParamsAllToAllv` and its schema entries:
// communicator ID, separate send and receive sizes in bytes, and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllToAllv, static constexpr,
NCCL_NVTX_PAYLOAD_ENTRIES(
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
(size_t, sendBytes, TYPE_SIZE, nccl_nvtxMsgSizeSendStr),
(size_t, recvBytes, TYPE_SIZE, nccl_nvtxMsgSizeRecvStr),
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
)
)
// Payload struct `NcclNvtxParamsBroadcast` and its schema entries:
// communicator ID, message size in bytes, root, and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsBroadcast, static constexpr,
NCCL_NVTX_PAYLOAD_ENTRIES(
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
(int, root, TYPE_INT, "Root"),
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
)
)
// Payload struct `NcclNvtxParamsGather` and its schema entries:
// communicator ID, message size in bytes, root, and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsGather, static constexpr,
NCCL_NVTX_PAYLOAD_ENTRIES(
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
(int, root, TYPE_INT, "Root"),
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
)
)
// Payload struct `NcclNvtxParamsReduce` and its schema entries: communicator
// ID, message size in bytes, root, reduction operation, and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsReduce, static constexpr,
NCCL_NVTX_PAYLOAD_ENTRIES(
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
(int, root, TYPE_INT, "Root"),
(ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr),
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
)
)
// Payload struct `NcclNvtxParamsReduceScatter` and its schema entries:
// communicator ID, message size in bytes, reduction operation, and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsReduceScatter, static constexpr,
NCCL_NVTX_PAYLOAD_ENTRIES(
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
(ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr),
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
)
)
// Payload struct `NcclNvtxParamsScatter` and its schema entries:
// communicator ID, message size in bytes, root, and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsScatter, static constexpr,
NCCL_NVTX_PAYLOAD_ENTRIES(
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
(int, root, TYPE_INT, "Root"),
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
)
)
// Used in NCCL APIs `ncclSend` and `ncclRecv`.
// Payload struct `NcclNvtxParamsSendRecv` and its schema entries:
// communicator ID, message size in bytes, peer rank, and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsSendRecv, static constexpr,
NCCL_NVTX_PAYLOAD_ENTRIES(
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
(int, peer, TYPE_INT, "Peer rank"),
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
)
)
// Payload struct `NcclNvtxParamsMSCCL` and its schema entries: communicator
// ID, message size in bytes, reduction operation, and data type (the same
// fields, in the same order, as `NcclNvtxParamsAllReduce`).
// NOTE(review): presumably used by the MSCCL code path — confirm at the call sites.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsMSCCL, static constexpr,
NCCL_NVTX_PAYLOAD_ENTRIES(
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
(ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr),
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
)
)
#endif // end include guard
+4 -1
Ver fichero
@@ -13,7 +13,10 @@ struct nccl_domain{static constexpr char const* name{"NCCL"};};
#define NVTX3_FUNC_RANGE_IN(domain)
#define nvtxNameOsThreadA(syscall, thread)
#define NVTX3_FUNC_WITH_PARAMS(ID, S, P)
#define NVTX3_FUNC_WITH_PARAMS(N, T, P)
#define NVTX3_PAYLOAD(...) __VA_ARGS__
#define NVTX3_RANGE(T)
#define NVTX3_RANGE_ADD_PAYLOAD(N, S, P)
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11
+2
Ver fichero
@@ -380,6 +380,8 @@ ncclResult_t ncclProxyStart(struct ncclComm* comm);
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS);
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn);
// NB: ncclProxyMsgTypeStr[] in proxy.cc needs to match
enum ncclProxyMsgType {
ncclProxyMsgInit = 1,
ncclProxyMsgSharedInit = 2,
+3
Ver fichero
@@ -70,6 +70,9 @@ DECLARE_ROCM_PFN_EXTERN(hsa_status_string);
extern int ncclCuMemEnable();
extern int ncclCuMemHostEnable();
// Handle type used for cuMemCreate()
extern CUmemAllocationHandleType ncclCuMemHandleType;
ncclResult_t rocmLibraryInit(void);
extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit()
+81 -134
Ver fichero
@@ -25,6 +25,7 @@
#endif
#include "tuner.h"
#include "ras.h"
#include "mnnvl.h"
#include <fcntl.h>
#include <unistd.h>
#include <hip/hip_runtime.h>
@@ -39,6 +40,7 @@
#include "graph/xml.h"
#include "archinfo.h"
#include "param.h"
#include "nvtx_payload_schemas.h"
// [RCCL]
#include "git_version.h"
@@ -437,6 +439,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
free(comm->rankToNode);
free(comm->rankToLocalRank);
free(comm->collNetHeads);
free(comm->clique.ranks);
if (comm->bootstrap)
NCCLCHECK(bootstrapClose(comm->bootstrap));
@@ -861,6 +864,7 @@ static void showVersion() {
}
}
NCCL_PARAM(MNNVLUUID, "MNNVL_UUID", -1);
NCCL_PARAM(MNNVLCliqueId, "MNNVL_CLIQUE_ID", -1);
static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) {
@@ -916,12 +920,16 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
info->fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED;
(void) ncclNvmlDeviceGetGpuFabricInfoV(nvmlDev, &info->fabricInfo);
if (info->fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED) {
if (ncclParamMNNVLUUID() != -1) {
((long*)&info->fabricInfo.clusterUuid)[0] = ncclParamMNNVLUUID();
((long*)&info->fabricInfo.clusterUuid)[1] = ncclParamMNNVLUUID();
}
if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId();
INFO(NCCL_INIT, "MNNVL busId 0x%lx fabric UUID %lx.%lx cliqueId 0x%x state %d healthMask 0x%x",
info->busId,
((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1],
info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask);
}
if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId();
}
#endif
@@ -991,71 +999,6 @@ NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0);
// MNNVL: Flag to indicate whether to enable Multi-Node NVLink
NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2);
#if CUDART_VERSION >= 11030
#include <cuda.h>
#include "cudawrap.h"
// Determine if MNNVL support is available
static int checkMNNVL(struct ncclComm* comm) {
ncclResult_t ret = ncclSuccess;
// MNNVL requires cuMem to be enabled
if (!ncclCuMemEnable()) return 0;
// MNNVL also requires FABRIC handle support
int cudaDev;
int flag = 0;
CUdevice currentDev;
CUDACHECK(cudaGetDevice(&cudaDev));
CUCHECK(cuDeviceGet(&currentDev, cudaDev));
// Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));;
if (!flag) return 0;
// Check that all ranks have initialized the fabric fully
for (int i = 0; i < comm->nRanks; i++) {
if (comm->peerInfo[i].fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED) return 0;
}
// Determine our MNNVL domain/clique
NCCLCHECKGOTO(ncclCalloc(&comm->clique.ranks, comm->nRanks), ret, fail);
comm->clique.id = comm->peerInfo[comm->rank].fabricInfo.cliqueId;
for (int i = 0; i < comm->nRanks; i++) {
nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[comm->rank].fabricInfo;
nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo;
// Check if the cluster UUID and cliqueId match
// A zero UUID means we don't have MNNVL fabric info - disable MNNVL
if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) goto fail;
if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
(fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
if (i == comm->rank) {
comm->cliqueRank = comm->clique.size;
}
comm->clique.ranks[comm->clique.size++] = i;
}
}
// Determine whether to enable MNNVL or not
comm->MNNVL = ncclParamMNNVLEnable() == 2 ? comm->clique.size > 1 : ncclParamMNNVLEnable();
INFO(NCCL_INIT, "MNNVL %d cliqueId %x cliqueSize %d cliqueRank %d ", comm->MNNVL, comm->clique.id, comm->clique.size, comm->cliqueRank);
if (comm->MNNVL) {
// Force the CUMEM handle type to be FABRIC for MNNVL
ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC;
}
return comm->MNNVL;
fail:
if (comm->clique.ranks) free(comm->clique.ranks);
return 0;
}
#else
// Stub used in the #else branch of the enclosing preprocessor conditional
// (the #if condition itself is outside this excerpt).
// Always reports MNNVL as unavailable by returning 0; `comm` is not consulted.
static int checkMNNVL(struct ncclComm* comm) {
  (void)comm;  // intentionally unused by the stub
  const int mnnvlSupported = 0;
  return mnnvlSupported;
}
#endif
#define TIMER_INIT_TOTAL 0
#define TIMER_INIT_KERNELS 1
#define TIMER_INIT_BOOTSTRAP 2
@@ -1143,12 +1086,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
// AllGather1 - end
timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER];
// MNNVL support
if (nNodes > 1 && !checkMNNVL(comm) && ncclParamMNNVLEnable() == 1) {
// Return an error if the user specifically requested MNNVL support
WARN("MNNVL is not supported on this system");
ret = ncclSystemError;
goto fail;
// Check for MNNVL support
if ((nNodes > 1 && ncclParamMNNVLEnable() != 0) || ncclParamMNNVLEnable() == 1) {
NCCLCHECKGOTO(ncclMnnvlCheck(comm), ret, fail);
}
do {
@@ -1559,7 +1499,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
comm->collNetSupport = 0;
}
}
comm->isAllNvlink = ncclTopoPathAllNVLink(comm->topo);
NCCLCHECK(ncclTopoPathAllNVLink(comm->topo, &comm->isAllNvlink));
comm->isOneRPN = (comm->maxLocalRanks == 1);
NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
@@ -1917,6 +1857,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
int cudaDev = job->cudaDev;
int* parentRanks = NULL;
int cudaArch;
int maxSharedMem = 0;
double sum_timers = 0;
uint64_t timers[TIMERS_INIT_COUNT] = {0};
unsigned long long commIdHash;
@@ -1925,12 +1866,13 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
timers[TIMER_INIT_TOTAL] = clockNano();
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
CUDACHECKGOTO(cudaDeviceGetAttribute(&maxSharedMem, cudaDevAttrMaxSharedMemoryPerBlockOptin, cudaDev), res, fail);
CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev), res, fail);
CUDACHECKGOTO(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev), res, fail);
cudaArch = 100*archMajor + 10*archMinor;
timers[TIMER_INIT_KERNELS] = clockNano();
NCCLCHECK(ncclInitKernelsForDevice(cudaArch, &maxLocalSizeBytes));
NCCLCHECK(ncclInitKernelsForDevice(cudaArch, maxSharedMem, &maxLocalSizeBytes));
// Set the maximum kernel stack size of all kernels to avoid
// a CUDA memory reconfig on load (c.f. NVSHMEM issue)
#ifdef USE_INDIRECT_FUNCTION_CALL
@@ -2119,18 +2061,24 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) {
comm->config.cgaClusterSize = cgaClusterSizeEnv;
} else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) {
WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE);
INFO(NCCL_ENV, "NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE);
comm->config.cgaClusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
}
minCTAsEnv = ncclParamMinCTAs();
if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
comm->config.minCTAs = minCTAsEnv;
if (minCTAsEnv <= 0)
INFO(NCCL_ENV, "NCCL_MIN_CTAS %d is too low, leaving it set at %d", minCTAsEnv, comm->config.minCTAs);
else
comm->config.minCTAs = minCTAsEnv;
}
maxCTAsEnv = ncclParamMaxCTAs();
if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
comm->config.maxCTAs = maxCTAsEnv;
if (maxCTAsEnv <= 0)
INFO(NCCL_ENV, "NCCL_MAX_CTAS %d is too low, leaving it set at %d", maxCTAsEnv, comm->config.maxCTAs);
else
comm->config.maxCTAs = maxCTAsEnv;
}
envNetName = ncclGetEnv("NCCL_NET");
@@ -2151,22 +2099,22 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
/* cap channels if needed */
if (comm->config.minCTAs > MAXCHANNELS) {
WARN("minCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.minCTAs, MAXCHANNELS, MAXCHANNELS);
INFO(NCCL_ENV, "minCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.minCTAs, MAXCHANNELS, MAXCHANNELS);
comm->config.minCTAs = MAXCHANNELS;
}
if (comm->config.maxCTAs > MAXCHANNELS) {
WARN("maxCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.maxCTAs, MAXCHANNELS, MAXCHANNELS);
INFO(NCCL_ENV, "maxCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.maxCTAs, MAXCHANNELS, MAXCHANNELS);
comm->config.maxCTAs = MAXCHANNELS;
}
if (comm->config.minCTAs > comm->config.maxCTAs) {
WARN("minCTAs %d is larger than maxCTAs %d, set both to %d", comm->config.minCTAs, comm->config.maxCTAs, comm->config.maxCTAs);
INFO(NCCL_ENV, "minCTAs %d is larger than maxCTAs %d, set both to %d", comm->config.minCTAs, comm->config.maxCTAs, comm->config.maxCTAs);
comm->config.minCTAs = comm->config.maxCTAs;
}
if (comm->config.splitShare != 1 && comm->config.splitShare != 0) {
WARN("splitShare %d is not a valid value 0/1, set it to 0", comm->config.splitShare);
INFO(NCCL_ENV, "splitShare %d is not a valid value 0/1, set it to 0", comm->config.splitShare);
comm->config.splitShare = 0;
}
@@ -2352,21 +2300,10 @@ fail:
goto exit;
}
// Plain payload struct attached to NVTX ranges of the communicator entry points.
// It is filled as {rank, nranks, cudaDev} by the callers in this file and described
// field-by-field by CommInitRankSchema.
struct NvtxParamsCommInitRank
{
// Rank of the caller (labelled "Rank" in the schema).
int rank;
// Number of ranks (labelled "No. of ranks" in the schema).
int nranks;
// CUDA device index (labelled "CUDA device" in the schema).
int cudaDev;
};
// NVTX payload schema for NvtxParamsCommInitRank: one integer entry per struct field,
// in declaration order.
// NOTE(review): the trailing offsetof(...) initializer presumably sets the entry's byte
// offset inside the payload struct; the layout of nvtxPayloadSchemaEntry_t is not visible
// here, so confirm against the NVTX payload extension header.
constexpr nvtxPayloadSchemaEntry_t CommInitRankSchema[] = {
// First field: the remaining initializers are omitted and therefore value-initialized.
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of ranks", nullptr, 0, offsetof(NvtxParamsCommInitRank, nranks)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommInitRank, cudaDev)},
};
NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
ncclResult_t ncclCommInitRank_impl(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
NCCLCHECK(Recorder::instance().record(rrCommInitRank, nranks, myrank, &commId));
NVTX3_RANGE(NcclNvtxParamsCommInitRank)
// Load the CUDA driver and dlsym hooks (can fail on old drivers)
rocmLibraryInit();
@@ -2374,10 +2311,11 @@ ncclResult_t ncclCommInitRank_impl(ncclComm_t* newcomm, int nranks, ncclUniqueId
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
CUDACHECK(cudaGetDevice(&cudaDev));
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload)
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, 1, &commId, myrank, cudaDev, &config, __func__));
NVTX3_RANGE_ADD_PAYLOAD(CommInitRank, NcclNvtxParamsCommInitRankSchema,
NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev));
return ncclSuccess;
}
@@ -2390,10 +2328,7 @@ ncclResult_t ncclCommInitAll_impl(ncclComm_t* comms, int ndev, const int* devlis
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
int oldDev = 0;
constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"}
};
NVTX3_FUNC_WITH_PARAMS(CommInitAll, CommInitAllSchema, ndev)
NVTX3_RANGE(NcclNvtxParamsCommInitAll);
// Load the CUDA driver and dlsym hooks (can fail on old drivers)
rocmLibraryInit();
@@ -2431,14 +2366,17 @@ ncclResult_t ncclCommInitAll_impl(ncclComm_t* comms, int ndev, const int* devlis
ncclUniqueId uniqueId;
NCCLCHECKGOTO(ncclGetUniqueId(&uniqueId), ret, fail);
NCCLCHECKGOTO(ncclGroupStart(), ret, fail);
NCCLCHECKGOTO(ncclGroupStartInternal(), ret, fail);
for (int i=0; i<ndev; i++) {
// Ignore return codes .. we need to call ncclGroupEnd to clean up anyway
int dev = devlist ? devlist[i] : i;
CUDACHECKGOTO(cudaSetDevice(dev), ret, fail);
ncclCommInitRankDev(comms+i, ndev,1, &uniqueId, i, dev, &config, __func__);
}
NCCLCHECKGOTO(ncclGroupEnd(), ret, fail);
NCCLCHECKGOTO(ncclGroupEndInternal(), ret, fail);
NVTX3_RANGE_ADD_PAYLOAD(CommInitAll, NcclNvtxParamsCommInitAllSchema,
NVTX3_PAYLOAD(comms[0]->commHash, ndev));
exit:
(void)cudaSetDevice(oldDev);
@@ -2465,14 +2403,14 @@ ncclResult_t ncclCommInitRankConfig_impl(ncclComm_t *newcomm, int nranks, ncclUn
ncclResult_t ret = ncclSuccess;
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
ncclConfig_t *internalConfigPtr = NULL;
NVTX3_RANGE(NcclNvtxParamsCommInitRankConfig);
NCCLCHECK(ncclGroupStartInternal());
rocmLibraryInit();
CUDACHECK(cudaGetDevice(&cudaDev));
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommInitRankConfig, CommInitRankSchema, payload)
if (config == NULL)
internalConfigPtr = &internalConfig;
else
@@ -2482,7 +2420,13 @@ ncclResult_t ncclCommInitRankConfig_impl(ncclComm_t *newcomm, int nranks, ncclUn
exit:
ncclGroupErrCheck(ret);
NCCLCHECK(ncclGroupEndInternal());
if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret);
if (newcomm && *newcomm) {
if (!(*newcomm)->config.blocking) {
(void) ncclCommGetAsyncError(*newcomm, &ret);
}
NVTX3_RANGE_ADD_PAYLOAD(CommInitRankConfig, NcclNvtxParamsCommInitRankSchema,
NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev));
}
return ret;
fail:
if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret);
@@ -2491,6 +2435,8 @@ fail:
NCCL_API(ncclResult_t, ncclCommInitRankScalable, ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config);
ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config) {
NVTX3_RANGE(NcclNvtxParamsCommInitRankScalable);
int cudaDev;
ncclResult_t ret = ncclSuccess;
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
@@ -2500,9 +2446,6 @@ ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myran
rocmLibraryInit();
CUDACHECK(cudaGetDevice(&cudaDev));
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommInitRankScalable, CommInitRankSchema, payload)
if (config == NULL)
internalConfigPtr = &internalConfig;
else
@@ -2512,7 +2455,13 @@ ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myran
exit:
ncclGroupErrCheck(ret);
NCCLCHECK(ncclGroupEndInternal());
if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret);
if (newcomm && *newcomm) {
if (!(*newcomm)->config.blocking) {
(void) ncclCommGetAsyncError(*newcomm, &ret);
}
NVTX3_RANGE_ADD_PAYLOAD(CommInitRankScalable, NcclNvtxParamsCommInitRankSchema,
NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev));
}
return ret;
fail:
if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret);
@@ -2590,7 +2539,8 @@ static ncclResult_t commCleanup(ncclComm_t comm) {
NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm);
ncclResult_t ncclCommFinalize_impl(ncclComm_t comm) {
NCCLCHECK(Recorder::instance().record(rrCommFinalize, comm));
NVTX3_FUNC_RANGE_IN(nccl_domain);
NVTX3_RANGE(NcclNvtxParamsCommFinalize);
ncclResult_t ret = ncclSuccess;
struct ncclCommFinalizeAsyncJob *job = NULL;
@@ -2615,7 +2565,13 @@ ncclResult_t ncclCommFinalize_impl(ncclComm_t comm) {
exit:
ncclGroupErrCheck(ret);
NCCLCHECK(ncclGroupEndInternal());
if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); }
if (comm) {
if (!comm->config.blocking) {
NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
}
NVTX3_RANGE_ADD_PAYLOAD(CommFinalize, NcclNvtxParamsCommFinalizeSchema,
NVTX3_PAYLOAD(comm->commHash));
}
return ret;
fail:
free(job);
@@ -2711,8 +2667,8 @@ ncclResult_t ncclCommDestroy_impl(ncclComm_t comm) {
struct ncclCommFinalizeAsyncJob *job = NULL;
ncclResult_t res = ncclSuccess;
NvtxParamsCommInitRank payload{rank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommDestroy, CommInitRankSchema, payload)
NVTX3_FUNC_WITH_PARAMS(CommDestroy, NcclNvtxParamsCommInitRank,
NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev));
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
NCCLCHECK(ncclGroupStartInternal());
@@ -2740,8 +2696,9 @@ fail:
NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
ncclResult_t ncclCommAbort_impl(ncclComm_t comm) {
NCCLCHECK(Recorder::instance().record(rrCommAbort, comm));
NVTX3_RANGE(NcclNvtxParamsCommAbort);
if (comm == NULL) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
return ncclSuccess;
}
NCCLCHECK(ncclGroupStartInternal());
@@ -2762,8 +2719,8 @@ ncclResult_t ncclCommAbort_impl(ncclComm_t comm) {
struct ncclCommFinalizeAsyncJob *job = NULL;
ncclResult_t res = ncclSuccess;
NvtxParamsCommInitRank payload{rank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload)
NVTX3_RANGE_ADD_PAYLOAD(CommAbort, NcclNvtxParamsCommInitRankSchema,
NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev));
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
@@ -2779,29 +2736,13 @@ fail:
goto exit;
}
// Plain payload struct attached to the NVTX range of ncclCommSplit.
// It is filled as {comm->rank, comm->nRanks, comm->cudaDev, color, key} by the caller
// and described field-by-field by CommSplitSchema.
struct NvtxParamsCommSplit {
// Rank in the parent communicator (labelled "Rank" in the schema).
int rank;
// Number of ranks in the parent communicator (labelled "No. of ranks").
int nranks;
// CUDA device index (labelled "CUDA device").
int cudaDev;
// `color` argument passed to ncclCommSplit.
int color;
// `key` argument passed to ncclCommSplit.
int key;
};
// NVTX payload schema for NvtxParamsCommSplit: one integer entry per struct field,
// in declaration order.
// NOTE(review): the trailing offsetof(...) initializer presumably sets the entry's byte
// offset inside the payload struct; confirm against the NVTX payload extension header.
constexpr nvtxPayloadSchemaEntry_t CommSplitSchema[] = {
// First field: the remaining initializers are omitted and therefore value-initialized.
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of ranks", nullptr, 0, offsetof(NvtxParamsCommSplit, nranks)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommSplit, cudaDev)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "color", nullptr, 0, offsetof(NvtxParamsCommSplit, color)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "key", nullptr, 0, offsetof(NvtxParamsCommSplit, key)},
};
NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config);
ncclResult_t ncclCommSplit_impl(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) {
struct ncclCommInitRankAsyncJob *job = NULL;
struct ncclComm* childComm = NCCL_COMM_NULL;
ncclResult_t res = ncclSuccess;
NvtxParamsCommSplit payload{comm->rank, comm->nRanks, comm->cudaDev, color, key};
NVTX3_FUNC_WITH_PARAMS(CommSplit, CommSplitSchema, payload)
NVTX3_RANGE(NcclNvtxParamsCommSplit)
int oldDev;
CUDACHECK(cudaGetDevice(&oldDev));
@@ -2863,6 +2804,12 @@ exit:
(void)cudaSetDevice(oldDev);
(void)ncclGroupErrCheck(res);
NCCLCHECK(ncclGroupEndInternal());
if (res == ncclSuccess && *newcomm) {
NVTX3_RANGE_ADD_PAYLOAD(CommSplit, NcclNvtxParamsCommSplitSchema,
NVTX3_PAYLOAD((*newcomm)->commHash, comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, color, key));
}
return res;
fail:
if (childComm) {
+2
Ver fichero
@@ -24,6 +24,8 @@ DECLARE_ROCM_PFN(hsa_init);
DECLARE_ROCM_PFN(hsa_system_get_info);
DECLARE_ROCM_PFN(hsa_status_string);
// Handle type used for cuMemCreate()
CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
static void *hsaLib;
static uint16_t version_major, version_minor;
+82
Ver fichero
@@ -0,0 +1,82 @@
/*************************************************************************
* Copyright (c) 2015-2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "mnnvl.h"
#include "transport.h"
#include <cuda.h>
#include "rocmwrap.h"
// Determine if MNNVL support is available
ncclResult_t ncclMnnvlCheck(struct ncclComm* comm) {
// MNNVL requires cuMem to be enabled
if (!ncclCuMemEnable()) return ncclSuccess;
// MNNVL also requires FABRIC handle support
int cudaDev;
int flag = 0;
CUdevice currentDev;
CUDACHECK(cudaGetDevice(&cudaDev));
CUDACHECK(cuDeviceGet(&currentDev, cudaDev));
// Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported
(void) cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev);
if (!flag) return ncclSuccess;
// Check that all ranks have initialized the fabric fully
for (int i = 0; i < comm->nRanks; i++) {
if (comm->peerInfo[i].fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED) return ncclSuccess;
}
// Determine our MNNVL domain/clique
NCCLCHECK(ncclCalloc(&comm->clique.ranks, comm->nRanks));
comm->clique.id = comm->peerInfo[comm->rank].fabricInfo.cliqueId;
for (int i = 0; i < comm->nRanks; i++) {
nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[comm->rank].fabricInfo;
nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo;
// Check if the cluster UUID and cliqueId match
// A zero UUID means we don't have MNNVL fabric info - disable MNNVL
if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess;
if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
(fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
if (i == comm->rank) {
comm->cliqueRank = comm->clique.size;
}
comm->clique.ranks[comm->clique.size++] = i;
}
}
// No MNNVL clique found
if (comm->clique.size <= 1) return ncclSuccess;
// Check that FABRIC handles can be exported & imported by IMEX
{
void *ptr = NULL;
CUmemGenericAllocationHandle handle;
ncclCuDesc cuDesc;
CUresult err;
// Allocate FABRIC handle compatible memory
ncclResult_t ret = ncclCuMemAlloc(&ptr, &handle, CU_MEM_HANDLE_TYPE_FABRIC, CUDA_IPC_MIN);
if (ret != ncclSuccess) return ncclSuccess;
err = cuMemExportToShareableHandle(&cuDesc, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0);
if (err != CUDA_SUCCESS ||
(err = cuMemImportFromShareableHandle(&handle, &cuDesc, CU_MEM_HANDLE_TYPE_FABRIC)) != CUDA_SUCCESS) {
const char *errStr;
(void) cuGetErrorString(err, &errStr);
NCCLCHECK(ncclCuMemFree(ptr));
// Return an error if this is a MNNVL capable system but it's not working
WARN("MNNVL (cliqueSize %d) is available but not supported on this system. Check the IMEX configuration.",
comm->clique.size);
return ncclSystemError;
}
NCCLCHECK(ncclCuMemFree(ptr));
// Force the CUMEM handle type to be FABRIC for MNNVL
ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC;
comm->MNNVL = 1;
INFO(NCCL_INIT, "MNNVL %d cliqueId %x cliqueSize %d cliqueRank %d",
comm->MNNVL, comm->clique.id, comm->clique.size, comm->cliqueRank);
}
return ncclSuccess;
}
+3 -15
Ver fichero
@@ -8,6 +8,7 @@
#include "msccl/msccl_setup.h"
#include "msccl/msccl_status.h"
#include "api_trace.h"
#include "nvtx_payload_schemas.h"
#include <cstdio>
#include <cstdlib>
@@ -49,21 +50,8 @@ ncclResult_t mscclRunAlgo_impl(
size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream) {
Recorder::instance().record("mscclRunAlgo");
struct NvtxParamsMsccl {
size_t bytes;
ncclRedOp_t op;
ncclDataType_t dataType;
};
// Just pass the size of one send/recv messages and not the total bytes sent/received.
constexpr nvtxPayloadSchemaEntry_t MscclSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsMsccl, op)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
offsetof(NvtxParamsMsccl, dataType)}
};
NvtxParamsMsccl payload{count * ncclTypeSize(dataType), op, dataType};
NVTX3_FUNC_WITH_PARAMS(MSCCL, MscclSchema, payload)
NVTX3_FUNC_WITH_PARAMS(MSCCL, NcclNvtxParamsMSCCL,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(dataType), op, dataType));
mscclStatus& status = mscclGetStatus(comm->rank);
struct mscclAlgo* hostAlgo = status.hostAlgos[mscclAlgoHandle];
+21 -15
Ver fichero
@@ -23,6 +23,8 @@
#include <sys/time.h>
#include <sched.h>
#define NCCL_MAX_PROXY_CONNECTIONS (NCCL_MAX_LOCAL_RANKS+1)
void* ncclProxyServiceUDS(void* _args);
static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
@@ -799,8 +801,8 @@ process_nextops:
ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlAppend);
TIME_START(2);
int freeOp[NCCL_MAX_LOCAL_RANKS];
int freeOpEnd[NCCL_MAX_LOCAL_RANKS];
int freeOp[NCCL_MAX_PROXY_CONNECTIONS];
int freeOpEnd[NCCL_MAX_PROXY_CONNECTIONS];
for (int i = 0; i < proxyState->tpLocalnRanks; i++) freeOp[i] = -1;
uint64_t lastOpCount = 0;
@@ -1090,7 +1092,8 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
struct ncclProxyState* sharedProxyState = comm->proxyState;
int tpProxyRank = comm->topParentRanks[proxyRank];
proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
proxyConn->sameProcess = ((comm->peerInfo[proxyRank].hostHash == comm->peerInfo[comm->rank].hostHash) &&
(comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash)) ? 1 : 0;
// Keep one connection per local rank
proxyConn->connection = NULL;
proxyConn->tpRank = tpProxyRank;
@@ -1223,7 +1226,7 @@ fail:
goto exit;
}
const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd" };
const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd", "QueryFd", "Register", "Deregister" };
ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) {
struct ncclSocket* sock;
ncclResult_t ret = ncclSuccess;
@@ -1583,18 +1586,18 @@ void* ncclProxyService(void* _args) {
connectionPool.banks = 0;
connectionPool.offset = NCCL_PROXY_CONN_POOL_SIZE;
struct pollfd pollfds[NCCL_MAX_LOCAL_RANKS+1];
struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS];
memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS);
for (int s=0; s<NCCL_MAX_LOCAL_RANKS; s++) {
struct pollfd pollfds[NCCL_MAX_PROXY_CONNECTIONS+1]; // one extra for listenSock fd
struct ncclProxyLocalPeer peers[NCCL_MAX_PROXY_CONNECTIONS];
memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_PROXY_CONNECTIONS);
for (int s=0; s<NCCL_MAX_PROXY_CONNECTIONS; s++) {
pollfds[s].fd = -1;
pollfds[s].events = POLLHUP|POLLIN;
}
if (ncclSocketGetFd(proxyState->listenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) {
if (ncclSocketGetFd(proxyState->listenSock, &pollfds[NCCL_MAX_PROXY_CONNECTIONS].fd) != ncclSuccess) {
WARN("[Proxy Service] Get listenSock fd fails");
return NULL;
};
pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN;
pollfds[NCCL_MAX_PROXY_CONNECTIONS].events = POLLIN;
int maxnpeers = 0;
int npeers = 0;
@@ -1608,17 +1611,19 @@ void* ncclProxyService(void* _args) {
/* Never let the proxy service thread block in poll, or it cannot receive abortFlag. */
int ret;
do {
ret = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : 500);
// poll all fds including the listenSock
ret = poll(pollfds, NCCL_MAX_PROXY_CONNECTIONS+1, asyncOpCount ? 0 : 500);
} while (ret < 0 && errno == EINTR);
if (ret < 0) {
WARN("[Proxy Service] Poll failed: %s", strerror(errno));
return NULL;
}
if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) {
if (pollfds[NCCL_MAX_PROXY_CONNECTIONS].revents) {
// We got an event on the listenSock
int s = 0;
while (s < NCCL_MAX_LOCAL_RANKS && pollfds[s].fd >= 0) s++;
if (s == NCCL_MAX_LOCAL_RANKS) {
WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_LOCAL_RANKS);
while (s < NCCL_MAX_PROXY_CONNECTIONS && pollfds[s].fd >= 0) s++;
if (s == NCCL_MAX_PROXY_CONNECTIONS) {
WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_PROXY_CONNECTIONS);
return NULL;
}
if (maxnpeers < s+1) maxnpeers = s+1;
@@ -1851,6 +1856,7 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
if (*comm->abortFlag == 0 && sharedProxyState->peerAddresses) {
// We need to send a ncclProxyMsgStop message to our own proxy
struct ncclSocket sock;
int type = ncclProxyMsgStop;
NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag));
+3 -3
Ver fichero
@@ -80,7 +80,7 @@ static int rasOutBufferSize = 0;
// We use them all over the place; no point in wasting the stack...
static char lineBuf[1024]; // Temporary buffer used for printing at most 10 (RAS_CLIENT_DETAIL_THRESHOLD) rank numbers
// or for printing the local GPU devices, which can't be more than 64 (NCCL_MAX_LOCAL_RANKS)
// or for printing the local GPU devices, which can't be more than 64
// small numbers (times two if the NVML mask is different than the CUDA mask).
// Still, 1024 should normally be plenty (verbose output may make things more difficult,
// but we do check for overflows, so it will just be trimmed).
@@ -1687,7 +1687,7 @@ static int rasCommRanksCollOpCompare(const void* p1, const void* p2) {
const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size) {
bool first = true;
buf[0] = '\0';
for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++)
for (int i = 0; i < sizeof(cudaDevs)*8; i++)
if (cudaDevs & (1UL << i)) {
snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i);
first = false;
@@ -1695,7 +1695,7 @@ const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf,
if (cudaDevs != nvmlDevs) {
snprintf(buf+strlen(buf), size-strlen(buf), " (NVML ");
first = true;
for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++)
for (int i = 0; i < sizeof(nvmlDevs)*8; i++)
if (nvmlDevs & (1UL << i)) {
snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i);
first = false;
+1 -1
Ver fichero
@@ -78,7 +78,7 @@ struct rasCollResponse {
struct rasPeerInfo {
union ncclSocketAddress addr;
pid_t pid;
uint64_t cudaDevs; // Bitmask. Conveniently, NCCL_MAX_LOCAL_RANKS == 64.
uint64_t cudaDevs; // Bitmask. This is for local devices so 64 bits is enough.
uint64_t nvmlDevs; // Same, but not affected by CUDA_VISIBLE_DEVICES.
};
+11 -7
Ver fichero
@@ -73,15 +73,19 @@ ncclResult_t ncclRegisterCollNvlsBuffers(
if (nvlsReged) {
*regNeedConnect = 0;
/* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to
* saturate bandwidth. */
/* Tweak NVLS channel usage so that registered NVLS buffers saturate bandwidth. */
if (comm->nNodes == 1) {
if (info->func == ncclFuncReduceScatter)
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5));
else
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4));
if (info->func == ncclFuncReduceScatter) {
// RS: Further tweaks for Blackwell with NVLS registered buffers
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 6 : 5));
}
else {
// AR/AG: Further tweaks for Blackwell with NVLS registered buffers
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 8 : 4));
}
} else {
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6));
// Further tweaks for Blackwell with NVLS registered buffers
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 7 : 6));
}
info->regBufType |= NCCL_NVLS_REG_BUFFER;
}
+9 -3
Ver fichero
@@ -141,9 +141,11 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr,
#include "channel.h"
#define NVLS_MEM_ALIGN_SIZE (1 << 21)
#define NVLS_NCHANNELS_SM90 16
#define NVLS_NCHANNELS_SM100 32
NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 2);
NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16);
NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", -2);
NCCL_PARAM(NvlsChunkSize, "NVLS_CHUNKSIZE", 128*1024);
ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
@@ -152,7 +154,7 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
int gpuCount;
NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount));
if (!ncclParamNvlsEnable() || ((!comm->MNNVL && gpuCount <= 2) || (comm->MNNVL && comm->clique.size <= 2))) return ncclSuccess;
if (!ncclParamNvlsEnable() || gpuCount <= 2) return ncclSuccess;
CUdevice dev;
int driverVersion;
@@ -170,7 +172,11 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
}
INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev);
if (comm->nvlsSupport == 1) comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (int)ncclParamNvlsChannels()));
if (comm->nvlsSupport) {
int channels = (comm->compCap >= 100) ? NVLS_NCHANNELS_SM100 : NVLS_NCHANNELS_SM90;
if (ncclParamNvlsChannels() >= 0) channels = ncclParamNvlsChannels();
comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, channels));
}
return ncclSuccess;
}
+9 -5
Ver fichero
@@ -228,7 +228,7 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int refcount, ncclIpcDe
// cuMem API support
CUmemGenericAllocationHandle handle;
NCCLCHECK(ncclCuMemAlloc(ptr, &handle, size));
NCCLCHECK(ncclCuMemAlloc(ptr, &handle, type, size));
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
// Return the native cuMem handle for later Export/Import via UDS
memcpy(&ipcDesc->cuDesc.data, &handle, sizeof(handle));
@@ -852,7 +852,7 @@ ncclResult_t ret = ncclSuccess;
if (isLegacyIpc) *isLegacyIpc = false;
if (regRecord) {
// buffer was registered by users, we need to start to register or reuse it
int peerLocalRank;
int peerLocalRank = -1;
for (int p = 0; p < nPeers; p++) {
int peerRank = peerRanks[p];
peerLocalRank = comm->rankToLocalRank[peerRank];
@@ -924,8 +924,10 @@ ncclResult_t ret = ncclSuccess;
ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr;
// Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side
// and get the remote register address back.
if (proxyConn)
if (proxyConn) {
INFO(NCCL_REG, "rank %d - IPC registering buffer %p size %ld (baseAddr %p size %ld) to peer %d", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank);
NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail);
}
if (rmtRegAddr) {
NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail);
assert(regRecord->ipcInfos[peerLocalRank] == NULL);
@@ -943,7 +945,7 @@ ncclResult_t ret = ncclSuccess;
regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr;
needUpdate = true;
*regBufFlag = 1;
INFO(NCCL_REG, "rank %d - IPC register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr);
INFO(NCCL_REG, "rank %d - IPC registered buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr);
}
}
}
@@ -1077,6 +1079,8 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str
assert(sizeof(struct p2pIpcExpInfo) == reqSize);
assert(sizeof(void*) == respSize);
INFO(NCCL_REG, "Proxy rank %d register reqBuff %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, reqBuff, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess);
// request peer passes all necessary buffer info to import. The proxy thread would register
// the buffer locally and return register addr back
if (ipcExpInfo->legacyIpcCap) {
@@ -1110,7 +1114,7 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str
regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset);
#endif
}
INFO(NCCL_REG, "Proxy rank %d register succeeds, regAddr %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, regAddr, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess);
INFO(NCCL_REG, "Proxy rank %d register success regAddr %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, regAddr, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess);
exit:
memcpy(respBuff, (void*)&regAddr, sizeof(void*));