Merge remote-tracking branch 'nccl/master' into develop
This commit is contained in:
@@ -419,6 +419,7 @@ set(SRC_FILES
|
||||
src/group.cc
|
||||
src/init.cc
|
||||
src/init_nvtx.cc
|
||||
src/mnnvl.cc
|
||||
src/net.cc
|
||||
src/msccl.cc
|
||||
src/proxy.cc
|
||||
@@ -485,6 +486,7 @@ set(SRC_FILES
|
||||
src/include/ibvwrap.h
|
||||
src/include/info.h
|
||||
src/include/ipcsocket.h
|
||||
src/include/mnnvl.h
|
||||
src/include/nccl_common.h
|
||||
src/include/nccl_net.h
|
||||
src/include/nccl_profiler.h
|
||||
@@ -493,6 +495,7 @@ set(SRC_FILES
|
||||
src/include/net.h
|
||||
src/include/nvmlwrap.h
|
||||
src/include/nvtx.h
|
||||
src/include/nvtx_payload_schemas.h
|
||||
src/include/nvtx_stub.h
|
||||
src/include/p2p.h
|
||||
src/include/param.h
|
||||
|
||||
@@ -0,0 +1,318 @@
|
||||
# NCCL Profiler Plugin Documentation
|
||||
|
||||
This page describes the NCCL Profiler plugin API and how to implement a profiler plugin for NCCL.
|
||||
|
||||
# Overview
|
||||
|
||||
To allow NCCL to better integrate with DL frameworks, NCCL v2.23 introduced a profiler plugin
|
||||
interface. Any NCCL user can write profiler plugins to extract performance data from NCCL and
|
||||
use it for debugging and analysis.
|
||||
|
||||
Similarly to other plugins (e.g., network plugin), the profiler plugins come as a shared library
|
||||
called `libnccl-profiler.so`. That shared library contains one or more implementations of the
|
||||
NCCL PROFILER API, in the form of versioned structs, filled with pointers to all required
|
||||
functions.
|
||||
|
||||
# Plugin architecture
|
||||
|
||||
## Plugin name and supporting multiple profiler plugins
|
||||
|
||||
When NCCL is initialized, it will look for a `libnccl-profiler.so` library and dynamically load
|
||||
it, then look for symbols inside the library.
|
||||
|
||||
The `NCCL_PROFILER_PLUGIN` environment variable allows multiple plugins to coexist. If set, NCCL
|
||||
will look for a library with a name of `libnccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. It is therefore
|
||||
advised to name the library following that pattern, with a symlink pointing `libnccl-profiler.so`
|
||||
to `libnccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. That way, if there are multiple plugins in the
|
||||
path, setting `NCCL_PROFILER_PLUGIN` will allow users to select the right plugin. Alternatively,
|
||||
the user can also set `NCCL_PROFILER_PLUGIN` to the pathname of the `libnccl-profiler.so` library.
|
||||
|
||||
## Struct versioning
|
||||
|
||||
Once a library is found, NCCL will look for a symbol named `ncclProfiler_vX`, with `X` increasing
|
||||
over time. The versioning ensures that the plugin and the NCCL core are compatible.
|
||||
|
||||
Plugins are encouraged to provide multiple of those symbols, implementing multiple versions of the
|
||||
NCCL PROFILER API, so that the same plugin can be compiled and support a wide range of NCCL versions.
|
||||
|
||||
Conversely, and to ease transition, NCCL can choose to support different plugin versions, looking
|
||||
for the latest ncclProfiler struct version, but also looking for older ones so that older plugins
|
||||
would still work.
|
||||
|
||||
## Headers management
|
||||
|
||||
To help users build plugins effortlessly, plugins should copy the `ncclProfiler_vX` definitions
|
||||
they support to their internal includes. An example is shown in `ext-profiler/example` where we
|
||||
keep all headers in the `nccl/` directory and provide thin layers to implement old versions on top
|
||||
of newer ones.
|
||||
|
||||
The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
|
||||
from old API versions. It also provides error codes in `err.h`.
|
||||
|
||||
# API (v2)
|
||||
|
||||
Below is the main `ncclProfiler_v2` struct. Each function is explained in later sections.
|
||||
|
||||
```
|
||||
typedef struct {
|
||||
const char* name;
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v2_t;
|
||||
```
|
||||
|
||||
## Error codes
|
||||
|
||||
As a rule of thumb, profiler-generated errors should not be propagated to NCCL and alter its normal
|
||||
functioning. Nevertheless, the profiler interface returns NCCL error codes, in case any need for
|
||||
them arises in the future. For now, any profiler interface call should only return `ncclSuccess`.
|
||||
The only exception is `init` that can return an error so that NCCL can disable the plugin.
|
||||
|
||||
## Operation overview
|
||||
|
||||
NCCL will call the `init` function first for every new communicator that is initialized. The profiler
|
||||
returns an opaque context handle that is used to isolate profiler instances across communicators.
|
||||
Similarly, NCCL will call `finalize` to destroy the profiler context, thus freeing resources.
|
||||
|
||||
The NCCL core code is instrumented with calls to `startEvent`, `stopEvent` and `recordEventState`.
|
||||
These are used to start, stop and update events in the profiler, respectively.
|
||||
|
||||
## API Functions
|
||||
|
||||
### Initialization
|
||||
|
||||
#### name
|
||||
|
||||
The `name` field should point to a character string with the name of the profiler plugin. This will
|
||||
be used for all logging, especially when `NCCL_DEBUG=INFO` is set.
|
||||
|
||||
#### init
|
||||
|
||||
As soon as NCCL finds the plugin and the correct ncclProfiler symbol, it calls its `init` function.
|
||||
This allows the plugin to initialize its internal context, used during profiling of NCCL events.
|
||||
If the `init` function does not return `ncclSuccess`, NCCL disables the plugin.
|
||||
|
||||
#### finalize
|
||||
|
||||
When the profiler is no longer needed, a call to `finalize` destroys the profiler context and frees
|
||||
up resources.
|
||||
|
||||
### Profiling
|
||||
|
||||
#### startEvent
|
||||
|
||||
When NCCL needs to start profiling a new event it calls `startEvent`. `startEvent` takes the profiler
|
||||
context, previously created by `init`, an event descriptor of type `ncclProfilerEventDescr_v2_t` and
|
||||
returns an opaque profiler event handle that can be passed to other profiler functions, as discussed
|
||||
later in the document.
|
||||
|
||||
|
||||
The event descriptor contains all the event metadata. Every event type has its own descriptor. Below
|
||||
is the `ncclProfilerEventDescr_v2_t` struct.
|
||||
|
||||
```
|
||||
typedef struct {
|
||||
uint8_t type; // event type (e.g., ncclProfileGroup, ncclProfileColl, ...)
|
||||
void* parentObj; // pointer to parent event used to expose the event hierarchy to the profiler
|
||||
int rank; // rank that generated the event
|
||||
union {
|
||||
struct { // collective events metadata
|
||||
const char* name; // string containing name of the communicator
|
||||
uint64_t commHash; // unique hash/id for the communicator
|
||||
uint64_t seqNumber; // sequence number of this collective operation in the communicator
|
||||
const char* func; // string containing name of the collective
|
||||
void const* sendBuff; // address of send buffer
|
||||
void* recvBuff; // address of recv buffer
|
||||
size_t count; // data count
|
||||
int root; // root rank
|
||||
const char* datatype; // string containing the name of the datatype
|
||||
size_t trafficBytes; // number of transfer bytes
|
||||
uint8_t nMaxChannels; // max number of channels for this collective
|
||||
uint8_t nWarps; // number of GPU warps for this collective
|
||||
const char* algo; // string containing name of the algorithm for this collective
|
||||
const char* proto; // string containing name of the protocol for this collective
|
||||
} coll;
|
||||
|
||||
struct { // point-to-point events metadata
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
const char* func;
|
||||
void* buff;
|
||||
const char* datatype;
|
||||
size_t count;
|
||||
int peer; // peer rank for this point-to-point
|
||||
} p2p;
|
||||
|
||||
struct { // proxyOp events metadata
|
||||
pid_t pid; // process id that generated the associated `ncclProxyOp` object
|
||||
uint8_t channelId; // id of the channel used by the associated `ncclProxyOp` object
|
||||
int peer; // peer rank
|
||||
int nSteps; // number of network transfers/steps required by the `ncclProxyOp`
|
||||
int chunkSize; // chunk size for this `ncclProxyOp`
|
||||
int isSend; // set to 1 for sends and 0 for recvs
|
||||
} proxyOp;
|
||||
|
||||
struct { // proxyStep events metadata
|
||||
int step; // individual step in `ncclProxyOp`
|
||||
} proxyStep;
|
||||
};
|
||||
} ncclProfilerEventDescr_v2_t;
|
||||
```
|
||||
|
||||
NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
|
||||
`ncclProfileProxyOp`, `ncclProfileProxyStep`, and `ncclProfileProxyCtrl`.
|
||||
|
||||
#### stopEvent
|
||||
|
||||
`stopEvent` takes the event handle returned by `startEvent` to stop the event. After the event
|
||||
has been stopped the handle can no longer be used with other profiler calls. Using the event
|
||||
handle after `stopEvent` is undefined behavior.
|
||||
|
||||
#### recordEventState
|
||||
|
||||
Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl`,
|
||||
`ncclProfileP2p`, cannot be updated through calls to `recordEventState`.
|
||||
|
||||
`ncclProfileProxyOp`, `ncclProfileProxyStep` and `ncclProfileProxyCtrl` can be updated through
|
||||
calls to `recordEventState`.
|
||||
|
||||
The state of proxy generated events can be updated, along with event attributes, using
|
||||
`recordEventState`. These events can go through several states during their lifecycle.
|
||||
The list of supported states for the proxy-defined events is reported below.
|
||||
|
||||
```
|
||||
typedef enum {
|
||||
// ncclProfileProxyOp event states
|
||||
ncclProfilerProxyOpSendPosted, // state marks the posting of send buffer to GPU for given network transfer/step
|
||||
ncclProfilerProxyOpSendRemFifoWait, // state marks the waiting of CTS credits from peer rank
|
||||
ncclProfilerProxyOpSendTransmitted, // state marks the sending of network transfer/step to peer rank
|
||||
ncclProfilerProxyOpSendDone, // state marks the ending of network transfer/step
|
||||
ncclProfilerProxyOpRecvPosted, // state marks the posting of recv to network for given network transfer/step
|
||||
ncclProfilerProxyOpRecvReceived, // state marks the recving of network transfer/step from peer rank
|
||||
ncclProfilerProxyOpRecvTransmitted, // state marks the ending of the network transfer/step
|
||||
ncclProfilerProxyOpRecvDone, // state marks the consuming of data from GPU
|
||||
|
||||
// ncclProfileProxyStep event states
|
||||
ncclProfilerProxyStepSendGPUWait, // state marks the waiting of send data from GPU for given network transfer/step
|
||||
ncclProfilerProxyStepSendWait, // state marks the waiting of send data from network for given network transfer/step
|
||||
ncclProfilerProxyStepRecvWait, // state marks the waiting of recv data from network for given network transfer/step
|
||||
ncclProfilerProxyStepRecvFlushWait, // state marks the waiting of recv data flush to GPU for given network transfer/step
|
||||
ncclProfilerProxyStepRecvGPUWait, // state marks the waiting of recv data consumption from GPU for given network transfer/step
|
||||
|
||||
// ncclProfileProxyCtrl event states
|
||||
ncclProfilerProxyCtrlIdle, // state marks proxy progress thread idle
|
||||
ncclProfilerProxyCtrlActive, // state marks proxy progress thread active
|
||||
ncclProfilerProxyCtrlSleep, // state marks proxy progress thread sleeping
|
||||
ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up
|
||||
ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin
|
||||
ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end
|
||||
} ncclProfilerEventState_v2_t;
|
||||
```
|
||||
|
||||
`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
|
||||
network requests for the GPU kernel. ProxyOp events are generated for every active channel and
|
||||
provide a summary of the activity of the proxy progress thread for that channel.
|
||||
|
||||
`ncclProfileProxyStep` events are generated by the proxy progress thread while it is processing
|
||||
network requests for the GPU kernel. ProxyStep events describe individual network transfers in
|
||||
the channel. Thus, they provide a more fine-grained view w.r.t. ProxyOp events.
|
||||
|
||||
`ncclProfileProxyCtrl` events are generated by the proxy progress thread while it is not processing
|
||||
network requests for the GPU kernel. This includes everything else that the proxy thread might be
|
||||
doing, including appending new `ncclProxyOp` objects to the list of work elements to process.
|
||||
|
||||
State transitions for the events described can also come with event attribute updates. For this
|
||||
reason the profiler defines the `ncclProfilerEventStateArgs_v2_t` union, reported below.
|
||||
|
||||
```
|
||||
typedef union {
|
||||
struct { // attributes to update for ncclProfileProxyOp events
|
||||
size_t transSize; // data transferred thus far
|
||||
int steps; // network transfer/steps processed thus far
|
||||
} proxyOp;
|
||||
|
||||
struct { // attributes to update for ncclProfileProxyCtrl
|
||||
int appendedProxyOps; // number of appended proxy ops thus far
|
||||
} proxyCtrl;
|
||||
} ncclProfilerEventStateArgs_v2_t;
|
||||
```
|
||||
|
||||
The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
|
||||
|
||||
### Event hierarchy
|
||||
|
||||
NCCL core events (reported above) are organized into a hierarchy as reported below:
|
||||
|
||||
```
|
||||
Group event
|
||||
|
|
||||
+- Collective event
|
||||
| |
|
||||
| +- ProxyOp event
|
||||
| |
|
||||
| +- ProxyStep event
|
||||
|
|
||||
+- Point-to-point event
|
||||
|
|
||||
+- ProxyOp event
|
||||
|
|
||||
+- ProxyStep event
|
||||
|
||||
ProxyCtrl event
|
||||
```
|
||||
|
||||
# Profiler instrumentation and logging
|
||||
|
||||
## Profiling of collective and p2p operations
|
||||
|
||||
The NCCL code is instrumented with profiler callbacks at different levels to capture start/stop of groups,
|
||||
collective and point-to-point operations, as well as proxy progress activity. Due to the asynchronous nature
|
||||
of NCCL operations, events associated to collective and point-to-point operations are not easy to delimit
|
||||
precisely. For example, without proxy and/or kernel activity it is impossible for the profiler to
|
||||
figure out when a collective operation completes. Therefore, `stopEvent` for collectives simply indicates to
|
||||
the profiler that the collective has been enqueued. The profiler can leverage proxy event information, if
|
||||
these are enabled, to estimate when the collective ends. In this case, the profiler can look at the `stopEvent`
|
||||
call of the last `ncclProfileProxyOp` event to mark the completion of the associated collective event. This
|
||||
can be achieved by reference counting the collective event and letting calls to `startEvent` and `stopEvent`
|
||||
increment and decrement the reference counter, respectively.
|
||||
|
||||
## PXN
|
||||
|
||||
PXN causes some proxy operations to be processed in a remote proxy thread that differs from the one that
|
||||
generated the operation. When this happens, the event hierarchy reported above breaks. Because the
|
||||
profiler can use the hierarchy information, provided by NCCL in the event descriptor, to dereference the
|
||||
parent event during `startEvent`, the remote proxy thread must be in the same address space of the proxy
|
||||
thread originating the operation. To prevent the profiler instance in the remote proxy address space from
|
||||
dereferencing a pointer from another address space, the event descriptor includes the PID of the originator.
|
||||
The profiler plugin needs to check that the originator PID matches the local PID before dereferencing the
|
||||
parent event.
|
||||
@@ -0,0 +1,239 @@
|
||||
# NCCL Example Profiler Plugin Usage
|
||||
|
||||
This page describes how to use the NCCL example profiler plugin.
|
||||
|
||||
# Overview
|
||||
|
||||
The example profiler plugin implements the NCCL profiler plugin API introduced in NCCL v2.23. The API
|
||||
defines a set of events and data structures that NCCL uses to share event information with profiler
|
||||
plugins. The user can control what events are instrumented by NCCL and when traces collected by the
|
||||
profiler should be dumped through environment variables, as described in the rest of the document.
|
||||
The user can also control other profiler parameters that alter its behavior. For example, users can
|
||||
change the size of the event window the profiler keeps track of.
|
||||
|
||||
## Building the profiler plugin
|
||||
|
||||
To use the example plugin, just type `make`. You will need a NCCL build's include directory present.
|
||||
You can override `NCCL_HOME` to where the NCCL installation is on your system.
|
||||
|
||||
## Using the profiler plugin
|
||||
|
||||
1. Add the directory of this profiler plugin to your `LD_LIBRARY_PATH` or set the `NCCL_PROFILER_PLUGIN` environment variable,
|
||||
as documented in `ext-profiler/README.md`.
|
||||
|
||||
2. Set `NCCL_PROFILE_EVENT_MASK` bitmask to specify the NCCL events you want to instrument. By
|
||||
default, all collectives and send/recv operations will be traced. For more details about the event
|
||||
representation used by the profiler refer to `ext-profiler/README.md`.
|
||||
|
||||
As an example, setting:
|
||||
|
||||
`NCCL_PROFILE_EVENT_MASK` to 1 (`ncclProfileGroup`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`)
|
||||
|
||||
enables the profiling of the group, the collective and the proxy op events. The same events can be
|
||||
expressed more concisely by setting `NCCL_PROFILE_EVENT_MASK` to 8 (`ncclProfileProxyOp`). Indeed,
|
||||
in NCCL, all the events above the requested one (in the event hierarchy) are also captured. The advantage
|
||||
is that the profiler can easily correlate events that belong to the same NCCL operation and present
|
||||
them accordingly.
|
||||
|
||||
3. Set `NCCL_PROFILE_DUMP_FILE` to the name of the dump file for the collected traces. A file named
|
||||
`${NCCL_PROFILE_DUMP_FILE}-hostname-tid.txt` is created. Profiler traces are saved using the chrome
|
||||
event format (more precisely, using asynchronous events).
|
||||
|
||||
4. If you set the dump file variable, type chrome://tracing in your Chromium browser's address bar and
|
||||
open the created dump file to visualize the traces.
|
||||
|
||||
# Changing the profiler memory pool sizes
|
||||
|
||||
The example profiler uses separate memory pools for different types of events. The size of these memory
|
||||
pools (i.e., the number of events) determines the number of events that the profiler can keep track of at the
|
||||
same time. When NCCL requests a new event (e.g., collective event) to profile a `ncclAllReduce`
|
||||
operation, by calling `startEvent`, the profiler searches in the collective pool for a free event. If it
|
||||
finds one, it marks it as in use and returns the handle to NCCL. If the pool is completely used the
|
||||
profiler returns `NULL` to NCCL and ignores all the following NCCL profiler calls for the `NULL` event
|
||||
handle. When the `ncclAllReduce` has been processed, NCCL calls `stopEvent` with the previously returned
|
||||
event handle. The profiler has a total of 5 memory pools.
|
||||
|
||||
The group, collective and p2p pools contain objects for the corresponding events. The `ProxyCtrl` pool
|
||||
contains objects for `ProxyCtrl` events and the `ProxyDetach` pool contains objects for `ProxyOp` events
|
||||
generated by remote proxies. A list of pools and their size is reported below:
|
||||
|
||||
- `NCCL_PROFILE_GROUP_POOL_SIZE` (16)
|
||||
- `NCCL_PROFILE_COLL_POOL_SIZE` (16)
|
||||
- `NCCL_PROFILE_P2P_POOL_SIZE` (1024)
|
||||
- `NCCL_PROFILE_PROXY_CTRL_POOL_SIZE` (16)
|
||||
- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (128)
|
||||
|
||||
Remote proxy operations are generated when PXN is in use. Refer to this article for more information
|
||||
about PXN and how it works:
|
||||
https://developer.nvidia.com/blog/doubling-all2all-performance-with-nvidia-collective-communication-library-2-12/
|
||||
|
||||
# Reported events
|
||||
|
||||
The example profiler generates traces using the JSON format. An example trace is reported below:
|
||||
|
||||
```
|
||||
[
|
||||
{"name": "Group", "cat": "GROUP", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764234.611328, "args": {"groupId": 0}},
|
||||
{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764237.294922, "args": {"SeqNum": 0, "CommHash": 673864846479792718, "Rank": 1, "Count": 32768, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "LL", "nMaxChannels": 2}},
|
||||
{"name": "Recv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768464.936523, "args": {"Channel": 0, "Peer": 0, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 772020.300781}, "RECEIVED": {"step": 14, "ts": 772196.049805}, "TRANSMITTED": {"step": 14, "ts": 772197.326172}, "DONE": {"step": 14, "ts": 772201.538086}}},
|
||||
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768465.158203, "args": {"Step": 0}},
|
||||
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805, "args": {"Step": 0}},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266, "args": {"Step": 0}},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805, "args": {"Step": 0}},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768568.276367},
|
||||
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768503.604492, "args": {"Step": 1}},
|
||||
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805, "args": {"Step": 1}},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234, "args": {"Step": 1}},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695, "args": {"Step": 1}},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 770006.914062},
|
||||
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768506.941406, "args": {"Step": 2}},
|
||||
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547, "args": {"Step": 2}},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133, "args": {"Step": 2}},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547, "args": {"Step": 2}},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771468.458008},
|
||||
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768509.484375, "args": {"Step": 3}},
|
||||
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000, "args": {"Step": 3}},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023, "args": {"Step": 3}},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211, "args": {"Step": 3}},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771910.500000},
|
||||
{"name": "Send", "cat": "PROXY", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768482.878906, "args": {"Channel": 0, "Peer": 2, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 771995.675781}, "REM_FIFO_WAIT": {"step": 14, "ts": 772190.692383}, "TRANSMITTED": {"step": 14, "ts": 772191.516602}, "DONE": {"step": 14, "ts": 772208.473633}}},
|
||||
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.019531, "args": {"Step": 0}},
|
||||
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781, "args": {"Step": 0}},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234},
|
||||
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234, "args": {"Step": 0}},
|
||||
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769618.889648},
|
||||
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.083008, "args": {"Step": 1}},
|
||||
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086, "args": {"Step": 1}},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664},
|
||||
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664, "args": {"Step": 1}},
|
||||
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769622.517578},
|
||||
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768507.937500, "args": {"Step": 2}},
|
||||
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578, "args": {"Step": 2}},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883},
|
||||
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883, "args": {"Step": 2}},
|
||||
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770013.848633},
|
||||
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.742188, "args": {"Step": 3}},
|
||||
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266, "args": {"Step": 3}},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477},
|
||||
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477, "args": {"Step": 3}},
|
||||
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771469.171875},
|
||||
... [ trace truncated for brevity ]
|
||||
{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.317383},
|
||||
{"name": "Group", "cat": "GROUP", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.418945},
|
||||
{}]
|
||||
```
|
||||
|
||||
Details about the fields used in the trace can be found at this link:
|
||||
https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw
|
||||
|
||||
The trace above is obtained by running a `ncclAllReduce` operation on 8 GPUs, communicating with each other through
|
||||
the network interface. The `Group` event encloses all traces that are related to the single `ncclAllReduce` call.
|
||||
(Note that for single collective invocations, where there are no explicit group calls, NCCL creates a group with only
|
||||
one collective and this is what is presented in the traces above).
|
||||
|
||||
|
||||
The `AllReduce` event encloses traces for the proxy operation associated to the `ncclAllReduce` operation. The `args`
|
||||
field in the traces contains NCCL specific information (aside from the chrome trace event format).
|
||||
|
||||
## AllReduce trace
|
||||
|
||||
The `AllReduce` entry presents information about the `ncclAllReduce` operation. It contains the following info in the args field:
|
||||
|
||||
- seqNum : sequential number of the collective in the communicator (every collective type has its own sequence number in the communicator)
|
||||
- commHash : communicator unique identifier
|
||||
- rank : NCCL rank for the ncclAllReduce
|
||||
- datatype : NCCL datatype
|
||||
- algorithm : algorithm used to process the ncclAllReduce
|
||||
- protocol : protocol used to process the ncclAllReduce
|
||||
- nMaxChannels: max number of channels used to process the ncclAllReduce
|
||||
|
||||
If the proxy events are not active (e.g., the `ncclAllReduce` is intranode) the end timestamp will match the time
|
||||
consumed by the CPU to launch the collective. For more details refer to `ext-profiler/README.md`, section `Profiling
|
||||
of collective and p2p operations`.
|
||||
|
||||
### Proxy Send
|
||||
The `Send` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
|
||||
info in the args field:
|
||||
|
||||
- Channel : id of the channel used by this proxy operation to send data to the peer
|
||||
- Peer : peer rank
|
||||
- Steps : number of network steps required to transfer transSize bytes to the peer
|
||||
- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread
|
||||
- transSize : bytes transferred across the channel by this proxy operation
|
||||
- POSTED : struct containing the number of buffer posts to the GPU and the time stamp for the last post
|
||||
- REM_FIFO_WAIT: struct containing the number of remote buffer waits and the time stamp for the last wait
|
||||
- TRANSMITTED : struct containing the number of network sends and the time stamp of the last send
|
||||
- DONE : struct containing the number of network sends completed and the time stamp of the last send completed
|
||||
|
||||
In case of a network problem the POSTED, REM_FIFO_WAIT, TRANSMITTED and DONE might all have partially updated steps,
|
||||
which could help identify at which point the network problem occurred.
|
||||
|
||||
The Proxy send trace gives a summary of the proxy progress thread activity for the channel. If more details are
|
||||
needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`), in which case the trace
|
||||
entries below are also reported by the profiler.
|
||||
|
||||
#### Proxy SendBufferWait
|
||||
|
||||
Presents, for every network step, the time the CPU proxy spends waiting for the channel staging buffer to become available.
|
||||
|
||||
#### Proxy SendGPUWait
|
||||
|
||||
Presents, for every network step, the time the CPU proxy spends waiting for the GPU to provide the data in the staging
|
||||
buffer.
|
||||
|
||||
#### Proxy SendWait
|
||||
|
||||
Presents, for every network step, the time the CPU proxy spends waiting for the `isend` to complete.
|
||||
|
||||
### Proxy Recv
|
||||
|
||||
The `Recv` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
|
||||
info in the args field:
|
||||
|
||||
- Channel : id of the channel used by this proxy operation to recv data from the peer
|
||||
- Peer : peer rank
|
||||
- Steps : number of network steps required to transfer transSize bytes from the peer
|
||||
- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread
|
||||
- transSize : bytes transferred across the channel by this proxy operation
|
||||
- POSTED : struct containing the number of recvs posted and the time stamp for the last recv posted
|
||||
- RECEIVED : struct containing the number of recvs completed and the time stamp for the last recv completed
|
||||
- TRANSMITTED: struct containing the number of recvs flushed to the GPU memory and the time stamp for the last recv flushed
|
||||
- DONE : struct containing the number of flushes completed and the time stamp for the last flush completed
|
||||
|
||||
The Proxy Recv trace gives a summary of the proxy progress thread activity for the channel. If more details are
|
||||
needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`), in which case the trace
|
||||
entries below are also reported by the profiler.
|
||||
|
||||
|
||||
#### Proxy RecvBufferWait
|
||||
|
||||
Presents, for every network step, the time the CPU proxy spends waiting for the staging buffer for the channel to
|
||||
become available.
|
||||
|
||||
#### Proxy RecvWait
|
||||
|
||||
Presents, for every network step, the time the CPU proxy spends waiting for a posted `irecv` to complete.
|
||||
|
||||
#### Proxy RecvFlushWait
|
||||
|
||||
Presents, for every network step, the time the CPU proxy spends waiting for the recv data to be flushed to the GPU.
|
||||
|
||||
#### Proxy RecvGPUWait
|
||||
|
||||
Presents, for every network step, the time the CPU proxy spends waiting for the GPU to consume the recv data.
|
||||
@@ -39,14 +39,20 @@ endif
|
||||
CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
|
||||
CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
|
||||
CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90
|
||||
CUDA13_GENCODE = -gencode=arch=compute_100,code=sm_100 \
|
||||
-gencode=arch=compute_120,code=sm_120
|
||||
|
||||
CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
|
||||
CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
|
||||
CUDA11_PTX = -gencode=arch=compute_80,code=compute_80
|
||||
CUDA12_PTX = -gencode=arch=compute_90,code=compute_90
|
||||
CUDA13_PTX = -gencode=arch=compute_120,code=compute_120
|
||||
|
||||
|
||||
ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
|
||||
ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 12; echo $$?),0)
|
||||
# Include Blackwell support if we're using CUDA12.8 or above
|
||||
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX)
|
||||
else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
|
||||
# Include Hopper support if we're using CUDA11.8 or above
|
||||
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX)
|
||||
else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 24
|
||||
NCCL_PATCH := 3
|
||||
NCCL_MINOR := 25
|
||||
NCCL_PATCH := 1
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
@@ -25,7 +25,7 @@ prep : $(DEBTARGETS)
|
||||
build : prep
|
||||
$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
|
||||
@printf "Building Debian package\n"
|
||||
(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b)
|
||||
(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b -Zxz)
|
||||
mkdir -p $(PKGDIR)
|
||||
mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
bin/ncclras /usr/bin
|
||||
include/nccl.h /usr/include
|
||||
include/nccl_net.h /usr/include
|
||||
lib/libnccl.so /usr/lib/${pkg:MultiArch}
|
||||
lib/libnccl_static.a /usr/lib/${pkg:MultiArch}
|
||||
|
||||
@@ -11,3 +11,6 @@ override_dh_auto_test:
|
||||
|
||||
override_dh_auto_clean:
|
||||
# Do not make clean
|
||||
|
||||
override_dh_builddeb:
|
||||
dh_builddeb -- -Zxz
|
||||
|
||||
@@ -20,6 +20,7 @@ sockets.
|
||||
%package devel
|
||||
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
|
||||
Group: Development/Libraries
|
||||
Requires: libnccl >= ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
|
||||
%description devel
|
||||
NCCL development files
|
||||
|
||||
@@ -44,9 +45,10 @@ install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUI
|
||||
ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major}
|
||||
|
||||
# devel
|
||||
install -m 755 -d $RPM_BUILD_ROOT/%{_bindir}
|
||||
install -m 755 -d $RPM_BUILD_ROOT/%{_includedir}
|
||||
install -m 755 bin/ncclras $RPM_BUILD_ROOT/%{_bindir}
|
||||
install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir}
|
||||
install -m 644 include/nccl_net.h $RPM_BUILD_ROOT/%{_includedir}
|
||||
ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so
|
||||
|
||||
# static
|
||||
@@ -64,8 +66,8 @@ rm -rf $RPM_BUILD_ROOT
|
||||
%files devel
|
||||
%doc LICENSE.txt
|
||||
%defattr(-,root,root,-)
|
||||
%{_bindir}/ncclras
|
||||
%{_includedir}/nccl.h
|
||||
%{_includedir}/nccl_net.h
|
||||
%{_libdir}/libnccl.so
|
||||
|
||||
%files static
|
||||
|
||||
@@ -21,4 +21,4 @@ PKG_ARCH=${pkg:Arch}
|
||||
|
||||
NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}"
|
||||
|
||||
tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt
|
||||
tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/bin $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt
|
||||
|
||||
+1
-1
@@ -10,7 +10,7 @@ include ../makefiles/version.mk
|
||||
INCEXPORTS := nccl.h
|
||||
LIBSRCFILES := \
|
||||
bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
|
||||
init.cc init_nvtx.cc net.cc proxy.cc transport.cc \
|
||||
init.cc init_nvtx.cc net.cc proxy.cc transport.cc mnnvl.cc \
|
||||
$(wildcard graph/*.cc) \
|
||||
$(wildcard misc/*.cc) \
|
||||
$(wildcard transport/*.cc) \
|
||||
|
||||
+70
-187
@@ -10,7 +10,7 @@
|
||||
#include "graph/topo.h"
|
||||
#include "nccl.h"
|
||||
#include "api_trace.h"
|
||||
|
||||
#include "nvtx_payload_schemas.h"
|
||||
#include "msccl/msccl_lifecycle.h"
|
||||
|
||||
using namespace rccl;
|
||||
@@ -84,18 +84,8 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size
|
||||
|
||||
ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
|
||||
struct NvtxParamsAllGather {
|
||||
size_t bytes;
|
||||
ncclDataType_t datatype;
|
||||
};
|
||||
// Just pass the size of one message and not the total bytes sent/received.
|
||||
constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
|
||||
offsetof(NvtxParamsAllGather, datatype)}
|
||||
};
|
||||
NvtxParamsAllGather payload{sendcount * ncclTypeSize(datatype), datatype};
|
||||
NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, payload)
|
||||
NVTX3_FUNC_WITH_PARAMS(AllGather, NcclNvtxParamsAllGather,
|
||||
NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcount * ncclTypeSize(datatype), datatype));
|
||||
|
||||
struct ncclInfo info = { ncclFuncAllGather, "AllGather",
|
||||
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
|
||||
@@ -121,20 +111,8 @@ NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size
|
||||
|
||||
ncclResult_t ncclAllReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
|
||||
struct NvtxParamsAllReduce {
|
||||
size_t bytes;
|
||||
ncclRedOp_t op;
|
||||
ncclDataType_t datatype;
|
||||
};
|
||||
// Just pass the size of one message and not the total bytes sent/received.
|
||||
static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = {
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
|
||||
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, offsetof(NvtxParamsAllReduce, op)},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
|
||||
offsetof(NvtxParamsAllReduce, datatype)}
|
||||
};
|
||||
NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op, datatype};
|
||||
NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload)
|
||||
NVTX3_FUNC_WITH_PARAMS(AllReduce, NcclNvtxParamsAllReduce,
|
||||
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), op, datatype));
|
||||
|
||||
struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
|
||||
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
|
||||
@@ -162,25 +140,14 @@ NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_
|
||||
|
||||
ncclResult_t ncclAllToAll_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
NVTX3_FUNC_WITH_PARAMS(AllToAll, NcclNvtxParamsAllToAll,
|
||||
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), datatype));
|
||||
|
||||
if (!mscclIsCaller()) // when msccl falls back to
|
||||
{
|
||||
NCCLCHECK(Recorder::instance().record(rrAllToAll, sendbuff, recvbuff, count, datatype, comm, stream));
|
||||
}
|
||||
|
||||
struct NvtxParamsAllToAll {
|
||||
size_t bytes;
|
||||
ncclDataType_t datatype;
|
||||
};
|
||||
// Just pass the size of one message and not the total bytes sent/received.
|
||||
constexpr nvtxPayloadSchemaEntry_t AllToAllSchema[] = {
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
|
||||
offsetof(NvtxParamsAllToAll, datatype)}
|
||||
};
|
||||
NvtxParamsAllToAll payload{count * ncclTypeSize(datatype), datatype};
|
||||
NVTX3_FUNC_WITH_PARAMS(AllToAll, AllToAllSchema, payload)
|
||||
|
||||
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
|
||||
return mscclEnqueueCheck(
|
||||
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
|
||||
@@ -218,28 +185,15 @@ NCCL_API(ncclResult_t, ncclAllToAllv, const void *sendbuff, const size_t sendcou
|
||||
ncclResult_t ncclAllToAllv_impl(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
|
||||
void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
|
||||
NVTX3_FUNC_WITH_PARAMS(AllToAllv, NcclNvtxParamsAllToAllv,
|
||||
NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcounts[comm->rank] * ncclTypeSize(datatype),
|
||||
recvcounts[comm->rank] * ncclTypeSize(datatype), datatype));
|
||||
|
||||
if (!mscclIsCaller()) // when msccl falls back to
|
||||
{
|
||||
NCCLCHECK(Recorder::instance().record(rrAllToAllv, sendbuff, recvbuff, 0, datatype, comm, stream, -1, sendcounts, sdispls, recvcounts, rdispls));
|
||||
}
|
||||
|
||||
struct NvtxParamsAllToAllv {
|
||||
size_t sendbytes;
|
||||
size_t recvbytes;
|
||||
ncclDataType_t datatype;
|
||||
};
|
||||
// Just pass the size of one send/recv messages and not the total bytes sent/received.
|
||||
constexpr nvtxPayloadSchemaEntry_t AllToAllvSchema[] = {
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes] (Send)"},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes] (Recv)", nullptr, 0,
|
||||
offsetof(NvtxParamsAllToAllv, recvbytes)},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
|
||||
offsetof(NvtxParamsAllToAllv, datatype)}
|
||||
};
|
||||
NvtxParamsAllToAllv payload{sendcounts[comm->rank] * ncclTypeSize(datatype), recvcounts[comm->rank] * ncclTypeSize(datatype), datatype};
|
||||
NVTX3_FUNC_WITH_PARAMS(AllToAllv, AllToAllvSchema, payload)
|
||||
|
||||
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
|
||||
return mscclEnqueueCheck(
|
||||
sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls,
|
||||
@@ -274,19 +228,8 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size
|
||||
|
||||
ncclResult_t ncclBroadcast_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, cudaStream_t stream) {
|
||||
struct NvtxParamsBroadcast {
|
||||
size_t bytes;
|
||||
int root;
|
||||
ncclDataType_t datatype;
|
||||
};
|
||||
constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = {
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
|
||||
offsetof(NvtxParamsBroadcast, datatype)}
|
||||
};
|
||||
NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root, datatype};
|
||||
NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload)
|
||||
NVTX3_FUNC_WITH_PARAMS(Broadcast, NcclNvtxParamsBroadcast,
|
||||
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root, datatype));
|
||||
|
||||
struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
|
||||
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
|
||||
@@ -319,45 +262,34 @@ NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t
|
||||
|
||||
ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
struct NvtxParamsGather {
|
||||
size_t bytes;
|
||||
int root;
|
||||
ncclDataType_t datatype;
|
||||
};
|
||||
constexpr nvtxPayloadSchemaEntry_t GatherSchema[] = {
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsGather, root)},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
|
||||
offsetof(NvtxParamsGather, datatype)}
|
||||
};
|
||||
NvtxParamsGather payload{sendcount * ncclTypeSize(datatype), root, datatype};
|
||||
NVTX3_FUNC_WITH_PARAMS(Gather, GatherSchema, payload)
|
||||
NVTX3_FUNC_WITH_PARAMS(Gather, NcclNvtxParamsGather,
|
||||
NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcount * ncclTypeSize(datatype), root, datatype));
|
||||
|
||||
if (!mscclIsCaller()) // when msccl falls back to
|
||||
{
|
||||
NCCLCHECK(Recorder::instance().record(rrGather, sendbuff, recvbuff, sendcount, datatype, comm, stream, root));
|
||||
}
|
||||
if (!mscclIsCaller()) // when msccl falls back to
|
||||
{
|
||||
NCCLCHECK(Recorder::instance().record(rrGather, sendbuff, recvbuff, sendcount, datatype, comm, stream, root));
|
||||
}
|
||||
|
||||
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
|
||||
return mscclEnqueueCheck(
|
||||
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
|
||||
sendcount, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream);
|
||||
}
|
||||
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
|
||||
return mscclEnqueueCheck(
|
||||
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
|
||||
sendcount, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream);
|
||||
}
|
||||
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
size_t rankOffset = sendcount * ncclTypeSize(datatype);
|
||||
if (sendcount == 0) return ncclSuccess;
|
||||
int rank;
|
||||
NCCLCHECK(ncclCommUserRank(comm, &rank));
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
if (rank == root) {
|
||||
for (int r=0; r<nRanks; r++)
|
||||
NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, sendcount, datatype, r, comm, stream));
|
||||
}
|
||||
NCCLCHECK(ncclSend(sendbuff, sendcount, datatype, root, comm, stream));
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
return ncclSuccess;
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
size_t rankOffset = sendcount * ncclTypeSize(datatype);
|
||||
if (sendcount == 0) return ncclSuccess;
|
||||
int rank;
|
||||
NCCLCHECK(ncclCommUserRank(comm, &rank));
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
if (rank == root) {
|
||||
for (int r=0; r<nRanks; r++)
|
||||
NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, sendcount, datatype, r, comm, stream));
|
||||
}
|
||||
NCCLCHECK(ncclSend(sendbuff, sendcount, datatype, root, comm, stream));
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
|
||||
@@ -365,22 +297,8 @@ NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t
|
||||
|
||||
ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
struct NvtxParamsReduce {
|
||||
size_t bytes;
|
||||
int root;
|
||||
ncclRedOp_t op;
|
||||
ncclDataType_t datatype;
|
||||
};
|
||||
constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
|
||||
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
|
||||
offsetof(NvtxParamsReduce, op)},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
|
||||
offsetof(NvtxParamsReduce, datatype)}
|
||||
};
|
||||
NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op, datatype};
|
||||
NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
|
||||
NVTX3_FUNC_WITH_PARAMS(Reduce, NcclNvtxParamsReduce,
|
||||
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root, op, datatype));
|
||||
|
||||
struct ncclInfo info = { ncclFuncReduce, "Reduce",
|
||||
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
|
||||
@@ -406,20 +324,8 @@ NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff,
|
||||
|
||||
ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
|
||||
struct NvtxParamsReduceScatter {
|
||||
size_t bytes;
|
||||
ncclRedOp_t op;
|
||||
ncclDataType_t datatype;
|
||||
};
|
||||
constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
|
||||
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
|
||||
offsetof(NvtxParamsReduceScatter, op)},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
|
||||
offsetof(NvtxParamsReduceScatter, datatype)}
|
||||
};
|
||||
NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op, datatype};
|
||||
NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)
|
||||
NVTX3_FUNC_WITH_PARAMS(ReduceScatter, NcclNvtxParamsReduceScatter,
|
||||
NVTX3_PAYLOAD(comm ? comm->commHash : 0, recvcount * ncclTypeSize(datatype), op, datatype));
|
||||
|
||||
struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
|
||||
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
|
||||
@@ -445,67 +351,44 @@ NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t
|
||||
|
||||
ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
struct NvtxParamsScatter {
|
||||
size_t bytes;
|
||||
int root;
|
||||
ncclDataType_t datatype;
|
||||
};
|
||||
constexpr nvtxPayloadSchemaEntry_t ScatterSchema[] = {
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsScatter, root)},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
|
||||
offsetof(NvtxParamsScatter, datatype)}
|
||||
};
|
||||
NvtxParamsScatter payload{recvcount * ncclTypeSize(datatype), root, datatype};
|
||||
NVTX3_FUNC_WITH_PARAMS(Scatter, ScatterSchema, payload)
|
||||
NVTX3_FUNC_WITH_PARAMS(Scatter, NcclNvtxParamsScatter,
|
||||
NVTX3_PAYLOAD(comm ? comm->commHash : 0, recvcount * ncclTypeSize(datatype), root, datatype));
|
||||
|
||||
if (!mscclIsCaller()) // when msccl falls back to
|
||||
{
|
||||
NCCLCHECK(Recorder::instance().record(rrScatter, sendbuff, recvbuff, recvcount, datatype, comm, stream, root));
|
||||
}
|
||||
if (!mscclIsCaller()) // when msccl falls back to
|
||||
{
|
||||
NCCLCHECK(Recorder::instance().record(rrScatter, sendbuff, recvbuff, recvcount, datatype, comm, stream, root));
|
||||
}
|
||||
|
||||
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
|
||||
return mscclEnqueueCheck(
|
||||
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
|
||||
recvcount, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream);
|
||||
}
|
||||
if (mscclAvailable(comm->rank) && !mscclIsCaller()) {
|
||||
return mscclEnqueueCheck(
|
||||
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
|
||||
recvcount, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream);
|
||||
}
|
||||
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
size_t rankOffset = recvcount * ncclTypeSize(datatype);
|
||||
if (recvcount == 0) return ncclSuccess;
|
||||
int rank;
|
||||
NCCLCHECK(ncclCommUserRank(comm, &rank));
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
if (rank == root) {
|
||||
for (int r=0; r<nRanks; r++)
|
||||
NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, recvcount, datatype, r, comm, stream));
|
||||
}
|
||||
NCCLCHECK(ncclRecv(recvbuff, recvcount, datatype, root, comm, stream));
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
return ncclSuccess;
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
size_t rankOffset = recvcount * ncclTypeSize(datatype);
|
||||
if (recvcount == 0) return ncclSuccess;
|
||||
int rank;
|
||||
NCCLCHECK(ncclCommUserRank(comm, &rank));
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
if (rank == root) {
|
||||
for (int r=0; r<nRanks; r++)
|
||||
NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, recvcount, datatype, r, comm, stream));
|
||||
}
|
||||
NCCLCHECK(ncclRecv(recvbuff, recvcount, datatype, root, comm, stream));
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct NvtxParamsSendRecv {
|
||||
size_t bytes;
|
||||
int peer;
|
||||
ncclDataType_t datatype;
|
||||
};
|
||||
constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
|
||||
offsetof(NvtxParamsSendRecv, datatype)}
|
||||
};
|
||||
|
||||
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, cudaStream_t stream);
|
||||
|
||||
|
||||
ncclResult_t ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, cudaStream_t stream) {
|
||||
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer, datatype};
|
||||
NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
|
||||
NVTX3_FUNC_WITH_PARAMS(Send, NcclNvtxParamsSendRecv,
|
||||
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), peer, datatype));
|
||||
|
||||
struct ncclInfo info = { ncclFuncSend, "Send",
|
||||
NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
|
||||
@@ -530,8 +413,8 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da
|
||||
|
||||
ncclResult_t ncclRecv_impl(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, cudaStream_t stream) {
|
||||
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer, datatype};
|
||||
NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
|
||||
NVTX3_FUNC_WITH_PARAMS(Recv, NcclNvtxParamsSendRecv,
|
||||
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), peer, datatype));
|
||||
|
||||
struct ncclInfo info = { ncclFuncRecv, "Recv",
|
||||
NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
|
||||
|
||||
@@ -767,7 +767,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
|
||||
int nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
|
||||
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndReduce) {
|
||||
} else if (tid < tidEndReduce && nvls->headRank != -1) {
|
||||
// Reduce, broadcast through NVLS
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
|
||||
+20
-16
@@ -55,25 +55,22 @@ static ncclKernelMatch const ncclKerns[3] = {
|
||||
NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
|
||||
|
||||
// Returns maximum kernel stack size of all CUDA kernels
|
||||
ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
|
||||
ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize) {
|
||||
constexpr int KernelCount = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
|
||||
ncclResult_t result = ncclSuccess;
|
||||
int print = 0;
|
||||
|
||||
if (maxStackSize) *maxStackSize = 0;
|
||||
int carveout = ncclParamL1SharedMemoryCarveout();
|
||||
int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch);
|
||||
|
||||
// Keep track if we already visited a function pointer.
|
||||
void* lru[2] = {nullptr, nullptr};
|
||||
for (int i=0; i < KernelCount; i++) {
|
||||
void* fn = ncclKerns[i].kernelFn;
|
||||
if (fn == lru[0] || fn == lru[1]) goto next_kernel;
|
||||
lru[1] = lru[0];
|
||||
lru[0] = fn;
|
||||
for (int k=0; k < KernelCount; k++) {
|
||||
void* fn = ncclKerns[k].kernelFn;
|
||||
cudaFuncAttributes attr = {0};
|
||||
if (fn == nullptr) continue;
|
||||
|
||||
CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0);
|
||||
if (maxStackSize) {
|
||||
cudaFuncAttributes attr = {0};
|
||||
if (cudaFuncGetAttributes(&attr, fn) != cudaSuccess)
|
||||
WARN("Failed to get kernel attributes");
|
||||
if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes;
|
||||
ignore0:;
|
||||
}
|
||||
@@ -84,10 +81,17 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
|
||||
result, ignore1);
|
||||
ignore1:;
|
||||
}
|
||||
|
||||
if (ncclShmemDynamicSize(cudaArch) != 0) {
|
||||
if (ncclMaxSharedMem != 0) {
|
||||
int sharedMemSize = ncclMaxSharedMem;
|
||||
if (sharedMemSize > (maxSharedMem-attr.sharedSizeBytes)) {
|
||||
if (print++ == 0)
|
||||
INFO(NCCL_INIT, "ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu",
|
||||
sharedMemSize, maxSharedMem-attr.sharedSizeBytes);
|
||||
// Reduce requested MaxDynamicSharedMemorySize attribute
|
||||
sharedMemSize = maxSharedMem - attr.sharedSizeBytes;
|
||||
}
|
||||
CUDACHECKGOTO(cudaFuncSetAttribute(fn,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)),
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize),
|
||||
result, next_kernel);
|
||||
}
|
||||
next_kernel:;
|
||||
@@ -1523,7 +1527,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
|
||||
NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
|
||||
if (driverVersion >= 11080) {
|
||||
int compCap = comm->compCap;
|
||||
unsigned int clusterSize = (compCap == 90) ? comm->config.cgaClusterSize : 0;
|
||||
unsigned int clusterSize = (compCap >= 90) ? comm->config.cgaClusterSize : 0;
|
||||
|
||||
CUlaunchConfig launchConfig = {0};
|
||||
CUlaunchAttribute launchAttrs[3];
|
||||
@@ -1677,7 +1681,7 @@ static ncclResult_t updateCollCostTable(
|
||||
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1 && info->func != ncclFuncAllGather) continue;
|
||||
if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue;
|
||||
/* now we only support single-node NVLS allgather and reducescatter */
|
||||
if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && comm->nNodes > 1) continue;
|
||||
if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && (comm->nNodes > 1 || comm->nRanks > NCCL_MAX_NVLS_ARITY)) continue;
|
||||
/* Tree reduceScatter doesn't support scaling yet */
|
||||
if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter
|
||||
&& (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue;
|
||||
|
||||
@@ -22,7 +22,6 @@
|
||||
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) {
|
||||
int rank = comm->rank;
|
||||
int localRanks = comm->topo->nodes[GPU].count;
|
||||
int nvlsRanks = comm->MNNVL ? comm->clique.size : localRanks;
|
||||
int nChannels = comm->nChannels;
|
||||
|
||||
topoRanks->nvlsHeadNum = 0;
|
||||
@@ -77,7 +76,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
|
||||
// Get nvls heads and the number of heads. Duplicate head is not allowed.
|
||||
for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) {
|
||||
bool addHead = true;
|
||||
int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * nvlsRanks;
|
||||
int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks;
|
||||
|
||||
for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) {
|
||||
if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) {
|
||||
@@ -457,8 +456,6 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead
|
||||
channel->nvls.out = -1; // NVLS+SHARP not yet implemented.
|
||||
channel->nvls.headRank = headRank;
|
||||
channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
|
||||
channel->nvls.node = comm->node;
|
||||
channel->nvls.nNodes = comm->nNodes;
|
||||
if (comm->collNetSupport && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
|
||||
}
|
||||
if (comm->nNodes == 1) return ncclSuccess;
|
||||
@@ -853,7 +850,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
}
|
||||
|
||||
// Use 4 compute channels per search channel to reach peak BW on <8 PPN
|
||||
if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && 2*nChannels <= maxChannels) {
|
||||
if (comm->minCompCap >= 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && 2*nChannels <= maxChannels) {
|
||||
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
|
||||
}
|
||||
|
||||
|
||||
+29
-6
@@ -975,14 +975,37 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
int ncclTopoPathAllNVLink(struct ncclTopoSystem* system) {
|
||||
int minPath = PATH_DIS;
|
||||
ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min) {
|
||||
int minPath = PATH_SYS;
|
||||
for (int i=0; i<system->nodes[GPU].count; i++) {
|
||||
struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU];
|
||||
for (int j=0; j<system->nodes[GPU].count; j++) {
|
||||
if (i == j) continue;
|
||||
struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[type];
|
||||
if (paths == NULL) continue;
|
||||
for (int j=0; j<system->nodes[type].count; j++) {
|
||||
if (type == GPU && i == j) continue;
|
||||
minPath = std::min(minPath, paths[j].type);
|
||||
}
|
||||
}
|
||||
return minPath >= PATH_PIX ? 0 : 1;
|
||||
*min = minPath;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max) {
|
||||
int maxPath = PATH_LOC;
|
||||
for (int i=0; i<system->nodes[GPU].count; i++) {
|
||||
struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[type];
|
||||
if (paths == NULL) continue;
|
||||
for (int j=0; j<system->nodes[type].count; j++) {
|
||||
if (type == GPU && i == j) continue;
|
||||
maxPath = std::max(maxPath, paths[j].type);
|
||||
}
|
||||
}
|
||||
*max = maxPath;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink) {
|
||||
int maxPath;
|
||||
NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxPath));
|
||||
*allNvLink = maxPath >= PATH_PIX ? 0 : 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
+32
-15
@@ -988,15 +988,20 @@ float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0,
|
||||
float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
|
||||
#define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
|
||||
#define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
|
||||
#endif
|
||||
|
||||
float sm90SpeedArrayIntra[] = { 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 11.0, 6.0, 3.0 };
|
||||
float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
|
||||
#define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float))
|
||||
#define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float))
|
||||
#endif
|
||||
|
||||
RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0);
|
||||
|
||||
float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0 };
|
||||
float sm100SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
|
||||
#define NSPEEDSINTRA_SM100 (sizeof(sm100SpeedArrayIntra)/sizeof(float))
|
||||
#define NSPEEDSINTER_SM100 (sizeof(sm100SpeedArrayInter)/sizeof(float))
|
||||
|
||||
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
int crossNic = (system->nodes[NET].count > 1) &&
|
||||
@@ -1006,8 +1011,20 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
graph->crossNic = crossNic == 1 ? 1 : 0;
|
||||
graph->bwIntra = graph->bwInter = 0;
|
||||
graph->latencyInter = 0;
|
||||
graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
|
||||
graph->typeInter = PATH_PIX;
|
||||
int minTypeIntra = PATH_LOC, minTypeInter = PATH_PIX;
|
||||
int maxTypeIntra = PATH_SYS, maxTypeInter = PATH_SYS;
|
||||
if (ngpus > 1) {
|
||||
NCCLCHECK(ncclTopoGetGpuMinPath(system, GPU, &minTypeIntra));
|
||||
NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxTypeIntra));
|
||||
}
|
||||
if (system->nodes[NET].count > 0) {
|
||||
NCCLCHECK(ncclTopoGetGpuMinPath(system, NET, &minTypeInter));
|
||||
NCCLCHECK(ncclTopoGetGpuMaxPath(system, NET, &maxTypeInter));
|
||||
maxTypeIntra = maxTypeInter;
|
||||
}
|
||||
|
||||
graph->typeIntra = minTypeIntra;
|
||||
graph->typeInter = minTypeInter;
|
||||
graph->nChannels = 0;
|
||||
graph->nIntraChannels = 0;
|
||||
memset(graph->intraNets, 0, MAXCHANNELS*NCCL_TOPO_MAX_NODES*2*sizeof(int));
|
||||
@@ -1080,14 +1097,14 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess;
|
||||
// NVLS and COLLNET_DIRECT search must have ngpus heads at most.
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT)
|
||||
graph->maxChannels = system->nodes[GPU].count;
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) graph->maxChannels = std::min(NCCL_MAX_NVLS_ARITY, system->nodes[GPU].count);
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) graph->maxChannels = std::min(NCCL_MAX_DIRECT_ARITY+1, system->nodes[GPU].count);
|
||||
|
||||
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
|
||||
|
||||
if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
|
||||
// Force intra-node NVLS algorithm to pull evenly from all GPUs.
|
||||
graph->minChannels = graph->maxChannels = system->nodes[GPU].count;
|
||||
graph->minChannels = graph->maxChannels;
|
||||
}
|
||||
|
||||
struct ncclTopoGraph tmpGraph;
|
||||
@@ -1097,11 +1114,11 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
int nspeeds = 0;
|
||||
float* speedArray = NULL;
|
||||
if (system->nodes[NET].count == 0) {
|
||||
nspeeds = NSPEEDSINTRA;
|
||||
speedArray = speedArrayIntra;
|
||||
nspeeds = ccMin >= 100 ? NSPEEDSINTRA_SM100 : (ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA);
|
||||
speedArray = ccMin >= 100 ? sm100SpeedArrayIntra : (ccMin >= 90 ? sm90SpeedArrayIntra : speedArrayIntra);
|
||||
} else {
|
||||
nspeeds = NSPEEDSINTER;
|
||||
speedArray = speedArrayInter;
|
||||
nspeeds = ccMin >= 100 ? NSPEEDSINTER_SM100 : (ccMin >= 90 ? NSPEEDSINTER_SM90 : NSPEEDSINTER);
|
||||
speedArray = ccMin >= 100 ? sm100SpeedArrayInter : (ccMin >= 90 ? sm90SpeedArrayInter : speedArrayInter);
|
||||
}
|
||||
int pass = 1;
|
||||
int speedIndex = 0;
|
||||
@@ -1156,18 +1173,18 @@ search:
|
||||
}
|
||||
tmpGraph.pattern = graph->pattern;
|
||||
|
||||
int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
|
||||
if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
|
||||
int maxIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : maxTypeIntra;
|
||||
if (tmpGraph.typeIntra < maxIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
|
||||
tmpGraph.typeIntra += 1;
|
||||
goto search;
|
||||
}
|
||||
tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
|
||||
tmpGraph.typeIntra = minTypeIntra;
|
||||
|
||||
if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) {
|
||||
if (system->nodes[NET].count > 0 && tmpGraph.typeInter < maxTypeInter && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) {
|
||||
tmpGraph.typeInter += 1;
|
||||
goto search;
|
||||
}
|
||||
tmpGraph.typeInter = PATH_PIX;
|
||||
tmpGraph.typeInter = minTypeInter;
|
||||
|
||||
if (crossNic == 2 && tmpGraph.crossNic == 0
|
||||
&& (graph->pattern == NCCL_TOPO_PATTERN_RING || graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE)) {
|
||||
|
||||
+36
-44
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -1445,11 +1445,11 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) {
|
||||
static ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType,
|
||||
int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType) {
|
||||
int minType = PATH_DIS;
|
||||
float maxBw = 0;
|
||||
int count = 0;
|
||||
NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count));
|
||||
struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType];
|
||||
if (paths == NULL) { *localCount = 0; return ncclSuccess; }
|
||||
for (int i=0; i<system->nodes[resultType].count; i++) {
|
||||
@@ -1459,7 +1459,15 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index
|
||||
if (pathType) *pathType = minType;
|
||||
count = 0;
|
||||
}
|
||||
if (paths[i].bw == maxBw && paths[i].type == minType) (*locals)[count++] = i;
|
||||
if (paths[i].bw == maxBw && paths[i].type == minType) {
|
||||
if (count == NCCL_TOPO_MAX_NODES) {
|
||||
WARN("Error : ran out of room to store found nodes in ncclTopoGetLocal."
|
||||
" Filled %d of type %d, starting from index %d of type %d.",
|
||||
NCCL_TOPO_MAX_NODES, resultType, index, type);
|
||||
return ncclInternalError;
|
||||
}
|
||||
locals[count++] = i;
|
||||
}
|
||||
}
|
||||
*localCount = count;
|
||||
return ncclSuccess;
|
||||
@@ -1467,7 +1475,7 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index
|
||||
|
||||
ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count) {
|
||||
int localNetCount = 0, netCountByBw = 0;
|
||||
int* localNets;
|
||||
int localNets[NCCL_TOPO_MAX_NODES];
|
||||
float totalNetBw = 0, gpuBw = 0;
|
||||
|
||||
for (int l=0; l<system->nodes[GPU].nodes[gpu].nlinks; l++) {
|
||||
@@ -1479,69 +1487,55 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c
|
||||
}
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
|
||||
NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL));
|
||||
for (int l=0; (l < localNetCount) && (totalNetBw < gpuBw); l++, netCountByBw++) {
|
||||
totalNetBw += system->nodes[GPU].nodes[gpu].paths[NET][localNets[l]].bw;
|
||||
}
|
||||
*count = netCountByBw;
|
||||
|
||||
free(localNets);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int gpu;
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
|
||||
int* localNets = NULL;
|
||||
|
||||
int localNets[NCCL_TOPO_MAX_NODES];
|
||||
int localNetCount;
|
||||
NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
|
||||
int* localGpus = NULL;
|
||||
NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL));
|
||||
if (localNetCount==0) {
|
||||
WARN("Could not find any local path from gpu %d to net.", gpu);
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
int localGpus[NCCL_TOPO_MAX_NODES];
|
||||
int localGpuCount;
|
||||
int net = 0;
|
||||
if (localNetCount == 0) {
|
||||
*id = -1;
|
||||
free(localNets);
|
||||
return ncclSuccess;
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL), ret, fail);
|
||||
for (int i = 0; i < localGpuCount; i++) {
|
||||
if (gpu == localGpus[i]) {
|
||||
net = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL));
|
||||
|
||||
int net = system->nodes[GPU].nodes[gpu].gpu.dev;
|
||||
if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount);
|
||||
if (localNetCount == 0) {
|
||||
if (id) *id = -1;
|
||||
if (dev) *dev = -1;
|
||||
} else {
|
||||
net += channelId%(DIVUP(localNetCount,localGpuCount));
|
||||
if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
|
||||
if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev;
|
||||
}
|
||||
exit:
|
||||
free(localNets);
|
||||
if (localGpus) free(localGpus);
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
net += channelId%(DIVUP(localNetCount,localGpuCount));
|
||||
if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
|
||||
if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int netIndex;
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &netIndex));
|
||||
int* localGpus = NULL;
|
||||
|
||||
int localGpus[NCCL_TOPO_MAX_NODES];
|
||||
int localGpuCount;
|
||||
NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, localGpus, &localGpuCount, NULL));
|
||||
|
||||
int foundGpu = -1;
|
||||
NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL));
|
||||
for (int c=0; c<MAXCHANNELS; c++) {
|
||||
for (int lg=0; lg<localGpuCount; lg++) {
|
||||
int g = localGpus[lg];
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
int64_t id;
|
||||
NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL), ret, fail);
|
||||
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL));
|
||||
if (netId == id) {
|
||||
foundGpu = g;
|
||||
goto exit;
|
||||
@@ -1550,8 +1544,6 @@ ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, i
|
||||
}
|
||||
exit:
|
||||
*gpuIndex = foundGpu;
|
||||
fail:
|
||||
free(localGpus);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#define SM80_NVLINK_BW 20.0
|
||||
#define SM90_NVLINK_BW 20.6
|
||||
#define SM86_NVLINK_BW 12.0
|
||||
#define SM100_NVLINK_BW 40.0
|
||||
#define PCI_BW 12.0 // PCI Gen3 x16
|
||||
#define QPI_BW 6.0
|
||||
#define AMD_BW 16.0
|
||||
@@ -98,8 +99,8 @@ struct ncclTopoLink {
|
||||
float bw;
|
||||
struct ncclTopoNode* remNode;
|
||||
};
|
||||
#define NCCL_TOPO_MAX_LINKS 128
|
||||
|
||||
// Allows for up to 32 NICs per node on GB200-NVL72
|
||||
#define NCCL_TOPO_MAX_LINKS 576
|
||||
#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES)
|
||||
|
||||
struct ncclTopoLinkList {
|
||||
@@ -209,6 +210,8 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode
|
||||
ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank);
|
||||
ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min);
|
||||
ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max);
|
||||
|
||||
#define NCCL_TOPO_XML_MAX_NODES 8192
|
||||
#define NCCL_GRAPH_XML_MAX_NODES 8192
|
||||
@@ -279,6 +282,7 @@ static float ncclTopoXGMISpeed(const char* gcn) {
|
||||
// Returns NVLink bw in GB/s
|
||||
static float ncclTopoNVLinkBw(int cudaCompCap) {
|
||||
return
|
||||
cudaCompCap >= 100 ? SM100_NVLINK_BW :
|
||||
cudaCompCap >= 90 ? SM90_NVLINK_BW :
|
||||
cudaCompCap == 86 ? SM86_NVLINK_BW :
|
||||
cudaCompCap >= 80 ? SM80_NVLINK_BW :
|
||||
|
||||
+13
-6
@@ -350,28 +350,33 @@ static struct tuningModel rcclTuningModel[] = {
|
||||
#define VOLTA_COMPCAP_IDX 0
|
||||
#define AMPERE_COMPCAP_IDX 1
|
||||
#define HOPPER_COMPCAP_IDX 2
|
||||
#define BLACKWELL_COMPCAP_IDX 3
|
||||
|
||||
// LL128 max BW per channel
|
||||
static const double llMaxBws[3][3] = {
|
||||
static const double llMaxBws[][3] = {
|
||||
/* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4},
|
||||
/* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
|
||||
/* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0}
|
||||
/* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0},
|
||||
/* Blackwell-N1/AMD-N2/AMD-N4) */ {2*141.0, 2*45.0 /*avg of ring & tree*/, 2*35.0},
|
||||
};
|
||||
|
||||
static const double perChMaxRingLL128Bws[3][3] = {
|
||||
static const double perChMaxRingLL128Bws[][3] = {
|
||||
/* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
|
||||
/* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
|
||||
/* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7},
|
||||
/* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*36.7},
|
||||
};
|
||||
static const double perChMaxTreeLL128Bws[3][3] = {
|
||||
static const double perChMaxTreeLL128Bws[][3] = {
|
||||
/* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
|
||||
/* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
|
||||
/* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0},
|
||||
/* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*29.0},
|
||||
};
|
||||
static const double perChMaxTreeBws[3][3] = {
|
||||
static const double perChMaxTreeBws[][3] = {
|
||||
/* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0},
|
||||
/* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8},
|
||||
/* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
|
||||
/* Blackwell (N1/N2/N4) */ {2*38.7, 2*41.4, 2*36.0},
|
||||
};
|
||||
|
||||
NCCL_PARAM(PatEnable, "PAT_ENABLE", 2);
|
||||
@@ -422,7 +427,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
int nRanks = comm->nRanks;
|
||||
if (nRanks <= 1) return ncclSuccess;
|
||||
|
||||
int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX;
|
||||
int compCapIndex = minCompCap >= 100 ? BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX);
|
||||
int index2 = nNodes <= 2 ? nNodes-1 : 2;
|
||||
// LL: for single node, we look at GPU type; for multi-node, we look at CPU type
|
||||
int index1 = nNodes == 1 ? compCapIndex :
|
||||
@@ -666,6 +671,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
case 70: pEnable &= 1; break;
|
||||
case 80: pEnable &= 1; break;
|
||||
case 90: pEnable &= !(CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2); break;
|
||||
case 100: pEnable &= 1; break;
|
||||
case 120: pEnable &= 1; break;
|
||||
default: pEnable &= 0; break;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -228,14 +228,13 @@ static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) {
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
|
||||
static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, CUmemAllocationHandleType type, size_t size) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
size_t granularity = 0;
|
||||
CUdevice currentDev;
|
||||
CUmemAllocationProp prop = {};
|
||||
CUmemAccessDesc accessDesc = {};
|
||||
CUmemGenericAllocationHandle handle;
|
||||
CUmemAllocationHandleType type = ncclCuMemHandleType;
|
||||
int cudaDev;
|
||||
int flag = 0;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
@@ -284,7 +283,7 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) {
|
||||
|
||||
extern int ncclCuMemEnable();
|
||||
|
||||
static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) {
|
||||
static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, int type, size_t size) {
|
||||
WARN("CUMEM not supported prior to CUDA 11.3");
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
@@ -70,7 +70,7 @@ union ncclLLFifoLine {
|
||||
#define WARP_SIZE warpSize
|
||||
#define MAXCHANNELS 128
|
||||
#define CHANNEL_LIMIT 16
|
||||
#define NCCL_MAX_LOCAL_RANKS 64
|
||||
#define NCCL_MAX_LOCAL_RANKS 72
|
||||
#define NCCL_MAX_NTHREADS 256
|
||||
#define NCCL_MIN_NTHREADS (4*WARP_SIZE)
|
||||
#define NCCL_SIMPLE_MAX_NTHREADS NCCL_MAX_NTHREADS
|
||||
@@ -204,8 +204,6 @@ struct ncclNvls {
|
||||
int down;
|
||||
int treeUp;
|
||||
int treeDown[NCCL_MAX_NVLS_TREE_ARITY];
|
||||
int node;
|
||||
int nNodes;
|
||||
};
|
||||
|
||||
#if __CUDA_ARCH__ >= 900
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
#define NCCL_SIMPLE_ALIGNMENT (WARP_SIZE * 8LL * 16LL)
|
||||
#define NCCL_BYTES_ALIGNMENT 16
|
||||
|
||||
ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize);
|
||||
ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize);
|
||||
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
|
||||
ncclResult_t ncclLaunchPrepare(struct ncclComm* comm);
|
||||
ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
|
||||
|
||||
@@ -29,7 +29,8 @@ void ncclTopoFree(struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
|
||||
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
|
||||
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
|
||||
int ncclTopoPathAllNVLink(struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink);
|
||||
|
||||
ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm);
|
||||
|
||||
// Query topology
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_MNNVL_H_
|
||||
#define NCCL_MNNVL_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "comm.h"
|
||||
|
||||
ncclResult_t ncclMnnvlCheck(struct ncclComm* comm);
|
||||
|
||||
#endif
|
||||
+93
-18
@@ -36,19 +36,22 @@
|
||||
#define NVTX_SID_CommInitRankConfig 16 // same schema as NVTX_SID_CommInitRank
|
||||
#define NVTX_SID_CommInitRankScalable 17 // same schema as NVTX_SID_CommInitRank
|
||||
#define NVTX_SID_CommSplit 18
|
||||
#define NVTX_SID_CommFinalize 19
|
||||
|
||||
// Define static schema ID for the reduction operation.
|
||||
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
|
||||
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 19 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
|
||||
|
||||
extern const nvtxDomainHandle_t ncclNvtxDomainHandle;
|
||||
|
||||
struct nccl_domain{static constexpr char const* name{"NCCL"};};
|
||||
|
||||
/// @brief Register an NVTX payload schema for static-size payloads.
|
||||
class payload_schema {
|
||||
public:
|
||||
explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
|
||||
public:
|
||||
explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries,
|
||||
const uint64_t schemaId, const size_t size) noexcept
|
||||
{
|
||||
schema_attr.name = schemaName;
|
||||
schema_attr.payloadStaticSize = size;
|
||||
schema_attr.entries = entries;
|
||||
schema_attr.numEntries = numEntries;
|
||||
schema_attr.schemaId = schemaId;
|
||||
@@ -69,33 +72,105 @@ private:
|
||||
NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES |
|
||||
NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE |
|
||||
NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID,
|
||||
nullptr,
|
||||
nullptr, /* schema name is not needed */
|
||||
NVTX_PAYLOAD_SCHEMA_TYPE_STATIC,
|
||||
NVTX_PAYLOAD_SCHEMA_FLAG_NONE,
|
||||
nullptr, 0, 0, 0, 0, nullptr};
|
||||
};
|
||||
|
||||
// Convenience macro to give the payload parameters a scope.
|
||||
#define NVTX3_PAYLOAD(...) __VA_ARGS__
|
||||
|
||||
// Create NVTX push/pop range with parameters
|
||||
// @param name of the operation (see `NVTX_SID_*`)
|
||||
// @param N schema name
|
||||
// @param S schema (entries)
|
||||
// @param P payload (struct)
|
||||
// @param N NCCL API name without the `nccl` prefix.
|
||||
// @param T name of the used NVTX payload schema without "Schema" suffix.
|
||||
// @param P payload parameters/entries
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \
|
||||
nvtxPayloadData_t nvtx3_bpl__[] = { \
|
||||
{NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \
|
||||
roctx_scoped_range_in const roctx_range__{S, nvtx3_bpl__, std::extent<decltype(S)>::value, "RCCL_" #ID};
|
||||
#define NVTX3_FUNC_WITH_PARAMS(N, T, P) \
|
||||
constexpr uint64_t schemaId = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \
|
||||
static const payload_schema schema{T##Schema, std::extent<decltype(T##Schema)>::value - 1, \
|
||||
schemaId, sizeof(T)}; \
|
||||
const T _payload = {P}; \
|
||||
nvtxPayloadData_t nvtx3_bpl__[] = {{schemaId, sizeof(_payload), &_payload}}; \
|
||||
roctx_scoped_range_in const roctx_range__{T##Schema, nvtx3_bpl__, std::extent<decltype(T##Schema)>::value - 1, "RCCL_" #N};
|
||||
#else
|
||||
#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \
|
||||
static const payload_schema schema{S, std::extent<decltype(S)>::value, \
|
||||
NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \
|
||||
#define NVTX3_FUNC_WITH_PARAMS(N, T, P) \
|
||||
constexpr uint64_t schemaId = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \
|
||||
static const payload_schema schema{T##Schema, std::extent<decltype(T##Schema)>::value - 1, \
|
||||
schemaId, sizeof(T)}; \
|
||||
static ::nvtx3::v1::registered_string_in<nccl_domain> const nvtx3_func_name__{__func__}; \
|
||||
nvtxPayloadData_t nvtx3_bpl__[] = { \
|
||||
{NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \
|
||||
const T _payload = {P}; \
|
||||
nvtxPayloadData_t nvtx3_bpl__[] = {{schemaId, sizeof(_payload), &_payload}}; \
|
||||
::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \
|
||||
::nvtx3::v1::scoped_range_in<nccl_domain> const nvtx3_range__{nvtx3_func_attr__};
|
||||
#endif
|
||||
|
||||
/// @brief Creates an NVTX range with extended payload using the RAII pattern.
|
||||
/// @tparam PayloadType Data type of the payload.
|
||||
template <typename PayloadType>
|
||||
class ncclNvtxRange {
|
||||
public:
|
||||
explicit ncclNvtxRange(const nvtxEventAttributes_t* evtAttr) noexcept {
|
||||
nvtxDomainRangePushEx(nvtx3::domain::get<nccl_domain>(), evtAttr);
|
||||
}
|
||||
|
||||
~ncclNvtxRange() noexcept {
|
||||
if (payloadData.payload) {
|
||||
nvtxRangePopPayload(nvtx3::domain::get<nccl_domain>(), &payloadData, 1);
|
||||
} else {
|
||||
nvtxDomainRangePop(nvtx3::domain::get<nccl_domain>());
|
||||
}
|
||||
}
|
||||
|
||||
void setPayloadData(const uint64_t schemaId) noexcept
|
||||
{
|
||||
payloadData = {schemaId, sizeof(PayloadType), &payload};
|
||||
}
|
||||
|
||||
ncclNvtxRange() = delete;
|
||||
ncclNvtxRange(ncclNvtxRange const&) = default;
|
||||
ncclNvtxRange& operator=(ncclNvtxRange const&) = default;
|
||||
ncclNvtxRange(ncclNvtxRange&&) = default;
|
||||
ncclNvtxRange& operator=(ncclNvtxRange&&) = default;
|
||||
|
||||
// Holds the payload data.
|
||||
PayloadType payload{};
|
||||
|
||||
nvtxPayloadData_t payloadData = {NVTX_PAYLOAD_ENTRY_TYPE_INVALID, 0, NULL};
|
||||
};
|
||||
|
||||
// Create an NVTX range with the function name as the range name. Use RAII pattern.
|
||||
// @param T Type ID of the NVTX payload (pointer for variable-size payloads).
|
||||
#define NVTX3_RANGE(T) \
|
||||
static ::nvtx3::v1::registered_string_in<nccl_domain> const nvtx3_func_name__{__func__}; \
|
||||
::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \
|
||||
ncclNvtxRange<T> nvtx3_range__{nvtx3_func_attr__.get()};
|
||||
|
||||
// Add static-size payload to the NVTX range created with `NVTX3_RANGE()`,
|
||||
// which must be in this or an outer scope.
|
||||
// @param N NCCL API name without the `nccl` prefix.
|
||||
// @param S name of the used NVTX payload schema.
|
||||
// @param P payload parameters/entries
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
#define NVTX3_RANGE_ADD_PAYLOAD(N, S, P) do { \
|
||||
constexpr uint64_t schema_id = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \
|
||||
static const payload_schema schema{S, std::extent<decltype(S)>::value - 1, schema_id, \
|
||||
sizeof(nvtx3_range__.payload)}; \
|
||||
nvtx3_range__.payload = {P}; \
|
||||
nvtx3_range__.setPayloadData(schema_id); \
|
||||
nvtxPayloadData_t nvtx3_bpl__[] = {{schema_id, sizeof(nvtx3_range__.payloadData), &nvtx3_range__.payloadData}}; \
|
||||
roctx_scoped_range_in const roctx_range__{S, nvtx3_bpl__, std::extent<decltype(S)>::value - 1, "RCCL_" #N}; \
|
||||
} while (0)
|
||||
#else
|
||||
#define NVTX3_RANGE_ADD_PAYLOAD(N, S, P) do { \
|
||||
constexpr uint64_t schema_id = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \
|
||||
static const payload_schema schema{S, std::extent<decltype(S)>::value - 1, schema_id, \
|
||||
sizeof(nvtx3_range__.payload)}; \
|
||||
nvtx3_range__.payload = {P}; \
|
||||
nvtx3_range__.setPayloadData(schema_id); \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
extern void initNvtxRegisteredEnums();
|
||||
|
||||
#endif
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
|
||||
/* This is just an empty marker (for readability), which can be omitted. */
|
||||
/* TODO: Fix issue with trailing comma at end of entry list. */
|
||||
#define NVTX_PAYLOAD_ENTRIES
|
||||
#define NCCL_NVTX_PAYLOAD_ENTRIES
|
||||
|
||||
|
||||
/**
|
||||
@@ -32,7 +32,7 @@
|
||||
*
|
||||
* Example:
|
||||
* NVTX_DEFINE_SCHEMA_FOR_STRUCT(your_struct, "SchemaName",
|
||||
* NVTX_PAYLOAD_ENTRIES(
|
||||
* NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
* (index, TYPE_INT, "integer value"),
|
||||
* (dpfloat, TYPE_DOUBLE, "fp64 value"),
|
||||
* (text, TYPE_CSTRING, "text", NULL, 24)
|
||||
@@ -80,7 +80,7 @@
|
||||
*
|
||||
* Example:
|
||||
* NVTX_DEFINE_STRUCT_WITH_SCHEMA(your_struct_name, "Your schema name",
|
||||
* NVTX_PAYLOAD_ENTRIES(
|
||||
* NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
* (int, index, TYPE_INT, "integer value"),
|
||||
* (double, dpfloat, TYPE_DOUBLE, "fp64 value"),
|
||||
* (const char, (text, 24), TYPE_CSTRING, "text", NULL, 24)
|
||||
|
||||
@@ -0,0 +1,178 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
/// Definitions of NVTX payload types and schemas used for the NVTX
|
||||
/// instrumentation in init.cc and collectives.cc.
|
||||
|
||||
#ifndef NVTX_PAYLOAD_SCHEMAS_H_
|
||||
#define NVTX_PAYLOAD_SCHEMAS_H_
|
||||
|
||||
|
||||
#include "nccl.h"
|
||||
#include "nvtx3/nvToolsExtPayload.h"
|
||||
#include "nvtx3/nvToolsExtPayloadHelper.h"
|
||||
|
||||
/**
|
||||
* \brief Define a C struct together with the matching schema entries.
|
||||
*
|
||||
* Does the same as `NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA`, but without creating the
|
||||
* schema attributes. (Remove this helper when it is available in the NVTX headers.)
|
||||
*
* \param struct_id Name of the struct type to define.
* \param prefix    Specifiers placed in front of the schema-entries definition
*                  (the definitions in this header pass `static constexpr`).
* \param entries   Parenthesized list of payload entries; `NCCL_NVTX_PAYLOAD_ENTRIES(...)`
*                  can be used as a readability marker around it.
*
* NOTE(review): the name of the generated schema-entries object comes from
* `_NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES` in the NVTX helper header - presumably
* `<struct_id>Schema`; confirm there.
*/
|
||||
#define NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(struct_id, prefix, entries) \
|
||||
_NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
|
||||
prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries)
|
||||
|
||||
// C strings used as NVTX payload entry names.
// Each label is defined once here so that every schema using the same kind of entry
// (communicator ID, rank, message size, ...) shows an identical name.
// NOTE(review): `nccl_nvtxReductionOpStrpStr` looks like a typo for
// `nccl_nvtxReductionOpStr`; renaming it requires updating every schema that uses it.
|
||||
static constexpr char const* nccl_nvtxCommStr = "NCCL communicator ID";
|
||||
static constexpr char const* nccl_nvtxCudaDevStr = "CUDA device";
|
||||
static constexpr char const* nccl_nvtxRankStr = "Rank";
|
||||
static constexpr char const* nccl_nvtxNranksStr = "No. of ranks";
|
||||
static constexpr char const* nccl_nvtxMsgSizeStr = "Message size [bytes]";
|
||||
static constexpr char const* nccl_nvtxMsgSizeSendStr = "Message size [bytes] (Send)";
|
||||
static constexpr char const* nccl_nvtxMsgSizeRecvStr = "Message size [bytes] (Recv)";
|
||||
static constexpr char const* nccl_nvtxReductionOpStrpStr = "Reduction operation";
|
||||
static constexpr char const* nccl_nvtxDataTypeStr = "Data type";
|
||||
|
||||
// Payload layout `NcclNvtxParamsCommInitAll`: communicator ID (uint64) and the
// number of devices.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommInitAll, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, commhash, TYPE_UINT64, nccl_nvtxCommStr),
|
||||
(int, ndev, TYPE_INT, "No. of devices")
|
||||
)
|
||||
)
|
||||
|
||||
// Payload layout `NcclNvtxParamsCommInitRank`: new communicator ID, number of ranks,
// this rank, and the CUDA device, in that field order.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommInitRank, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr),
|
||||
(int, nranks, TYPE_INT, nccl_nvtxNranksStr),
|
||||
(int, myrank, TYPE_INT, nccl_nvtxRankStr),
|
||||
(int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr)
|
||||
)
|
||||
)
|
||||
// The typedef and payload schema for ncclCommInitRank are also used for
|
||||
// ncclCommInitRankConfig, ncclCommInitRankScalable, ncclCommDestroy, and ncclCommAbort.
|
||||
typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommInitRankConfig;
|
||||
typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommInitRankScalable;
|
||||
typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommAbort;
|
||||
typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommDestroy;
|
||||
|
||||
// Payload layout `NcclNvtxParamsCommSplit`: new and parent communicator IDs, followed
// by number of ranks, rank, CUDA device, and the split's color and key.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommSplit, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr),
|
||||
(uint64_t, parentcomm, TYPE_UINT64, "Parent NCCL communicator ID"),
|
||||
(int, nranks, TYPE_INT, nccl_nvtxNranksStr),
|
||||
(int, myrank, TYPE_INT, nccl_nvtxRankStr),
|
||||
(int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr),
|
||||
(int, color, TYPE_INT, "Color"),
|
||||
(int, key, TYPE_INT, "Key")
|
||||
)
|
||||
)
|
||||
|
||||
// Payload layout `NcclNvtxParamsCommFinalize`: a single communicator ID.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommFinalize, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr)
|
||||
)
|
||||
)
|
||||
|
||||
// Payload layout `NcclNvtxParamsAllGather`: communicator ID, message size in bytes,
// and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllGather, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
|
||||
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
|
||||
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
|
||||
)
|
||||
)
|
||||
|
||||
// Payload layout `NcclNvtxParamsAllReduce`: communicator ID, message size in bytes,
// reduction operation, and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllReduce, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
|
||||
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
|
||||
(ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr),
|
||||
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
|
||||
)
|
||||
)
|
||||
|
||||
// Payload layout `NcclNvtxParamsAllToAll`: communicator ID, message size in bytes,
// and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllToAll, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
|
||||
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
|
||||
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
|
||||
)
|
||||
)
|
||||
|
||||
// Payload layout `NcclNvtxParamsAllToAllv`: communicator ID, separate send and receive
// message sizes in bytes, and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllToAllv, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
|
||||
(size_t, sendBytes, TYPE_SIZE, nccl_nvtxMsgSizeSendStr),
|
||||
(size_t, recvBytes, TYPE_SIZE, nccl_nvtxMsgSizeRecvStr),
|
||||
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
|
||||
)
|
||||
)
|
||||
|
||||
// Payload layout `NcclNvtxParamsBroadcast`: communicator ID, message size in bytes,
// root, and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsBroadcast, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
|
||||
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
|
||||
(int, root, TYPE_INT, "Root"),
|
||||
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
|
||||
)
|
||||
)
|
||||
|
||||
// Payload layout `NcclNvtxParamsGather`: communicator ID, message size in bytes,
// root, and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsGather, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
|
||||
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
|
||||
(int, root, TYPE_INT, "Root"),
|
||||
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
|
||||
)
|
||||
)
|
||||
|
||||
// Payload layout `NcclNvtxParamsReduce`: communicator ID, message size in bytes,
// root, reduction operation, and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsReduce, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
|
||||
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
|
||||
(int, root, TYPE_INT, "Root"),
|
||||
(ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr),
|
||||
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
|
||||
)
|
||||
)
|
||||
|
||||
// Payload layout `NcclNvtxParamsReduceScatter`: communicator ID, message size in
// bytes, reduction operation, and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsReduceScatter, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
|
||||
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
|
||||
(ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr),
|
||||
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
|
||||
)
|
||||
)
|
||||
|
||||
// Payload layout `NcclNvtxParamsScatter`: communicator ID, message size in bytes,
// root, and data type.
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsScatter, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
|
||||
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
|
||||
(int, root, TYPE_INT, "Root"),
|
||||
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
|
||||
)
|
||||
)
|
||||
|
||||
// Used in NCCL APIs `ncclSend` and `ncclRecv`.
// Fields: communicator ID, message size in bytes, peer rank, and data type.
|
||||
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsSendRecv, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
|
||||
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
|
||||
(int, peer, TYPE_INT, "Peer rank"),
|
||||
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
|
||||
)
|
||||
)
|
||||
|
||||
// Payload layout `NcclNvtxParamsMSCCL`: communicator ID, message size in bytes,
// reduction operation, and data type (same fields as `NcclNvtxParamsAllReduce`).
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsMSCCL, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr),
|
||||
(size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr),
|
||||
(ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr),
|
||||
(ncclDataType_t, datatype, TYPE_DATATYPE, nccl_nvtxDataTypeStr)
|
||||
)
|
||||
)
|
||||
|
||||
#endif // end include guard
|
||||
@@ -13,7 +13,10 @@ struct nccl_domain{static constexpr char const* name{"NCCL"};};
|
||||
|
||||
#define NVTX3_FUNC_RANGE_IN(domain)
|
||||
#define nvtxNameOsThreadA(syscall, thread)
|
||||
#define NVTX3_FUNC_WITH_PARAMS(ID, S, P)
|
||||
#define NVTX3_FUNC_WITH_PARAMS(N, T, P)
|
||||
#define NVTX3_PAYLOAD(...) __VA_ARGS__
|
||||
#define NVTX3_RANGE(T)
|
||||
#define NVTX3_RANGE_ADD_PAYLOAD(N, S, P)
|
||||
|
||||
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11
|
||||
|
||||
|
||||
@@ -380,6 +380,8 @@ ncclResult_t ncclProxyStart(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS);
|
||||
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn);
|
||||
|
||||
// NB: ncclProxyMsgTypeStr[] in proxy.cc needs to match
|
||||
enum ncclProxyMsgType {
|
||||
ncclProxyMsgInit = 1,
|
||||
ncclProxyMsgSharedInit = 2,
|
||||
|
||||
@@ -70,6 +70,9 @@ DECLARE_ROCM_PFN_EXTERN(hsa_status_string);
|
||||
extern int ncclCuMemEnable();
|
||||
extern int ncclCuMemHostEnable();
|
||||
|
||||
// Handle type used for cuMemCreate()
|
||||
extern CUmemAllocationHandleType ncclCuMemHandleType;
|
||||
|
||||
ncclResult_t rocmLibraryInit(void);
|
||||
|
||||
extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit()
|
||||
|
||||
+81
-134
@@ -25,6 +25,7 @@
|
||||
#endif
|
||||
#include "tuner.h"
|
||||
#include "ras.h"
|
||||
#include "mnnvl.h"
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
@@ -39,6 +40,7 @@
|
||||
#include "graph/xml.h"
|
||||
#include "archinfo.h"
|
||||
#include "param.h"
|
||||
#include "nvtx_payload_schemas.h"
|
||||
|
||||
// [RCCL]
|
||||
#include "git_version.h"
|
||||
@@ -437,6 +439,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
free(comm->rankToNode);
|
||||
free(comm->rankToLocalRank);
|
||||
free(comm->collNetHeads);
|
||||
free(comm->clique.ranks);
|
||||
|
||||
if (comm->bootstrap)
|
||||
NCCLCHECK(bootstrapClose(comm->bootstrap));
|
||||
@@ -861,6 +864,7 @@ static void showVersion() {
|
||||
}
|
||||
}
|
||||
|
||||
NCCL_PARAM(MNNVLUUID, "MNNVL_UUID", -1);
|
||||
NCCL_PARAM(MNNVLCliqueId, "MNNVL_CLIQUE_ID", -1);
|
||||
|
||||
static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) {
|
||||
@@ -916,12 +920,16 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
|
||||
info->fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED;
|
||||
(void) ncclNvmlDeviceGetGpuFabricInfoV(nvmlDev, &info->fabricInfo);
|
||||
if (info->fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED) {
|
||||
if (ncclParamMNNVLUUID() != -1) {
|
||||
((long*)&info->fabricInfo.clusterUuid)[0] = ncclParamMNNVLUUID();
|
||||
((long*)&info->fabricInfo.clusterUuid)[1] = ncclParamMNNVLUUID();
|
||||
}
|
||||
if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId();
|
||||
INFO(NCCL_INIT, "MNNVL busId 0x%lx fabric UUID %lx.%lx cliqueId 0x%x state %d healthMask 0x%x",
|
||||
info->busId,
|
||||
((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1],
|
||||
info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask);
|
||||
}
|
||||
if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId();
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -991,71 +999,6 @@ NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0);
|
||||
// MNNVL: Flag to indicate whether to enable Multi-Node NVLink
|
||||
NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2);
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
|
||||
#include <cuda.h>
|
||||
#include "cudawrap.h"
|
||||
|
||||
// Determine if MNNVL support is available
|
||||
static int checkMNNVL(struct ncclComm* comm) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
// MNNVL requires cuMem to be enabled
|
||||
if (!ncclCuMemEnable()) return 0;
|
||||
|
||||
// MNNVL also requires FABRIC handle support
|
||||
int cudaDev;
|
||||
int flag = 0;
|
||||
CUdevice currentDev;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
CUCHECK(cuDeviceGet(¤tDev, cudaDev));
|
||||
// Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported
|
||||
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));;
|
||||
if (!flag) return 0;
|
||||
// Check that all ranks have initialized the fabric fully
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
if (comm->peerInfo[i].fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED) return 0;
|
||||
}
|
||||
|
||||
// Determine our MNNVL domain/clique
|
||||
NCCLCHECKGOTO(ncclCalloc(&comm->clique.ranks, comm->nRanks), ret, fail);
|
||||
comm->clique.id = comm->peerInfo[comm->rank].fabricInfo.cliqueId;
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[comm->rank].fabricInfo;
|
||||
nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo;
|
||||
// Check if the cluster UUID and cliqueId match
|
||||
// A zero UUID means we don't have MNNVL fabric info - disable MNNVL
|
||||
if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) goto fail;
|
||||
if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
|
||||
(fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
|
||||
if (i == comm->rank) {
|
||||
comm->cliqueRank = comm->clique.size;
|
||||
}
|
||||
comm->clique.ranks[comm->clique.size++] = i;
|
||||
}
|
||||
}
|
||||
// Determine whether to enable MNNVL or not
|
||||
comm->MNNVL = ncclParamMNNVLEnable() == 2 ? comm->clique.size > 1 : ncclParamMNNVLEnable();
|
||||
INFO(NCCL_INIT, "MNNVL %d cliqueId %x cliqueSize %d cliqueRank %d ", comm->MNNVL, comm->clique.id, comm->clique.size, comm->cliqueRank);
|
||||
|
||||
if (comm->MNNVL) {
|
||||
// Force the CUMEM handle type to be FABRIC for MNNVL
|
||||
ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC;
|
||||
}
|
||||
|
||||
return comm->MNNVL;
|
||||
|
||||
fail:
|
||||
if (comm->clique.ranks) free(comm->clique.ranks);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else
|
||||
// Fallback used when CUDART_VERSION < 11030: MNNVL is always reported as unavailable
// and `comm` is left untouched.
static int checkMNNVL(struct ncclComm* comm) {
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#define TIMER_INIT_TOTAL 0
|
||||
#define TIMER_INIT_KERNELS 1
|
||||
#define TIMER_INIT_BOOTSTRAP 2
|
||||
@@ -1143,12 +1086,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
// AllGather1 - end
|
||||
timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER];
|
||||
|
||||
// MNNVL support
|
||||
if (nNodes > 1 && !checkMNNVL(comm) && ncclParamMNNVLEnable() == 1) {
|
||||
// Return an error if the user specifically requested MNNVL support
|
||||
WARN("MNNVL is not supported on this system");
|
||||
ret = ncclSystemError;
|
||||
goto fail;
|
||||
// Check for MNNVL support
|
||||
if ((nNodes > 1 && ncclParamMNNVLEnable() != 0) || ncclParamMNNVLEnable() == 1) {
|
||||
NCCLCHECKGOTO(ncclMnnvlCheck(comm), ret, fail);
|
||||
}
|
||||
|
||||
do {
|
||||
@@ -1559,7 +1499,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
comm->collNetSupport = 0;
|
||||
}
|
||||
}
|
||||
comm->isAllNvlink = ncclTopoPathAllNVLink(comm->topo);
|
||||
NCCLCHECK(ncclTopoPathAllNVLink(comm->topo, &comm->isAllNvlink));
|
||||
comm->isOneRPN = (comm->maxLocalRanks == 1);
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
|
||||
@@ -1917,6 +1857,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
int cudaDev = job->cudaDev;
|
||||
int* parentRanks = NULL;
|
||||
int cudaArch;
|
||||
int maxSharedMem = 0;
|
||||
double sum_timers = 0;
|
||||
uint64_t timers[TIMERS_INIT_COUNT] = {0};
|
||||
unsigned long long commIdHash;
|
||||
@@ -1925,12 +1866,13 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
|
||||
timers[TIMER_INIT_TOTAL] = clockNano();
|
||||
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
|
||||
CUDACHECKGOTO(cudaDeviceGetAttribute(&maxSharedMem, cudaDevAttrMaxSharedMemoryPerBlockOptin, cudaDev), res, fail);
|
||||
CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev), res, fail);
|
||||
CUDACHECKGOTO(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev), res, fail);
|
||||
cudaArch = 100*archMajor + 10*archMinor;
|
||||
|
||||
timers[TIMER_INIT_KERNELS] = clockNano();
|
||||
NCCLCHECK(ncclInitKernelsForDevice(cudaArch, &maxLocalSizeBytes));
|
||||
NCCLCHECK(ncclInitKernelsForDevice(cudaArch, maxSharedMem, &maxLocalSizeBytes));
|
||||
// Set the maximum kernel stack size of all kernels to avoid
|
||||
// a CUDA memory reconfig on load (c.f. NVSHMEM issue)
|
||||
#ifdef USE_INDIRECT_FUNCTION_CALL
|
||||
@@ -2119,18 +2061,24 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
|
||||
if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) {
|
||||
comm->config.cgaClusterSize = cgaClusterSizeEnv;
|
||||
} else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) {
|
||||
WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE);
|
||||
INFO(NCCL_ENV, "NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE);
|
||||
comm->config.cgaClusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
|
||||
}
|
||||
|
||||
minCTAsEnv = ncclParamMinCTAs();
|
||||
if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
|
||||
comm->config.minCTAs = minCTAsEnv;
|
||||
if (minCTAsEnv <= 0)
|
||||
INFO(NCCL_ENV, "NCCL_MIN_CTAS %d is too low, leaving it set at %d", minCTAsEnv, comm->config.minCTAs);
|
||||
else
|
||||
comm->config.minCTAs = minCTAsEnv;
|
||||
}
|
||||
|
||||
maxCTAsEnv = ncclParamMaxCTAs();
|
||||
if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
|
||||
comm->config.maxCTAs = maxCTAsEnv;
|
||||
if (maxCTAsEnv <= 0)
|
||||
INFO(NCCL_ENV, "NCCL_MAX_CTAS %d is too low, leaving it set at %d", maxCTAsEnv, comm->config.maxCTAs);
|
||||
else
|
||||
comm->config.maxCTAs = maxCTAsEnv;
|
||||
}
|
||||
|
||||
envNetName = ncclGetEnv("NCCL_NET");
|
||||
@@ -2151,22 +2099,22 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
|
||||
|
||||
/* cap channels if needed */
|
||||
if (comm->config.minCTAs > MAXCHANNELS) {
|
||||
WARN("minCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.minCTAs, MAXCHANNELS, MAXCHANNELS);
|
||||
INFO(NCCL_ENV, "minCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.minCTAs, MAXCHANNELS, MAXCHANNELS);
|
||||
comm->config.minCTAs = MAXCHANNELS;
|
||||
}
|
||||
|
||||
if (comm->config.maxCTAs > MAXCHANNELS) {
|
||||
WARN("maxCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.maxCTAs, MAXCHANNELS, MAXCHANNELS);
|
||||
INFO(NCCL_ENV, "maxCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.maxCTAs, MAXCHANNELS, MAXCHANNELS);
|
||||
comm->config.maxCTAs = MAXCHANNELS;
|
||||
}
|
||||
|
||||
if (comm->config.minCTAs > comm->config.maxCTAs) {
|
||||
WARN("minCTAs %d is larger than maxCTAs %d, set both to %d", comm->config.minCTAs, comm->config.maxCTAs, comm->config.maxCTAs);
|
||||
INFO(NCCL_ENV, "minCTAs %d is larger than maxCTAs %d, set both to %d", comm->config.minCTAs, comm->config.maxCTAs, comm->config.maxCTAs);
|
||||
comm->config.minCTAs = comm->config.maxCTAs;
|
||||
}
|
||||
|
||||
if (comm->config.splitShare != 1 && comm->config.splitShare != 0) {
|
||||
WARN("splitShare %d is not a valid value 0/1, set it to 0", comm->config.splitShare);
|
||||
INFO(NCCL_ENV, "splitShare %d is not a valid value 0/1, set it to 0", comm->config.splitShare);
|
||||
comm->config.splitShare = 0;
|
||||
}
|
||||
|
||||
@@ -2352,21 +2300,10 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
// NVTX payload for the communicator init/destroy ranges: rank, number of ranks and
// CUDA device, stored as three consecutive ints.
struct NvtxParamsCommInitRank
|
||||
{
|
||||
int rank;
|
||||
int nranks;
|
||||
int cudaDev;
|
||||
};
|
||||
// Schema describing NvtxParamsCommInitRank. The first entry gives no explicit offset
// (its trailing fields are left value-initialized); the others use offsetof().
constexpr nvtxPayloadSchemaEntry_t CommInitRankSchema[] = {
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of ranks", nullptr, 0, offsetof(NvtxParamsCommInitRank, nranks)},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommInitRank, cudaDev)},
|
||||
};
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
|
||||
ncclResult_t ncclCommInitRank_impl(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
|
||||
NCCLCHECK(Recorder::instance().record(rrCommInitRank, nranks, myrank, &commId));
|
||||
NVTX3_RANGE(NcclNvtxParamsCommInitRank)
|
||||
// Load the CUDA driver and dlsym hooks (can fail on old drivers)
|
||||
rocmLibraryInit();
|
||||
|
||||
@@ -2374,10 +2311,11 @@ ncclResult_t ncclCommInitRank_impl(ncclComm_t* newcomm, int nranks, ncclUniqueId
|
||||
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
|
||||
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
|
||||
NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload)
|
||||
|
||||
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, 1, &commId, myrank, cudaDev, &config, __func__));
|
||||
|
||||
NVTX3_RANGE_ADD_PAYLOAD(CommInitRank, NcclNvtxParamsCommInitRankSchema,
|
||||
NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev));
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -2390,10 +2328,7 @@ ncclResult_t ncclCommInitAll_impl(ncclComm_t* comms, int ndev, const int* devlis
|
||||
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
|
||||
int oldDev = 0;
|
||||
|
||||
constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = {
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"}
|
||||
};
|
||||
NVTX3_FUNC_WITH_PARAMS(CommInitAll, CommInitAllSchema, ndev)
|
||||
NVTX3_RANGE(NcclNvtxParamsCommInitAll);
|
||||
|
||||
// Load the CUDA driver and dlsym hooks (can fail on old drivers)
|
||||
rocmLibraryInit();
|
||||
@@ -2431,14 +2366,17 @@ ncclResult_t ncclCommInitAll_impl(ncclComm_t* comms, int ndev, const int* devlis
|
||||
|
||||
ncclUniqueId uniqueId;
|
||||
NCCLCHECKGOTO(ncclGetUniqueId(&uniqueId), ret, fail);
|
||||
NCCLCHECKGOTO(ncclGroupStart(), ret, fail);
|
||||
NCCLCHECKGOTO(ncclGroupStartInternal(), ret, fail);
|
||||
for (int i=0; i<ndev; i++) {
|
||||
// Ignore return codes .. we need to call ncclGroupEnd to clean up anyway
|
||||
int dev = devlist ? devlist[i] : i;
|
||||
CUDACHECKGOTO(cudaSetDevice(dev), ret, fail);
|
||||
ncclCommInitRankDev(comms+i, ndev,1, &uniqueId, i, dev, &config, __func__);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclGroupEnd(), ret, fail);
|
||||
NCCLCHECKGOTO(ncclGroupEndInternal(), ret, fail);
|
||||
|
||||
NVTX3_RANGE_ADD_PAYLOAD(CommInitAll, NcclNvtxParamsCommInitAllSchema,
|
||||
NVTX3_PAYLOAD(comms[0]->commHash, ndev));
|
||||
|
||||
exit:
|
||||
(void)cudaSetDevice(oldDev);
|
||||
@@ -2465,14 +2403,14 @@ ncclResult_t ncclCommInitRankConfig_impl(ncclComm_t *newcomm, int nranks, ncclUn
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
|
||||
ncclConfig_t *internalConfigPtr = NULL;
|
||||
|
||||
NVTX3_RANGE(NcclNvtxParamsCommInitRankConfig);
|
||||
|
||||
NCCLCHECK(ncclGroupStartInternal());
|
||||
|
||||
rocmLibraryInit();
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
|
||||
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
|
||||
NVTX3_FUNC_WITH_PARAMS(CommInitRankConfig, CommInitRankSchema, payload)
|
||||
|
||||
if (config == NULL)
|
||||
internalConfigPtr = &internalConfig;
|
||||
else
|
||||
@@ -2482,7 +2420,13 @@ ncclResult_t ncclCommInitRankConfig_impl(ncclComm_t *newcomm, int nranks, ncclUn
|
||||
exit:
|
||||
ncclGroupErrCheck(ret);
|
||||
NCCLCHECK(ncclGroupEndInternal());
|
||||
if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret);
|
||||
if (newcomm && *newcomm) {
|
||||
if (!(*newcomm)->config.blocking) {
|
||||
(void) ncclCommGetAsyncError(*newcomm, &ret);
|
||||
}
|
||||
NVTX3_RANGE_ADD_PAYLOAD(CommInitRankConfig, NcclNvtxParamsCommInitRankSchema,
|
||||
NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev));
|
||||
}
|
||||
return ret;
|
||||
fail:
|
||||
if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret);
|
||||
@@ -2491,6 +2435,8 @@ fail:
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommInitRankScalable, ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config);
|
||||
ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config) {
|
||||
NVTX3_RANGE(NcclNvtxParamsCommInitRankScalable);
|
||||
|
||||
int cudaDev;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
|
||||
@@ -2500,9 +2446,6 @@ ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myran
|
||||
rocmLibraryInit();
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
|
||||
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
|
||||
NVTX3_FUNC_WITH_PARAMS(CommInitRankScalable, CommInitRankSchema, payload)
|
||||
|
||||
if (config == NULL)
|
||||
internalConfigPtr = &internalConfig;
|
||||
else
|
||||
@@ -2512,7 +2455,13 @@ ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myran
|
||||
exit:
|
||||
ncclGroupErrCheck(ret);
|
||||
NCCLCHECK(ncclGroupEndInternal());
|
||||
if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret);
|
||||
if (newcomm && *newcomm) {
|
||||
if (!(*newcomm)->config.blocking) {
|
||||
(void) ncclCommGetAsyncError(*newcomm, &ret);
|
||||
}
|
||||
NVTX3_RANGE_ADD_PAYLOAD(CommInitRankScalable, NcclNvtxParamsCommInitRankSchema,
|
||||
NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev));
|
||||
}
|
||||
return ret;
|
||||
fail:
|
||||
if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret);
|
||||
@@ -2590,7 +2539,8 @@ static ncclResult_t commCleanup(ncclComm_t comm) {
|
||||
NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm);
|
||||
ncclResult_t ncclCommFinalize_impl(ncclComm_t comm) {
|
||||
NCCLCHECK(Recorder::instance().record(rrCommFinalize, comm));
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
NVTX3_RANGE(NcclNvtxParamsCommFinalize);
|
||||
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclCommFinalizeAsyncJob *job = NULL;
|
||||
|
||||
@@ -2615,7 +2565,13 @@ ncclResult_t ncclCommFinalize_impl(ncclComm_t comm) {
|
||||
exit:
|
||||
ncclGroupErrCheck(ret);
|
||||
NCCLCHECK(ncclGroupEndInternal());
|
||||
if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); }
|
||||
if (comm) {
|
||||
if (!comm->config.blocking) {
|
||||
NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
|
||||
}
|
||||
NVTX3_RANGE_ADD_PAYLOAD(CommFinalize, NcclNvtxParamsCommFinalizeSchema,
|
||||
NVTX3_PAYLOAD(comm->commHash));
|
||||
}
|
||||
return ret;
|
||||
fail:
|
||||
free(job);
|
||||
@@ -2711,8 +2667,8 @@ ncclResult_t ncclCommDestroy_impl(ncclComm_t comm) {
|
||||
struct ncclCommFinalizeAsyncJob *job = NULL;
|
||||
ncclResult_t res = ncclSuccess;
|
||||
|
||||
NvtxParamsCommInitRank payload{rank, nranks, cudaDev};
|
||||
NVTX3_FUNC_WITH_PARAMS(CommDestroy, CommInitRankSchema, payload)
|
||||
NVTX3_FUNC_WITH_PARAMS(CommDestroy, NcclNvtxParamsCommInitRank,
|
||||
NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev));
|
||||
|
||||
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
|
||||
NCCLCHECK(ncclGroupStartInternal());
|
||||
@@ -2740,8 +2696,9 @@ fail:
|
||||
NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
|
||||
ncclResult_t ncclCommAbort_impl(ncclComm_t comm) {
|
||||
NCCLCHECK(Recorder::instance().record(rrCommAbort, comm));
|
||||
NVTX3_RANGE(NcclNvtxParamsCommAbort);
|
||||
|
||||
if (comm == NULL) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
return ncclSuccess;
|
||||
}
|
||||
NCCLCHECK(ncclGroupStartInternal());
|
||||
@@ -2762,8 +2719,8 @@ ncclResult_t ncclCommAbort_impl(ncclComm_t comm) {
|
||||
struct ncclCommFinalizeAsyncJob *job = NULL;
|
||||
ncclResult_t res = ncclSuccess;
|
||||
|
||||
NvtxParamsCommInitRank payload{rank, nranks, cudaDev};
|
||||
NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload)
|
||||
NVTX3_RANGE_ADD_PAYLOAD(CommAbort, NcclNvtxParamsCommInitRankSchema,
|
||||
NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev));
|
||||
|
||||
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
|
||||
|
||||
@@ -2779,29 +2736,13 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
// NVTX payload for the ncclCommSplit range: rank, number of ranks, CUDA device, and
// the color/key arguments of the split, stored as five consecutive ints.
struct NvtxParamsCommSplit {
|
||||
int rank;
|
||||
int nranks;
|
||||
int cudaDev;
|
||||
int color;
|
||||
int key;
|
||||
};
|
||||
// Schema describing NvtxParamsCommSplit. The first entry gives no explicit offset
// (its trailing fields are left value-initialized); the others use offsetof().
constexpr nvtxPayloadSchemaEntry_t CommSplitSchema[] = {
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of ranks", nullptr, 0, offsetof(NvtxParamsCommSplit, nranks)},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommSplit, cudaDev)},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "color", nullptr, 0, offsetof(NvtxParamsCommSplit, color)},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "key", nullptr, 0, offsetof(NvtxParamsCommSplit, key)},
|
||||
};
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config);
|
||||
ncclResult_t ncclCommSplit_impl(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) {
|
||||
struct ncclCommInitRankAsyncJob *job = NULL;
|
||||
struct ncclComm* childComm = NCCL_COMM_NULL;
|
||||
ncclResult_t res = ncclSuccess;
|
||||
|
||||
NvtxParamsCommSplit payload{comm->rank, comm->nRanks, comm->cudaDev, color, key};
|
||||
NVTX3_FUNC_WITH_PARAMS(CommSplit, CommSplitSchema, payload)
|
||||
NVTX3_RANGE(NcclNvtxParamsCommSplit)
|
||||
|
||||
int oldDev;
|
||||
CUDACHECK(cudaGetDevice(&oldDev));
|
||||
@@ -2863,6 +2804,12 @@ exit:
|
||||
(void)cudaSetDevice(oldDev);
|
||||
(void)ncclGroupErrCheck(res);
|
||||
NCCLCHECK(ncclGroupEndInternal());
|
||||
|
||||
if (res == ncclSuccess && *newcomm) {
|
||||
NVTX3_RANGE_ADD_PAYLOAD(CommSplit, NcclNvtxParamsCommSplitSchema,
|
||||
NVTX3_PAYLOAD((*newcomm)->commHash, comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, color, key));
|
||||
}
|
||||
|
||||
return res;
|
||||
fail:
|
||||
if (childComm) {
|
||||
|
||||
@@ -24,6 +24,8 @@ DECLARE_ROCM_PFN(hsa_init);
|
||||
DECLARE_ROCM_PFN(hsa_system_get_info);
|
||||
DECLARE_ROCM_PFN(hsa_status_string);
|
||||
|
||||
// Handle type used for cuMemCreate()
|
||||
CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
|
||||
static void *hsaLib;
|
||||
static uint16_t version_major, version_minor;
|
||||
|
||||
@@ -0,0 +1,82 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "mnnvl.h"
|
||||
#include "transport.h"
|
||||
#include <cuda.h>
|
||||
#include "rocmwrap.h"
|
||||
|
||||
// Determine if MNNVL support is available
|
||||
ncclResult_t ncclMnnvlCheck(struct ncclComm* comm) {
|
||||
// MNNVL requires cuMem to be enabled
|
||||
if (!ncclCuMemEnable()) return ncclSuccess;
|
||||
|
||||
// MNNVL also requires FABRIC handle support
|
||||
int cudaDev;
|
||||
int flag = 0;
|
||||
CUdevice currentDev;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
CUDACHECK(cuDeviceGet(¤tDev, cudaDev));
|
||||
// Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported
|
||||
(void) cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev);
|
||||
if (!flag) return ncclSuccess;
|
||||
// Check that all ranks have initialized the fabric fully
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
if (comm->peerInfo[i].fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED) return ncclSuccess;
|
||||
}
|
||||
|
||||
// Determine our MNNVL domain/clique
|
||||
NCCLCHECK(ncclCalloc(&comm->clique.ranks, comm->nRanks));
|
||||
comm->clique.id = comm->peerInfo[comm->rank].fabricInfo.cliqueId;
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[comm->rank].fabricInfo;
|
||||
nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo;
|
||||
// Check if the cluster UUID and cliqueId match
|
||||
// A zero UUID means we don't have MNNVL fabric info - disable MNNVL
|
||||
if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess;
|
||||
if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
|
||||
(fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
|
||||
if (i == comm->rank) {
|
||||
comm->cliqueRank = comm->clique.size;
|
||||
}
|
||||
comm->clique.ranks[comm->clique.size++] = i;
|
||||
}
|
||||
}
|
||||
|
||||
// No MNNVL clique found
|
||||
if (comm->clique.size <= 1) return ncclSuccess;
|
||||
|
||||
// Check that FABRIC handles can be exported & imported by IMEX
|
||||
{
|
||||
void *ptr = NULL;
|
||||
CUmemGenericAllocationHandle handle;
|
||||
ncclCuDesc cuDesc;
|
||||
CUresult err;
|
||||
|
||||
// Allocate FABRIC handle compatible memory
|
||||
ncclResult_t ret = ncclCuMemAlloc(&ptr, &handle, CU_MEM_HANDLE_TYPE_FABRIC, CUDA_IPC_MIN);
|
||||
if (ret != ncclSuccess) return ncclSuccess;
|
||||
err = cuMemExportToShareableHandle(&cuDesc, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0);
|
||||
if (err != CUDA_SUCCESS ||
|
||||
(err = cuMemImportFromShareableHandle(&handle, &cuDesc, CU_MEM_HANDLE_TYPE_FABRIC)) != CUDA_SUCCESS) {
|
||||
const char *errStr;
|
||||
(void) cuGetErrorString(err, &errStr);
|
||||
NCCLCHECK(ncclCuMemFree(ptr));
|
||||
// Return an error if this is a MNNVL capable system but it's not working
|
||||
WARN("MNNVL (cliqueSize %d) is available but not supported on this system. Check the IMEX configuration.",
|
||||
comm->clique.size);
|
||||
return ncclSystemError;
|
||||
}
|
||||
NCCLCHECK(ncclCuMemFree(ptr));
|
||||
|
||||
// Force the CUMEM handle type to be FABRIC for MNNVL
|
||||
ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC;
|
||||
comm->MNNVL = 1;
|
||||
INFO(NCCL_INIT, "MNNVL %d cliqueId %x cliqueSize %d cliqueRank %d",
|
||||
comm->MNNVL, comm->clique.id, comm->clique.size, comm->cliqueRank);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
+3
-15
@@ -8,6 +8,7 @@
|
||||
#include "msccl/msccl_setup.h"
|
||||
#include "msccl/msccl_status.h"
|
||||
#include "api_trace.h"
|
||||
#include "nvtx_payload_schemas.h"
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
@@ -49,21 +50,8 @@ ncclResult_t mscclRunAlgo_impl(
|
||||
size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
|
||||
mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream) {
|
||||
Recorder::instance().record("mscclRunAlgo");
|
||||
struct NvtxParamsMsccl {
|
||||
size_t bytes;
|
||||
ncclRedOp_t op;
|
||||
ncclDataType_t dataType;
|
||||
};
|
||||
// Just pass the size of one send/recv messages and not the total bytes sent/received.
|
||||
constexpr nvtxPayloadSchemaEntry_t MscclSchema[] = {
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
|
||||
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
|
||||
offsetof(NvtxParamsMsccl, op)},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_DATATYPE, "Data type", nullptr, 0,
|
||||
offsetof(NvtxParamsMsccl, dataType)}
|
||||
};
|
||||
NvtxParamsMsccl payload{count * ncclTypeSize(dataType), op, dataType};
|
||||
NVTX3_FUNC_WITH_PARAMS(MSCCL, MscclSchema, payload)
|
||||
NVTX3_FUNC_WITH_PARAMS(MSCCL, NcclNvtxParamsMSCCL,
|
||||
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(dataType), op, dataType));
|
||||
|
||||
mscclStatus& status = mscclGetStatus(comm->rank);
|
||||
struct mscclAlgo* hostAlgo = status.hostAlgos[mscclAlgoHandle];
|
||||
|
||||
+21
-15
@@ -23,6 +23,8 @@
|
||||
#include <sys/time.h>
|
||||
#include <sched.h>
|
||||
|
||||
#define NCCL_MAX_PROXY_CONNECTIONS (NCCL_MAX_LOCAL_RANKS+1)
|
||||
|
||||
void* ncclProxyServiceUDS(void* _args);
|
||||
|
||||
static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
|
||||
@@ -799,8 +801,8 @@ process_nextops:
|
||||
ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
|
||||
ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlAppend);
|
||||
TIME_START(2);
|
||||
int freeOp[NCCL_MAX_LOCAL_RANKS];
|
||||
int freeOpEnd[NCCL_MAX_LOCAL_RANKS];
|
||||
int freeOp[NCCL_MAX_PROXY_CONNECTIONS];
|
||||
int freeOpEnd[NCCL_MAX_PROXY_CONNECTIONS];
|
||||
for (int i = 0; i < proxyState->tpLocalnRanks; i++) freeOp[i] = -1;
|
||||
|
||||
uint64_t lastOpCount = 0;
|
||||
@@ -1090,7 +1092,8 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
|
||||
struct ncclProxyState* sharedProxyState = comm->proxyState;
|
||||
int tpProxyRank = comm->topParentRanks[proxyRank];
|
||||
|
||||
proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
|
||||
proxyConn->sameProcess = ((comm->peerInfo[proxyRank].hostHash == comm->peerInfo[comm->rank].hostHash) &&
|
||||
(comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash)) ? 1 : 0;
|
||||
// Keep one connection per local rank
|
||||
proxyConn->connection = NULL;
|
||||
proxyConn->tpRank = tpProxyRank;
|
||||
@@ -1223,7 +1226,7 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd" };
|
||||
const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd", "QueryFd", "Register", "Deregister" };
|
||||
ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) {
|
||||
struct ncclSocket* sock;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
@@ -1583,18 +1586,18 @@ void* ncclProxyService(void* _args) {
|
||||
connectionPool.banks = 0;
|
||||
connectionPool.offset = NCCL_PROXY_CONN_POOL_SIZE;
|
||||
|
||||
struct pollfd pollfds[NCCL_MAX_LOCAL_RANKS+1];
|
||||
struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS];
|
||||
memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS);
|
||||
for (int s=0; s<NCCL_MAX_LOCAL_RANKS; s++) {
|
||||
struct pollfd pollfds[NCCL_MAX_PROXY_CONNECTIONS+1]; // one extra for listenSock fd
|
||||
struct ncclProxyLocalPeer peers[NCCL_MAX_PROXY_CONNECTIONS];
|
||||
memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_PROXY_CONNECTIONS);
|
||||
for (int s=0; s<NCCL_MAX_PROXY_CONNECTIONS; s++) {
|
||||
pollfds[s].fd = -1;
|
||||
pollfds[s].events = POLLHUP|POLLIN;
|
||||
}
|
||||
if (ncclSocketGetFd(proxyState->listenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) {
|
||||
if (ncclSocketGetFd(proxyState->listenSock, &pollfds[NCCL_MAX_PROXY_CONNECTIONS].fd) != ncclSuccess) {
|
||||
WARN("[Proxy Service] Get listenSock fd fails");
|
||||
return NULL;
|
||||
};
|
||||
pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN;
|
||||
pollfds[NCCL_MAX_PROXY_CONNECTIONS].events = POLLIN;
|
||||
|
||||
int maxnpeers = 0;
|
||||
int npeers = 0;
|
||||
@@ -1608,17 +1611,19 @@ void* ncclProxyService(void* _args) {
|
||||
/* Never let the proxy service thread block in poll, or it cannot receive abortFlag. */
|
||||
int ret;
|
||||
do {
|
||||
ret = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : 500);
|
||||
// poll all fds including the listenSock
|
||||
ret = poll(pollfds, NCCL_MAX_PROXY_CONNECTIONS+1, asyncOpCount ? 0 : 500);
|
||||
} while (ret < 0 && errno == EINTR);
|
||||
if (ret < 0) {
|
||||
WARN("[Proxy Service] Poll failed: %s", strerror(errno));
|
||||
return NULL;
|
||||
}
|
||||
if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) {
|
||||
if (pollfds[NCCL_MAX_PROXY_CONNECTIONS].revents) {
|
||||
// We got an event on the listenSock
|
||||
int s = 0;
|
||||
while (s < NCCL_MAX_LOCAL_RANKS && pollfds[s].fd >= 0) s++;
|
||||
if (s == NCCL_MAX_LOCAL_RANKS) {
|
||||
WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_LOCAL_RANKS);
|
||||
while (s < NCCL_MAX_PROXY_CONNECTIONS && pollfds[s].fd >= 0) s++;
|
||||
if (s == NCCL_MAX_PROXY_CONNECTIONS) {
|
||||
WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_PROXY_CONNECTIONS);
|
||||
return NULL;
|
||||
}
|
||||
if (maxnpeers < s+1) maxnpeers = s+1;
|
||||
@@ -1851,6 +1856,7 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
|
||||
|
||||
if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
|
||||
if (*comm->abortFlag == 0 && sharedProxyState->peerAddresses) {
|
||||
// We need to send a ncclProxyMsgStop message to our own proxy
|
||||
struct ncclSocket sock;
|
||||
int type = ncclProxyMsgStop;
|
||||
NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag));
|
||||
|
||||
@@ -80,7 +80,7 @@ static int rasOutBufferSize = 0;
|
||||
|
||||
// We use them all over the place; no point in wasting the stack...
|
||||
static char lineBuf[1024]; // Temporary buffer used for printing at most 10 (RAS_CLIENT_DETAIL_THRESHOLD) rank numbers
|
||||
// or for printing the local GPU devices, which can't be more than 64 (NCCL_MAX_LOCAL_RANKS)
|
||||
// or for printing the local GPU devices, which can't be more than 64
|
||||
// small numbers (times two if the NVML mask is different than the CUDA mask).
|
||||
// Still, 1024 should normally be plenty (verbose output may make things more difficult,
|
||||
// but we do check for overflows, so it will just be trimmed).
|
||||
@@ -1687,7 +1687,7 @@ static int rasCommRanksCollOpCompare(const void* p1, const void* p2) {
|
||||
const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size) {
|
||||
bool first = true;
|
||||
buf[0] = '\0';
|
||||
for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++)
|
||||
for (int i = 0; i < sizeof(cudaDevs)*8; i++)
|
||||
if (cudaDevs & (1UL << i)) {
|
||||
snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i);
|
||||
first = false;
|
||||
@@ -1695,7 +1695,7 @@ const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf,
|
||||
if (cudaDevs != nvmlDevs) {
|
||||
snprintf(buf+strlen(buf), size-strlen(buf), " (NVML ");
|
||||
first = true;
|
||||
for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++)
|
||||
for (int i = 0; i < sizeof(nvmlDevs)*8; i++)
|
||||
if (nvmlDevs & (1UL << i)) {
|
||||
snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i);
|
||||
first = false;
|
||||
|
||||
@@ -78,7 +78,7 @@ struct rasCollResponse {
|
||||
struct rasPeerInfo {
|
||||
union ncclSocketAddress addr;
|
||||
pid_t pid;
|
||||
uint64_t cudaDevs; // Bitmask. Conveniently, NCCL_MAX_LOCAL_RANKS == 64.
|
||||
uint64_t cudaDevs; // Bitmask. This is for local devices so 64 bits is enough.
|
||||
uint64_t nvmlDevs; // Same, but not affected by CUDA_VISIBLE_DEVICES.
|
||||
};
|
||||
|
||||
|
||||
@@ -73,15 +73,19 @@ ncclResult_t ncclRegisterCollNvlsBuffers(
|
||||
|
||||
if (nvlsReged) {
|
||||
*regNeedConnect = 0;
|
||||
/* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to
|
||||
* saturate bandwidth. */
|
||||
/* tweak NVLS channels usage; for registered NVLS buffer to saturate bandwidth. */
|
||||
if (comm->nNodes == 1) {
|
||||
if (info->func == ncclFuncReduceScatter)
|
||||
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5));
|
||||
else
|
||||
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4));
|
||||
if (info->func == ncclFuncReduceScatter) {
|
||||
// RS: Further tweaks for Blackwell with NVLS registered buffers
|
||||
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 6 : 5));
|
||||
}
|
||||
else {
|
||||
// AR/AG: Further tweaks for Blackwell with NVLS registered buffers
|
||||
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 8 : 4));
|
||||
}
|
||||
} else {
|
||||
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6));
|
||||
// Further tweaks for Blackwell with NVLS registered buffers
|
||||
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 7 : 6));
|
||||
}
|
||||
info->regBufType |= NCCL_NVLS_REG_BUFFER;
|
||||
}
|
||||
|
||||
@@ -141,9 +141,11 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr,
|
||||
#include "channel.h"
|
||||
|
||||
#define NVLS_MEM_ALIGN_SIZE (1 << 21)
|
||||
#define NVLS_NCHANNELS_SM90 16
|
||||
#define NVLS_NCHANNELS_SM100 32
|
||||
|
||||
NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 2);
|
||||
NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16);
|
||||
NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", -2);
|
||||
NCCL_PARAM(NvlsChunkSize, "NVLS_CHUNKSIZE", 128*1024);
|
||||
|
||||
ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
|
||||
@@ -152,7 +154,7 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
|
||||
|
||||
int gpuCount;
|
||||
NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount));
|
||||
if (!ncclParamNvlsEnable() || ((!comm->MNNVL && gpuCount <= 2) || (comm->MNNVL && comm->clique.size <= 2))) return ncclSuccess;
|
||||
if (!ncclParamNvlsEnable() || gpuCount <= 2) return ncclSuccess;
|
||||
|
||||
CUdevice dev;
|
||||
int driverVersion;
|
||||
@@ -170,7 +172,11 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
|
||||
}
|
||||
|
||||
INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev);
|
||||
if (comm->nvlsSupport == 1) comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (int)ncclParamNvlsChannels()));
|
||||
if (comm->nvlsSupport) {
|
||||
int channels = (comm->compCap >= 100) ? NVLS_NCHANNELS_SM100 : NVLS_NCHANNELS_SM90;
|
||||
if (ncclParamNvlsChannels() >= 0) channels = ncclParamNvlsChannels();
|
||||
comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, channels));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -228,7 +228,7 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int refcount, ncclIpcDe
|
||||
|
||||
// cuMem API support
|
||||
CUmemGenericAllocationHandle handle;
|
||||
NCCLCHECK(ncclCuMemAlloc(ptr, &handle, size));
|
||||
NCCLCHECK(ncclCuMemAlloc(ptr, &handle, type, size));
|
||||
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
|
||||
// Return the native cuMem handle for later Export/Import via UDS
|
||||
memcpy(&ipcDesc->cuDesc.data, &handle, sizeof(handle));
|
||||
@@ -852,7 +852,7 @@ ncclResult_t ret = ncclSuccess;
|
||||
if (isLegacyIpc) *isLegacyIpc = false;
|
||||
if (regRecord) {
|
||||
// buffer was registered by users, we need to start to register or reuse it
|
||||
int peerLocalRank;
|
||||
int peerLocalRank = -1;
|
||||
for (int p = 0; p < nPeers; p++) {
|
||||
int peerRank = peerRanks[p];
|
||||
peerLocalRank = comm->rankToLocalRank[peerRank];
|
||||
@@ -924,8 +924,10 @@ ncclResult_t ret = ncclSuccess;
|
||||
ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr;
|
||||
// Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side
|
||||
// and get the remote register address back.
|
||||
if (proxyConn)
|
||||
if (proxyConn) {
|
||||
INFO(NCCL_REG, "rank %d - IPC registering buffer %p size %ld (baseAddr %p size %ld) to peer %d", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank);
|
||||
NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail);
|
||||
}
|
||||
if (rmtRegAddr) {
|
||||
NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail);
|
||||
assert(regRecord->ipcInfos[peerLocalRank] == NULL);
|
||||
@@ -943,7 +945,7 @@ ncclResult_t ret = ncclSuccess;
|
||||
regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr;
|
||||
needUpdate = true;
|
||||
*regBufFlag = 1;
|
||||
INFO(NCCL_REG, "rank %d - IPC register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr);
|
||||
INFO(NCCL_REG, "rank %d - IPC registered buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1077,6 +1079,8 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str
|
||||
assert(sizeof(struct p2pIpcExpInfo) == reqSize);
|
||||
assert(sizeof(void*) == respSize);
|
||||
|
||||
INFO(NCCL_REG, "Proxy rank %d register reqBuff %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, reqBuff, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess);
|
||||
|
||||
// request peer passes all necessary buffer info to import. The proxy thread would register
|
||||
// the buffer locally and return register addr back
|
||||
if (ipcExpInfo->legacyIpcCap) {
|
||||
@@ -1110,7 +1114,7 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str
|
||||
regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset);
|
||||
#endif
|
||||
}
|
||||
INFO(NCCL_REG, "Proxy rank %d register succeeds, regAddr %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, regAddr, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess);
|
||||
INFO(NCCL_REG, "Proxy rank %d register success regAddr %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, regAddr, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess);
|
||||
|
||||
exit:
|
||||
memcpy(respBuff, (void*)®Addr, sizeof(void*));
|
||||
|
||||
مرجع در شماره جدید
Block a user