Update ext-profiler example
Sync ext-profiler example with 2.26.2.
This commit is contained in:
zatwierdzone przez
Sylvain Jeaugey
rodzic
f44ac759fe
commit
145e67e707
+128
-14
@@ -49,9 +49,9 @@ of newer ones.
|
||||
The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
|
||||
from old API versions. It also provides error codes in `err.h`.
|
||||
|
||||
# API (v2)
|
||||
# API (v3)
|
||||
|
||||
Below is the main `ncclProfiler_v2` struct. Each function is explained in later sections.
|
||||
Below is the main `ncclProfiler_v3` struct. Each function is explained in later sections.
|
||||
|
||||
```
|
||||
typedef struct {
|
||||
@@ -70,7 +70,7 @@ typedef struct {
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
@@ -82,13 +82,13 @@ typedef struct {
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v2_t;
|
||||
} ncclProfiler_v3_t;
|
||||
```
|
||||
|
||||
## Error codes
|
||||
@@ -156,7 +156,6 @@ typedef struct {
|
||||
size_t count; // data count
|
||||
int root; // root rank
|
||||
const char* datatype; // string containing the name of the datatype
|
||||
size_t trafficBytes; // number of transfer bytes
|
||||
uint8_t nMaxChannels; // max number of channels for this collective
|
||||
uint8_t nWarps; // number of GPU warps for this collective
|
||||
const char* algo; // string containing name of the algorithm for this collective
|
||||
@@ -185,12 +184,22 @@ typedef struct {
|
||||
struct { // proxyStep events metadata
|
||||
int step; // individual step in `ncclProxyOp`
|
||||
} proxyStep;
|
||||
|
||||
struct {
|
||||
uint8_t channelId; // id of the channel used by the kernel
|
||||
} kernelCh;
|
||||
|
||||
struct {
|
||||
int64_t id; // net plugin id (used by net and profiler plugins to agree on event definitions)
|
||||
void* data; // pointer to network plugin defined event
|
||||
} netPlugin;
|
||||
};
|
||||
} ncclProfilerEventDescr_v2_t;
|
||||
} ncclProfilerEventDescr_v3_t;
|
||||
```
|
||||
|
||||
NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
|
||||
`ncclProfileProxyOp`, `ncclProfileProxyStep`, and `ncclProfileProxyCtrl`.
|
||||
`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and
|
||||
`ncclProfileNetPlugin`.
|
||||
|
||||
#### stopEvent
|
||||
|
||||
@@ -236,7 +245,7 @@ typedef enum {
|
||||
ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up
|
||||
ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin
|
||||
ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end
|
||||
} ncclProfilerEventState_v2_t;
|
||||
} ncclProfilerEventState_v3_t;
|
||||
```
|
||||
|
||||
`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
|
||||
@@ -251,6 +260,89 @@ the channel. Thus, they provide a more fine-grained view w.r.t. ProxyOp events.
|
||||
network requests for the GPU kernel. This includes everything else that the proxy thread might be
|
||||
doing, including appending new `ncclProxyOp` objects to the list of work elements to process.
|
||||
|
||||
`ncclProfileKernelCh` events are generated by the profiler proxy progress function while the kernel
|
||||
processes work items for the enqueued NCCL operations.
|
||||
|
||||
`ncclProfileNetPlugin` events are generated by the network plugin. Network plugins are free to define
|
||||
their own set of events and communicate them to the profiler plugin using `ncclProfileNetPlugin` and
|
||||
the `ncclProfilerCallback_t` NCCL core callback. The network and profiler plugin can agree on the
|
||||
network defined event definition using the plugin id in the event descriptor. The plugin identifier
|
||||
is a 64-bit integer that has two parts: the 16 LSB are assigned to the plugin event version, the next
|
||||
16 bits are assigned to the plugin type (`NCCL_PROFILER_NET_TYPE_IB`, ...). The rest of the bits are
|
||||
unused and available for future extensions.
|
||||
|
||||
A network IB plugin can use this infrastructure to define a QP event as:
|
||||
|
||||
```C
|
||||
#define NCCL_PROFILER_NET_IB_VER 1
|
||||
|
||||
enum {
|
||||
ncclProfileQp = (1 << 0),
|
||||
};
|
||||
|
||||
// The data structure version is encoded in the plugin identifier bitmask and
|
||||
// passed to NCCL core through the profiler callback. NCCL copies the plugin
|
||||
// identifier in the event descriptor before calling the profiler startEvent
|
||||
// function. The profiler should inspect the plugin id to find out the source
|
||||
// plugin as well as the version of the event struct
|
||||
typedef struct {
|
||||
uint8_t type; // event type (plugin defined)
|
||||
union {
|
||||
struct {
|
||||
int device; // network device id
|
||||
uint64_t wr_id; // work request id
|
||||
int opcode; // ibv opcode
|
||||
int qpNum; // QP number
|
||||
size_t length; // work request data length
|
||||
} qp;
|
||||
};
|
||||
} ncclProfilerNetIbDescr_v1_t;
|
||||
```
|
||||
|
||||
The network event infrastructure is network agnostic. A different network socket plugin can
|
||||
use it to define a socket event as:
|
||||
|
||||
```C
|
||||
#define NCCL_PROFILER_NET_SOCKET_VER 1
|
||||
|
||||
enum {
|
||||
ncclProfileSocket = (1 << 0),
|
||||
};
|
||||
|
||||
// The data structure version is encoded in the plugin identifier bitmask and
|
||||
// passed to NCCL core through the profiler callback. NCCL copies the plugin
|
||||
// identifier in the event descriptor before calling the profiler startEvent
|
||||
// function. The profiler should inspect the plugin id to find out the source
|
||||
// plugin as well as the version of the event struct
|
||||
typedef struct {
|
||||
uint8_t type; // event type (plugin defined)
|
||||
union {
|
||||
struct {
|
||||
int fd;
|
||||
int op;
|
||||
size_t length;
|
||||
} sock;
|
||||
};
|
||||
} ncclProfilerNetSockDescr_v1_t;
|
||||
```
|
||||
|
||||
The network plugin creates an event (descriptor) and passes it to the profiler callback,
|
||||
along with the network type and version (plugin id). NCCL then creates a `ncclProfileNetPlugin`
|
||||
event descriptor, attaches the network plugin defined event as external data, and calls
|
||||
the profiler `startEvent` function.
|
||||
|
||||
```C
|
||||
ncclResult_t isend(..., void* phandle, ...) {
|
||||
...
|
||||
int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
|
||||
ncclProfilerNetIbDescr_v1_t eDescr = { };
|
||||
eDescr.type = ncclProfileQp;
|
||||
eDescr.qp = { ... };
|
||||
ncclProfilerCallback(&eHandle, 0 /* start net event */, phandle, pluginId, &eDescr);
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
State transitions for the events described can also come with event attribute updates. For this
|
||||
reason the profiler defines the `ncclProfilerEventStateArgs_t` union, shown below.
|
||||
|
||||
@@ -264,7 +356,7 @@ typedef union {
|
||||
struct { // attributes to update for ncclProfileProxyCtrl
|
||||
int appendedProxyOps; // number of appended proxy ops thus far
|
||||
} proxyCtrl;
|
||||
} ncclProfilerEventStateArgs_v2_t;
|
||||
} ncclProfilerEventStateArgs_v3_t;
|
||||
```
|
||||
|
||||
The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
|
||||
@@ -279,14 +371,22 @@ Group event
|
||||
+- Collective event
|
||||
| |
|
||||
| +- ProxyOp event
|
||||
| |
|
||||
| +- ProxyStep event
|
||||
| | |
|
||||
| | +- ProxyStep event
|
||||
| | |
|
||||
| | +- NetPlugin event
|
||||
| |
|
||||
| +- KernelCh event
|
||||
|
|
||||
+- Point-to-point event
|
||||
|
|
||||
+- ProxyOp event
|
||||
|
|
||||
+- ProxyStep event
|
||||
| |
|
||||
| +- ProxyStep event
|
||||
| |
|
||||
| +- NetPlugin event
|
||||
|
|
||||
+- KernelCh event
|
||||
|
||||
ProxyCtrl event
|
||||
```
|
||||
@@ -316,3 +416,17 @@ thread originating the operation. To avoid the profiler instance in the remote p
|
||||
dereference a pointer from another address space, the event descriptor includes the PID of the originator.
|
||||
The profiler plugin needs to check that the originator PID matches the local PID before dereferencing the
|
||||
parent event.
|
||||
|
||||
# Known Limitations
|
||||
|
||||
In intra-node communication, or whenever a rank does not have any network activity, proxy events
|
||||
are unavailable and the profiler will only report the enqueue events (e.g., ncclAllReduce). The events from
|
||||
enqueue can be time stamped by the profiler (at start and stop) to reconstruct the execution time of the
|
||||
collective. However, this time only represents the launch time of the collective and not the actual
|
||||
execution time. To reconstruct the execution time more accurately proxy and kernel events are provided.
|
||||
|
||||
Kernel events instrumentation leverages counters exposed by the kernel to the host and the proxy progress
|
||||
thread. Thus, the proxy progress thread infrastructure is shared between the network and the profiler. If
|
||||
the proxy is serving network requests, the kernel profiling probes can be delayed, causing loss of
|
||||
accuracy. Similarly, if the CPU is under heavy load and the scheduling of the proxy progress thread is
|
||||
delayed, a similar loss of accuracy can be encountered. Keep this in mind when using kernel events.
|
||||
|
||||
Reference in New Issue
Block a user