2.22.3-1
Rework core for NVIDIA Trusted Computing * Compress work structs so that they are shared between channels * Utilize the full amount of kernel argument space permitted (4k) before resorting to work fifo. * Rework the task preprocessing phase. * Use a separate abortDevFlag which is kept in sync with abortFlag using cudaMemcpy operations. * Rename src/include/align.h to src/include/bitops.h Add lazy connection establishment for collective operations * Move buffer allocation and connection establishment to the first collective operation using that algorithm. * Accelerate init time and reduce memory usage. * Avoid allocating NVLS buffers if all calls are registered. * Compute algo/proto in ncclLaunchCollTasksInfo early on. * Connect peers in ncclCollPreconnectFunc if not connected already. * Also move shared buffer creation to the first send/recv call. Accelerate intra-node NVLink detection * Make each rank only detect NVLinks attached to its GPU. * Fuse XMLs to reconstruct the full NVLink topology Add init profiling to report time spent in different init phases. * Report timings of bootstrap, allgather, search, connect, etc. * Add new "PROFILE" category for NCCL_DEBUG_SUBSYS. Add support for PCI p2p on split PCI switches * Detect split PCI switches through a kernel module exposing switch information. * Update the topology XML and graph to add those inter-switch connections. Add cost estimation API * Add a new ncclGroupEndSimulate primitive to return the estimated time a group would take. Net/IB: Add separate traffic class for fifo messages * Add NCCL_IB_FIFO_TC to control the traffic class of fifo messages independently from NCCL_IB_TC. Merges PR #1194 Net/IB: Add support for IB router * Use flid instead of lid if subnets do not match * Warn if flid is 0 Optimizations and fixes for device network offload (unpack) * Double the default number of channels * Cache netDeviceType * Fix save/increment head logic to enable Tree support. 
Support ncclGroupStart/End for ncclCommAbort/Destroy * Allow Abort/Destroy to be called within a group when managing multiple GPUs with a single process. Improve Tuner API * Provide to the plugin the original cost table so that the plugin can leave unknown or disabled algo/proto combinations untouched. * Remove nvlsSupport and collnetSupport. Do not print version to stdout when using a debug file * Also print version from all processes with INFO debug level. Fixes issue #1271 Fix clang warnings in NVTX headers * Update NVTX headers to the latest version Fixes issue #1270 Disable port fusion in heterogeneous systems * Do not fuse ports if a mix of multi-port and single port are detected. Fix NVLS graphs search for dual NICs. * Fix NVLS graph search when we have more than one NIC per GPU. Fix crash with collnetDirect * Add separate graph search for collnetDirect, testing alltoall paths and working similarly to the NVLS search. Fix hang when nodes have different CPU types * Add the CPU type to the rank peer info. * Align all ranks on the CPU type after the first allgather. * Only use the aligned CPU type for all tuning operations. Fixes issue #1136 Fixes issue #1184 Fix performance of registered send/recv operations * Allow for single full size operations * Add INFO to confirm the registration of send/recv buffers. Move all sync ops to finalize stage * Ensure ncclCommDestroy is non-blocking if ncclCommFinalize has been called. Improve error reporting during SHM segment creation Improve support of various compilers Merges PR #1177 Merges PR #1228 Allow net and tuner plugins to be statically linked * Search for ncclNet or ncclTuner symbols in the main binary. Merges PR #979 Plugin examples includes cleanup * Harmonize err.h and common.h usage. * Add mixed plugin with both net and tuner.
This commit is contained in:
@@ -0,0 +1,15 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef COMMON_H_
|
||||
#define COMMON_H_
|
||||
|
||||
// Log levels accepted by the logger callback (ncclDebugLogger_t); values are fixed, from NCCL_LOG_NONE=0 up to NCCL_LOG_TRACE=5.
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||
// Debug subsystem identifiers. Each named subsystem is a distinct power of two (1 .. 8192); NCCL_ALL (~0) has every bit set.
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
|
||||
// Pointer type for a printf-style logging callback: log level, flags, source file and line, then a format string plus variadic arguments.
// NOTE(review): flags is presumably a mask of ncclDebugLogSubSys values — confirm against the core logger.
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
#endif
|
||||
@@ -11,6 +11,7 @@ typedef enum { ncclSuccess = 0,
|
||||
ncclSystemError = 2,
|
||||
ncclInternalError = 3,
|
||||
ncclInvalidArgument = 4,
|
||||
ncclInvalidUsage = 5,
|
||||
ncclRemoteError = 6 } ncclResult_t;
|
||||
|
||||
#endif
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "common.h"
|
||||
#include "err.h"
|
||||
|
||||
#define NCCL_NET_HANDLE_MAXSIZE 128
|
||||
@@ -19,11 +20,6 @@
|
||||
// Maximum number of requests per comm object
|
||||
#define NCCL_NET_MAX_REQUESTS 32
|
||||
|
||||
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
#include "net_v8.h"
|
||||
#include "net_v7.h"
|
||||
#include "net_v6.h"
|
||||
|
||||
@@ -2,8 +2,8 @@
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NCCL_ERR_H_
|
||||
#define NCCL_ERR_H_
|
||||
#ifndef NCCL_TYPES_H_
|
||||
#define NCCL_TYPES_H_
|
||||
|
||||
/* Data types */
|
||||
typedef enum { ncclInt8 = 0, ncclChar = 0,
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef COMMON_H_
|
||||
#define COMMON_H_
|
||||
|
||||
// Log levels accepted by the logger callback (ncclDebugLogger_t); values are fixed, from NCCL_LOG_NONE=0 up to NCCL_LOG_TRACE=5.
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||
// Debug subsystem identifiers. Each named subsystem is a distinct power of two (1 .. 8192); NCCL_ALL (~0) has every bit set.
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
|
||||
// Pointer type for a printf-style logging callback: log level, flags, source file and line, then a format string plus variadic arguments.
// NOTE(review): flags is presumably a mask of ncclDebugLogSubSys values — confirm against the core logger.
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,17 @@
|
||||
/*
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NCCL_ERR_H_
|
||||
#define NCCL_ERR_H_
|
||||
|
||||
/* Error type for plugins: ncclSuccess is 0; the error codes are numbered 1 through 6. */
|
||||
typedef enum { ncclSuccess = 0,
|
||||
ncclUnhandledCudaError = 1,
|
||||
ncclSystemError = 2,
|
||||
ncclInternalError = 3,
|
||||
ncclInvalidArgument = 4,
|
||||
ncclInvalidUsage = 5,
|
||||
ncclRemoteError = 6 } ncclResult_t;
|
||||
|
||||
#endif
|
||||
@@ -8,15 +8,24 @@
|
||||
#ifndef NCCL_TUNER_H_
|
||||
#define NCCL_TUNER_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
#include "common.h"
|
||||
#include "err.h"
|
||||
|
||||
#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
|
||||
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
|
||||
// Function identifiers with explicit values 0 through 7; ncclNumFuncs (8) equals the number of identifiers listed before it.
typedef enum {
|
||||
ncclFuncBroadcast = 0,
|
||||
ncclFuncReduce = 1,
|
||||
ncclFuncAllGather = 2,
|
||||
ncclFuncReduceScatter = 3,
|
||||
ncclFuncAllReduce = 4,
|
||||
ncclFuncSendRecv = 5,
|
||||
ncclFuncSend = 6,
|
||||
ncclFuncRecv = 7,
|
||||
ncclNumFuncs = 8
|
||||
} ncclFunc_t;
|
||||
|
||||
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
|
||||
#define NCCL_ALGO_UNDEF -1
|
||||
@@ -33,6 +42,8 @@ typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncRed
|
||||
#define NCCL_PROTO_LL128 1
|
||||
#define NCCL_PROTO_SIMPLE 2
|
||||
|
||||
#define NCCL_ALGO_PROTO_IGNORE -1.0
|
||||
|
||||
// API to be implemented by external tuner
|
||||
typedef struct {
|
||||
// Name of the tuner
|
||||
@@ -52,31 +63,33 @@ typedef struct {
|
||||
// - context: tuner context object
|
||||
// - collType: collective type, e.g., allreduce, allgather…
|
||||
// - nBytes: collective size in bytes
|
||||
// - collNetSupport: whether collnet supports this type
|
||||
// - nvlsSupport: whether nvlink sharp supports this time
|
||||
// - numPipeOps: number of operations in the group
|
||||
// - numAlgo: number of algorithms in collCostTable
|
||||
// - numProto: number of protocols in collCostTable
|
||||
//
|
||||
// Outputs:
|
||||
// - algorithm: selected algorithm to be used for the given collective
|
||||
// - protocol: selected protocol to be used for the given collective
|
||||
// - nChannels: number of channels (hence SMs) to be used.
|
||||
//
|
||||
// InOut:
|
||||
// - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
|
||||
// NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
|
||||
//
|
||||
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
|
||||
// default tuning for the given collective.
|
||||
// Also, the plugin is allowed to not set any output, or set only the
|
||||
// algorithm and protocol, but not only the algorithm or only the protocol.
|
||||
// Unset fields will be set automatically by NCCL.
|
||||
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps,
|
||||
int *algorithm, int *protocol, int* nChannels);
|
||||
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
|
||||
int* nChannels);
|
||||
|
||||
// Terminates the plugin and cleans up any resources that the plugin allocated.
|
||||
// context: tuner context object
|
||||
ncclResult_t (*destroy)(void* context);
|
||||
} ncclTuner_v2_t;
|
||||
} ncclTuner_v3_t;
|
||||
|
||||
typedef ncclTuner_v2_t ncclTuner_t;
|
||||
typedef ncclTuner_v3_t ncclTuner_t;
|
||||
|
||||
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
|
||||
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"
|
||||
|
||||
#endif
|
||||
|
||||
@@ -11,14 +11,21 @@
|
||||
// Example init hook: does no setup, does not write to *context, and always reports success.
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) {
  return ncclSuccess;
}
|
||||
|
||||
__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps,
|
||||
int *algorithm, int *protocol, int* nChannels) { *algorithm = NCCL_ALGO_RING; *protocol = NCCL_PROTO_SIMPLE; return ncclSuccess; }
|
||||
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
|
||||
int* nChannels) {
|
||||
// Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo
|
||||
if (collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
|
||||
collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
|
||||
}
|
||||
*nChannels = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Example teardown hook: the context is ignored and success is always returned.
__hidden ncclResult_t pluginDestroy(void* context) {
  return ncclSuccess;
}
|
||||
|
||||
#define PLUGIN_NAME "Example"
|
||||
|
||||
const ncclTuner_v2_t ncclTunerPlugin_v2 = {
|
||||
const ncclTuner_v3_t ncclTunerPlugin_v3 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.getCollInfo = pluginGetCollInfo,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 21
|
||||
NCCL_PATCH := 5
|
||||
NCCL_MINOR := 22
|
||||
NCCL_PATCH := 3
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
+3
-2
@@ -201,7 +201,6 @@ ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFrom
|
||||
|
||||
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
|
||||
memset(handle, 0, sizeof(ncclBootstrapHandle));
|
||||
NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
|
||||
|
||||
const char* env = ncclGetEnv("NCCL_COMM_ID");
|
||||
if (env) {
|
||||
@@ -210,7 +209,9 @@ ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
|
||||
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
handle->magic = NCCL_MAGIC;
|
||||
} else {
|
||||
NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
|
||||
memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
|
||||
NCCLCHECK(bootstrapCreateRoot(handle, false));
|
||||
}
|
||||
@@ -626,7 +627,7 @@ ncclResult_t bootstrapClose(void* commState) {
|
||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||
if (state->unexpectedConnections != NULL) {
|
||||
unexpectedFree(state);
|
||||
if (__atomic_load_n(state->abortFlag, __ATOMIC_RELAXED) == 0) {
|
||||
if (__atomic_load_n(state->abortFlag, __ATOMIC_ACQUIRE) == 0) {
|
||||
WARN("Unexpected connections are not empty");
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
+5
-3
@@ -7,16 +7,17 @@
|
||||
#include "channel.h"
|
||||
#include "param.h"
|
||||
#include "gdrwrap.h"
|
||||
#include "transport.h"
|
||||
|
||||
ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
|
||||
struct ncclChannel* channel = &comm->channels[channelId];
|
||||
if (channel->id != -1) return ncclSuccess;
|
||||
|
||||
int nRanks = comm->nRanks;
|
||||
int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks;
|
||||
int nvlsRanks = comm->localRanks;
|
||||
int nPeers = nRanks + 1 /* Collnet */ + nvlsRanks /* NVLS */;
|
||||
channel->id = channelId;
|
||||
channel->workFifoSent = 0;
|
||||
channel->workFifoProduced = 0;
|
||||
|
||||
struct ncclSharedResources* sharedRes = comm->sharedRes;
|
||||
|
||||
@@ -74,7 +75,8 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo
|
||||
|
||||
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
|
||||
|
||||
int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks;
|
||||
int nvlsRanks = comm->localRanks;
|
||||
|
||||
if (share) {
|
||||
channel->nvlsPeers = parent->channels[channelId].nvlsPeers;
|
||||
channel->nvlsDevPeers = parent->channels[channelId].nvlsDevPeers;
|
||||
|
||||
@@ -9,6 +9,69 @@
|
||||
#include "enqueue.h"
|
||||
#include "nccl.h"
|
||||
|
||||
// Maps a function identifier to its printable name.
// Any value without a dedicated case yields "Invalid".
const char* ncclFuncToString(ncclFunc_t fn) {
  const char* label = "Invalid";
  switch (fn) {
  case ncclFuncBroadcast:     label = "Broadcast";     break;
  case ncclFuncReduce:        label = "Reduce";        break;
  case ncclFuncAllGather:     label = "AllGather";     break;
  case ncclFuncReduceScatter: label = "ReduceScatter"; break;
  case ncclFuncAllReduce:     label = "AllReduce";     break;
  case ncclFuncSendRecv:      label = "SendRecv";      break;
  case ncclFuncSend:          label = "Send";          break;
  case ncclFuncRecv:          label = "Recv";          break;
  default: break;
  }
  return label;
}
|
||||
|
||||
// Maps a device reduction operator to its printable name.
// Operators without a dedicated entry are reported as "Unknown".
const char* ncclDevRedOpToString(ncclDevRedOp_t op) {
  if (op == ncclDevSum) return "Sum";
  if (op == ncclDevProd) return "Prod";
  if (op == ncclDevMinMax) return "MinMax";
  if (op == ncclDevPreMulSum) return "PreMulSum";
  if (op == ncclDevSumPostDiv) return "SumPostDiv";
  return "Unknown";
}
|
||||
|
||||
// Maps an NCCL datatype to its printable enumerator name.
// Fix: ncclUint8 previously had no case and was reported as "Unknown".
// ncclBfloat16 is only handled when the CUDA headers provide bf16 types.
// Any other value yields "Unknown".
const char* ncclDatatypeToString(ncclDataType_t type) {
  switch (type) {
  case ncclInt8: return "ncclInt8";
  case ncclUint8: return "ncclUint8";
  case ncclInt32: return "ncclInt32";
  case ncclUint32: return "ncclUint32";
  case ncclInt64: return "ncclInt64";
  case ncclUint64: return "ncclUint64";
  case ncclFloat16: return "ncclFloat16";
  case ncclFloat32: return "ncclFloat32";
  case ncclFloat64: return "ncclFloat64";
#if defined(__CUDA_BF16_TYPES_EXIST__)
  case ncclBfloat16: return "ncclBfloat16";
#endif
  default: return "Unknown";
  }
}
|
||||
|
||||
const char* ncclAlgoToString(int algo) {
|
||||
switch (algo) {
|
||||
case NCCL_ALGO_TREE: return "TREE";
|
||||
case NCCL_ALGO_RING: return "RING";
|
||||
case NCCL_ALGO_COLLNET_DIRECT: return "COLLNET_DIRECT";
|
||||
case NCCL_ALGO_COLLNET_CHAIN: return "COLLNET_CHAIN";
|
||||
case NCCL_ALGO_NVLS: return "NVLS";
|
||||
case NCCL_ALGO_NVLS_TREE: return "NVLS_TREE";
|
||||
default: return "Unknown";
|
||||
}
|
||||
}
|
||||
|
||||
const char* ncclProtoToString(int proto) {
|
||||
switch (proto) {
|
||||
case NCCL_PROTO_LL: return "LL";
|
||||
case NCCL_PROTO_LL128: return "LL128";
|
||||
case NCCL_PROTO_SIMPLE: return "SIMPLE";
|
||||
default: return "Unknown";
|
||||
}
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
|
||||
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
|
||||
+29
-18
@@ -8,7 +8,10 @@
|
||||
#include "nccl_net.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
#include <string.h>
|
||||
#include <strings.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <chrono>
|
||||
#include "param.h"
|
||||
|
||||
int ncclDebugLevel = -1;
|
||||
@@ -16,14 +19,15 @@ static int pid = -1;
|
||||
static char hostname[1024];
|
||||
thread_local int ncclDebugNoWarn = 0;
|
||||
char ncclLastError[1024] = ""; // Global string for the last error in human readable form
|
||||
uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV
|
||||
static uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV
|
||||
FILE *ncclDebugFile = stdout;
|
||||
pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
std::chrono::steady_clock::time_point ncclEpoch;
|
||||
static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static std::chrono::steady_clock::time_point ncclEpoch;
|
||||
static bool ncclWarnSetDebugInfo = false;
|
||||
|
||||
static __thread int tid = -1;
|
||||
|
||||
void ncclDebugInit() {
|
||||
static void ncclDebugInit() {
|
||||
pthread_mutex_lock(&ncclDebugLock);
|
||||
if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
|
||||
const char* nccl_debug = ncclGetEnv("NCCL_DEBUG");
|
||||
@@ -83,6 +87,8 @@ void ncclDebugInit() {
|
||||
mask = NCCL_BOOTSTRAP;
|
||||
} else if (strcasecmp(subsys, "REG") == 0) {
|
||||
mask = NCCL_REG;
|
||||
} else if (strcasecmp(subsys, "PROFILE") == 0) {
|
||||
mask = NCCL_PROFILE;
|
||||
} else if (strcasecmp(subsys, "ALL") == 0) {
|
||||
mask = NCCL_ALL;
|
||||
}
|
||||
@@ -94,6 +100,15 @@ void ncclDebugInit() {
|
||||
free(ncclDebugSubsys);
|
||||
}
|
||||
|
||||
const char* ncclWarnSetDebugInfoEnv = ncclGetEnv("NCCL_WARN_ENABLE_DEBUG_INFO");
|
||||
if (ncclWarnSetDebugInfoEnv != NULL && strlen(ncclWarnSetDebugInfoEnv) > 0) {
|
||||
int64_t value;
|
||||
errno = 0;
|
||||
value = strtoll(ncclWarnSetDebugInfoEnv, NULL, 0);
|
||||
if (!errno)
|
||||
ncclWarnSetDebugInfo = value;
|
||||
}
|
||||
|
||||
// Cache pid and hostname
|
||||
getHostName(hostname, 1024, '.');
|
||||
pid = getpid();
|
||||
@@ -143,8 +158,6 @@ void ncclDebugInit() {
|
||||
pthread_mutex_unlock(&ncclDebugLock);
|
||||
}
|
||||
|
||||
NCCL_PARAM(WarnSetDebugInfo, "WARN_ENABLE_DEBUG_INFO", 0);
|
||||
|
||||
/* Common logging function used by the INFO, WARN and TRACE macros
|
||||
* Also exported to the dynamically loadable Net transport modules so
|
||||
* they can share the debugging mechanisms and output files
|
||||
@@ -178,7 +191,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
||||
if (level == NCCL_LOG_WARN) {
|
||||
len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ",
|
||||
hostname, pid, tid, cudaDev, filefunc, line);
|
||||
if (ncclParamWarnSetDebugInfo()) ncclDebugLevel = NCCL_LOG_INFO;
|
||||
if (ncclWarnSetDebugInfo) ncclDebugLevel = NCCL_LOG_INFO;
|
||||
} else if (level == NCCL_LOG_INFO) {
|
||||
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
|
||||
} else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) {
|
||||
@@ -190,17 +203,15 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
||||
hostname, pid, tid, cudaDev, timestamp, filefunc, line);
|
||||
}
|
||||
|
||||
if (len) {
|
||||
va_list vargs;
|
||||
va_start(vargs, fmt);
|
||||
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
|
||||
va_end(vargs);
|
||||
// vsnprintf may return len > sizeof(buffer) in the case of a truncated output.
|
||||
// Rewind len so that we can replace the final \0 by \n
|
||||
if (len > sizeof(buffer)) len = sizeof(buffer)-1;
|
||||
buffer[len++] = '\n';
|
||||
fwrite(buffer, 1, len, ncclDebugFile);
|
||||
}
|
||||
va_list vargs;
|
||||
va_start(vargs, fmt);
|
||||
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
|
||||
va_end(vargs);
|
||||
// vsnprintf may return len > sizeof(buffer) in the case of a truncated output.
|
||||
// Rewind len so that we can replace the final \0 by \n
|
||||
if (len > sizeof(buffer)) len = sizeof(buffer)-1;
|
||||
buffer[len++] = '\n';
|
||||
if (len) fwrite(buffer, 1, len, ncclDebugFile);
|
||||
}
|
||||
|
||||
NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
|
||||
|
||||
+55
-62
@@ -10,30 +10,26 @@
|
||||
|
||||
namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = (int)args->nWarps * WARP_SIZE;
|
||||
__device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
const int *ringRanks = ring->userRanks;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const size_t channelCount = args->workCount;
|
||||
const size_t gridOffset = args->workOffset;
|
||||
const size_t count = args->count;
|
||||
size_t count, partOffset, partCount, chunkCount;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount);
|
||||
size_t offset;
|
||||
size_t dataOffset;
|
||||
int nelem;
|
||||
int rankDest;
|
||||
|
||||
T *inputBuf = (T*)args->sendbuff;
|
||||
T *outputBuf = (T*)args->recvbuff;
|
||||
T *inputBuf = (T*)work->sendbuff;
|
||||
T *outputBuf = (T*)work->recvbuff;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
|
||||
(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg);
|
||||
(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg);
|
||||
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) {
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
dataOffset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, partCount - elemOffset);
|
||||
dataOffset = partOffset + elemOffset;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ringRanks[0];
|
||||
@@ -64,52 +60,50 @@ namespace {
|
||||
}
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
using Proto = ProtoSimple<ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS>;
|
||||
runRing<T, RedOp, Proto>(args);
|
||||
runRing<T, RedOp, Proto>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL>(args);
|
||||
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL128>(args);
|
||||
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
|
||||
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
|
||||
const ssize_t count = args->count;
|
||||
const ssize_t rank = ncclShmem.comm.rank;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
size_t gridOffset = args->workOffset;
|
||||
size_t channelCount = args->workCount;
|
||||
size_t count, gridOffset, channelCount;
|
||||
size_t chunkCount;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
const int nThreadsBcast = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE;
|
||||
const int nThreadsGather = args->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast;
|
||||
const int nThreadsBcast = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE;
|
||||
const int nThreadsGather = work->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast;
|
||||
const int tidEndGather = nThreadsGather;
|
||||
const int tidEndBcast = tidEndGather + nThreadsBcast;
|
||||
|
||||
if (!args->regUsed) {
|
||||
if (!work->regUsed) {
|
||||
if (tid < tidEndGather) {
|
||||
// Gather
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
prims(tid, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff,
|
||||
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
@@ -119,8 +113,8 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
// Bcast through NVLS
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL,
|
||||
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
|
||||
prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, work->sendbuff, NULL,
|
||||
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
@@ -133,7 +127,7 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsGather, nvls->up, nvls->up, NULL, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
|
||||
/* used as sync */
|
||||
prims.scatter(0, 0, 0, 0, -1, 0);
|
||||
@@ -144,8 +138,8 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
} else if (tid < tidEndBcast) {
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, args->sendbuff, NULL,
|
||||
args->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, work->sendbuff, NULL,
|
||||
work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
/* used as sync */
|
||||
prims.recv(0, 0);
|
||||
|
||||
@@ -161,10 +155,10 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
|
||||
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
|
||||
template<bool BcastSendNotRecv>
|
||||
struct Scatterer {
|
||||
struct ncclWorkElem* args;
|
||||
struct ncclDevWorkColl* work;
|
||||
ssize_t chunkSize;
|
||||
ssize_t railGridOffset;
|
||||
|
||||
@@ -179,13 +173,13 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
|
||||
int nNodes = ncclShmem.comm.nNodes;
|
||||
int nRails = direct->nHeads;
|
||||
int bid = args->bid;
|
||||
char* inbuf = (char*)args->sendbuff;
|
||||
char* outbuf = (char*)args->recvbuff;
|
||||
ssize_t sizePerRank = args->count*sizeof(T);
|
||||
int part = ncclShmem.channelId - work->channelLo;
|
||||
char* inbuf = (char*)work->sendbuff;
|
||||
char* outbuf = (char*)work->recvbuff;
|
||||
ssize_t sizePerRank = work->collnet.count*sizeof(T);
|
||||
bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank*sizePerRank);
|
||||
|
||||
ssize_t railAllBeg = min(railGridOffset + bid*chunkSize, nNodes*sizePerRank);
|
||||
ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank);
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
|
||||
int railAllSize = railAllEnd - railAllBeg;
|
||||
if (tid < nDsts) dstSizes[tid] = railAllSize;
|
||||
@@ -232,28 +226,27 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
}
|
||||
};
|
||||
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
int tid = threadIdx.x;
|
||||
const int nChannels = args->nChannels;
|
||||
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
|
||||
const int part = ncclShmem.channelId - work->channelLo;
|
||||
const int nChannels = work->channelHi - work->channelLo + 1;
|
||||
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
|
||||
int const &nNodes = ncclShmem.comm.nNodes;
|
||||
ssize_t chunkSize = int(args->chunkCount);
|
||||
ssize_t const &sizePerRank = args->count;
|
||||
|
||||
ssize_t sizePerRank = work->collnet.count*sizeof(T);
|
||||
size_t chunkSize = work->collnet.chunkCount;
|
||||
bool isMultiRail = (direct->nHeads > 1);
|
||||
int nWarps1 = 1;
|
||||
int nWarps2 = (isMultiRail ? 2 : 1);
|
||||
int nWarps3 = (isMultiRail ? 2 : 0);
|
||||
float denom = float(args->nWarps)/float(nWarps1+nWarps2+nWarps3);
|
||||
float denom = float(work->nWarps)/float(nWarps1+nWarps2+nWarps3);
|
||||
nWarps3 = int(denom*nWarps3);
|
||||
nWarps2 = int(denom*nWarps2);
|
||||
nWarps1 = args->nWarps - (nWarps2+nWarps3);
|
||||
nWarps1 = work->nWarps - (nWarps2+nWarps3);
|
||||
|
||||
using Proto = ProtoSimple<1, 1>;
|
||||
|
||||
int tn = nWarps1*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (tid == 0) {
|
||||
int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
|
||||
@@ -262,10 +255,10 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
} else {
|
||||
// Phase 1: send to network
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, nullptr, &direct->out, args->sendbuff, nullptr,
|
||||
prims(tid, tn, nullptr, &direct->out, work->sendbuff, nullptr,
|
||||
/*redOpArg=*/0, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
|
||||
ssize_t railAllBeg = railGridOffset + args->bid * chunkSize;
|
||||
ssize_t railAllBeg = railGridOffset + part * chunkSize;
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank);
|
||||
ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank;
|
||||
ssize_t railOneEnd = railOneBeg + sizePerRank;
|
||||
@@ -280,7 +273,7 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
|
||||
tn = nWarps2*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (tid == 0) {
|
||||
int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
|
||||
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
|
||||
@@ -293,10 +286,10 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
/*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
|
||||
Scatterer</*BcastSendNotRecv=*/true> scat;
|
||||
scat.args = args;
|
||||
scat.work = work;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.process</*Recv=*/1, /*Send=*/1>(scat);
|
||||
prims.template process</*Recv=*/1, /*Send=*/1>(scat);
|
||||
}
|
||||
}
|
||||
return;
|
||||
@@ -311,10 +304,10 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
/*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
|
||||
Scatterer</*BcastSendNotRecv=*/false> scat;
|
||||
scat.args = args;
|
||||
scat.work = work;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.process</*Recv=*/1, /*Send=*/0>(scat);
|
||||
prims.template process</*Recv=*/1, /*Send=*/0>(scat);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
+143
-147
@@ -10,28 +10,27 @@
|
||||
|
||||
namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = (int)args->nWarps * WARP_SIZE;
|
||||
__device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
int ringIx = ring->index;
|
||||
ssize_t chunkCount = args->chunkCount;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
ssize_t gridOffset;
|
||||
ssize_t channelCount;
|
||||
ssize_t chunkCount;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
|
||||
const ssize_t loopCount = nranks * chunkCount;
|
||||
ssize_t offset;
|
||||
ssize_t gridOffset = args->workOffset;
|
||||
ssize_t channelCount = args->workCount;
|
||||
int nelem;
|
||||
int chunk;
|
||||
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
|
||||
(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);
|
||||
(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);
|
||||
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
ssize_t remCount = channelCount - elemOffset;
|
||||
ssize_t chunkOffset;
|
||||
|
||||
if (remCount < loopCount) chunkCount = args->lastChunkCount;
|
||||
if (remCount < loopCount) chunkCount = alignUp(divUp(remCount, nranks), 16/sizeof(T));
|
||||
|
||||
auto modRanks = [&]__device__(int r)->int {
|
||||
return r - (r >= nranks ? nranks : 0);
|
||||
@@ -75,24 +74,24 @@ namespace {
|
||||
chunkOffset = chunk * chunkCount;
|
||||
offset = gridOffset + elemOffset + chunkOffset;
|
||||
nelem = (int)min(chunkCount, remCount - chunkOffset);
|
||||
|
||||
prims.directRecv(offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __forceinline__ void runTreeUpDown(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = (int)args->nWarps * WARP_SIZE;
|
||||
__device__ __forceinline__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
ncclTree *tree = &ncclShmem.channel.tree;
|
||||
const size_t channelCount = args->workCount;
|
||||
const size_t gridOffset = args->workOffset;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
size_t gridOffset;
|
||||
size_t channelCount;
|
||||
size_t chunkCount;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
{ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_TREE_ARITY, 1>, /*Direct=*/0, Proto, 0> prims
|
||||
(tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg);
|
||||
(tid, nthreads, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg);
|
||||
if (tree->up == -1) {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
@@ -118,7 +117,7 @@ namespace {
|
||||
|
||||
{ // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_TREE_ARITY>, /*Direct=*/1, Proto, 0> prims
|
||||
(tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
|
||||
(tid, nthreads, &tree->up, tree->down, work->sendbuff, work->recvbuff, work->redOpArg);
|
||||
if (tree->up == -1) {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
@@ -144,16 +143,14 @@ namespace {
|
||||
}
|
||||
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __forceinline__ void runTreeSplit(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = (int)args->nWarps * WARP_SIZE;
|
||||
__device__ __forceinline__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
ncclTree *tree = &ncclShmem.channel.tree;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const size_t gridOffset = args->workOffset;
|
||||
const size_t channelCount = args->workCount;
|
||||
size_t gridOffset;
|
||||
size_t channelCount;
|
||||
size_t chunkCount;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
int nthreadsSplit;
|
||||
if (Proto::Id == NCCL_PROTO_SIMPLE) {
|
||||
nthreadsSplit = nthreads/2;
|
||||
@@ -167,7 +164,7 @@ namespace {
|
||||
if (tree->up == -1) {
|
||||
// Reduce and broadcast. Max number of recv is 2, max number of send is 2
|
||||
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_TREE_ARITY_TOP>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
|
||||
prims(tid, nthreads, tree->down, tree->down, work->sendbuff, work->recvbuff, work->redOpArg);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
@@ -184,7 +181,7 @@ namespace {
|
||||
* but the ctor above for tree roots would be DirectRecv=0 DirectSend=1.
|
||||
*/
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_TREE_ARITY, 1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth);
|
||||
prims(tid, nthreadsSplit, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth);
|
||||
if (tree->down[0] == -1) {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
@@ -203,8 +200,8 @@ namespace {
|
||||
else {
|
||||
// Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_TREE_ARITY>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, 1*Proto::MaxGroupWidth);
|
||||
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, 1*Proto::MaxGroupWidth);
|
||||
if (tree->down[0] == -1) {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
@@ -224,34 +221,33 @@ namespace {
|
||||
}
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS>;
|
||||
runRing<T, RedOp, Proto>(args);
|
||||
runRing<T, RedOp, Proto>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
#if CUDART_VERSION >= 11020 && CUDART_VERSION < 11040 && __CUDA_ARCH__ >= 800
|
||||
runTreeUpDown<T, RedOp, ProtoSimple<1, 1>>(args);
|
||||
runTreeUpDown<T, RedOp, ProtoSimple<1, 1>>(tid, nthreads, work);
|
||||
#else
|
||||
runTreeSplit<T, RedOp, ProtoSimple<1, 1>>(args);
|
||||
runTreeSplit<T, RedOp, ProtoSimple<1, 1>>(tid, nthreads, work);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
|
||||
static constexpr int COLLNET_COPY_THREADS = 96;
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
const int bid = ncclShmem.channelId - work->channelLo;
|
||||
const int nChannels = work->channelHi - work->channelLo + 1;
|
||||
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
|
||||
const ssize_t chunkSize = args->chunkCount;
|
||||
const ssize_t size = args->count;
|
||||
const ssize_t chunkSize = work->collnet.chunkCount;
|
||||
const ssize_t size = work->collnet.count;
|
||||
const ssize_t loopSize = nChannels*direct->nHeads*chunkSize;
|
||||
|
||||
const int hasUp = (direct->up[0] >= 0) ? 1 : 0;
|
||||
@@ -259,7 +255,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
const int nThreadsScatter = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 3*COLLNET_COPY_THREADS : 0);
|
||||
const int nThreadsGather = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0);
|
||||
const int nThreadsBcast = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 0 : 2*COLLNET_COPY_THREADS);
|
||||
const int nThreadsReduce = args->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
|
||||
const int nThreadsReduce = work->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
|
||||
const int tidStartBcast = nThreadsGather;
|
||||
const int tidStartScatter = tidStartBcast + nThreadsBcast;
|
||||
const int tidStartReduce = tidStartScatter + nThreadsScatter;
|
||||
@@ -269,12 +265,12 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) {
|
||||
// Scatter
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, args);
|
||||
prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, work);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
|
||||
int nelem = min(direct->nHeads*chunkSize, size-offset);
|
||||
if (args->regUsed) {
|
||||
if (work->regUsed) {
|
||||
prims.directScatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
|
||||
} else {
|
||||
prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
|
||||
@@ -284,12 +280,12 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
if (hasDn) {
|
||||
// Reduce, send to network
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, args);
|
||||
prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, work);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (args->regUsed) {
|
||||
if (work->regUsed) {
|
||||
prims.directRecvReduceSend(offset, nelem);
|
||||
} else {
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
@@ -297,7 +293,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
}
|
||||
} else {
|
||||
// Directly send to network
|
||||
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (tid == tidStartReduce) {
|
||||
int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
|
||||
@@ -305,8 +301,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
__syncwarp();
|
||||
} else {
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
|
||||
prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
@@ -317,8 +313,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
} else if (tid < tidStartBcast && hasUp) {
|
||||
// Gather
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, args);
|
||||
prims(tid, nThreadsGather, direct->up, NULL, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, work);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
|
||||
int nelem = min(direct->nHeads*chunkSize, size-offset);
|
||||
@@ -328,15 +324,15 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
if (hasDn) {
|
||||
// Recv from network, broadcast
|
||||
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, args);
|
||||
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true);
|
||||
}
|
||||
} else {
|
||||
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (tid == tidStartBcast) {
|
||||
int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
|
||||
@@ -345,8 +341,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
} else {
|
||||
// Recv from network (no post thread needed)
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0);
|
||||
prims(tid - tidStartBcast, nThreadsBcast, &direct->out, nullptr, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
@@ -359,18 +355,16 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
|
||||
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
|
||||
ssize_t chunkSize = args->chunkCount;
|
||||
const bool hasOut = nvls->out != -1;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
|
||||
const int bcastWarps = hasOut ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0;
|
||||
const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5);
|
||||
const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
|
||||
const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
|
||||
const int bcastWarps = hasOut ? (work->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0;
|
||||
const int reduceWarps = work->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5);
|
||||
const int scatterWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
|
||||
const int gatherWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
|
||||
|
||||
const int nThreadsScatter = scatterWarps*WARP_SIZE;
|
||||
const int nThreadsGather = gatherWarps*WARP_SIZE;
|
||||
@@ -381,35 +375,37 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
const int tidEndReduce = tidEndGather + nThreadsReduce;
|
||||
const int tidEndBcast = tidEndReduce + nThreadsBcast;
|
||||
|
||||
if (args->oneNode) {
|
||||
if (work->oneNode) {
|
||||
ssize_t gridOffset, channelCount, chunkSize;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkSize);
|
||||
const ssize_t loopCount = nvls->nHeads * chunkSize;
|
||||
const ssize_t channelCount = args->workCount;
|
||||
const ssize_t gridOffset = args->workOffset;
|
||||
ssize_t offset;
|
||||
int nelem;
|
||||
int remCount = channelCount%(nvls->nHeads*chunkSize);
|
||||
int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16/sizeof(T));
|
||||
|
||||
if (tid < tidEndScatter) {
|
||||
// Scatter
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL,
|
||||
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
|
||||
if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize;
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
|
||||
nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
|
||||
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndGather) {
|
||||
// Gather
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
|
||||
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff,
|
||||
work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
|
||||
if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize;
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
|
||||
nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
|
||||
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndReduce) {
|
||||
@@ -417,10 +413,10 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
ssize_t chunkOffset;
|
||||
if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
|
||||
if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize;
|
||||
chunkOffset = elemOffset + nvls->headRank * chunkSize;
|
||||
offset = gridOffset + chunkOffset;
|
||||
nelem = min(chunkSize, channelCount - chunkOffset);
|
||||
@@ -428,30 +424,32 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const int bid = args->bid;
|
||||
const ssize_t loopSize = args->nChannels * nvls->nHeads * chunkSize;
|
||||
const ssize_t size = args->count;
|
||||
const int bid = ncclShmem.channelId - work->channelLo;
|
||||
const int nChannels = work->channelHi - work->channelLo + 1;
|
||||
const ssize_t chunkSize = work->collnet.chunkCount;
|
||||
const ssize_t loopSize = nChannels * nvls->nHeads * chunkSize;
|
||||
const ssize_t size = work->collnet.count;
|
||||
|
||||
if (tid < tidEndScatter) {
|
||||
// Scatter
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL,
|
||||
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
|
||||
int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
|
||||
int nelem = work->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
|
||||
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndGather) {
|
||||
// Gather
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
|
||||
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff,
|
||||
work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
|
||||
int nelem = args->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset);
|
||||
int nelem = work->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset);
|
||||
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndReduce && nvls->headRank != -1) {
|
||||
@@ -460,7 +458,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
@@ -471,7 +469,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, args);
|
||||
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
@@ -483,7 +481,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
|
||||
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
@@ -495,25 +493,25 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
|
||||
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
|
||||
const int treeUp = nvls->treeUp;
|
||||
const int* treeDown = nvls->treeDown;
|
||||
ssize_t chunkCount = args->chunkCount;
|
||||
ssize_t gridOffset, channelCount, chunkCount;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
|
||||
const ssize_t loopCount = nvls->nHeads * chunkCount;
|
||||
const ssize_t channelCount = args->workCount;
|
||||
const ssize_t gridOffset = args->workOffset;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const bool hasUp = treeUp != -1;
|
||||
const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
|
||||
const int bcastWarps = hasUp ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 4) : 0;
|
||||
const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5);
|
||||
const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
|
||||
const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
|
||||
const int bcastWarps = hasUp ? (work->regUsed ? ((totalWarps - 2) >> 1) - 1 : 4) : 0;
|
||||
const int reduceWarps = work->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5);
|
||||
const int scatterWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
|
||||
const int gatherWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
|
||||
ssize_t offset;
|
||||
int nelem;
|
||||
int remCount = channelCount%(nvls->nHeads*chunkCount);
|
||||
int lastChunkCount = alignUp(divUp(remCount, nvls->nHeads), 16/sizeof(T));
|
||||
|
||||
const int nThreadsScatter = scatterWarps*WARP_SIZE;
|
||||
const int nThreadsGather = gatherWarps*WARP_SIZE;
|
||||
@@ -528,24 +526,24 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
|
||||
// Scatter
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL,
|
||||
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount;
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
|
||||
nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
|
||||
prims.scatter(offset, nelem, chunkCount, chunkCount, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndGather) {
|
||||
// Gather
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
|
||||
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff,
|
||||
work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount;
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
|
||||
nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
|
||||
prims.gather(offset, nelem, chunkCount, chunkCount, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndReduce && nvls->headRank != -1) {
|
||||
@@ -554,10 +552,10 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<3>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
ssize_t chunkOffset;
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount;
|
||||
chunkOffset = elemOffset + nvls->headRank * chunkCount;
|
||||
offset = gridOffset + chunkOffset;
|
||||
nelem = min(chunkCount, channelCount - chunkOffset);
|
||||
@@ -568,10 +566,10 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
|
||||
Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
ssize_t chunkOffset;
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount;
|
||||
chunkOffset = elemOffset + nvls->headRank * chunkCount;
|
||||
offset = gridOffset + chunkOffset;
|
||||
nelem = min(chunkCount, channelCount - chunkOffset);
|
||||
@@ -583,10 +581,10 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
|
||||
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
ssize_t chunkOffset;
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount;
|
||||
chunkOffset = elemOffset + nvls->headRank * chunkCount;
|
||||
offset = gridOffset + chunkOffset;
|
||||
nelem = min(chunkCount, channelCount - chunkOffset);
|
||||
@@ -597,17 +595,15 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
const int bid = ncclShmem.channelId - work->channelLo;
|
||||
const int nChannels = work->channelHi - work->channelLo + 1;
|
||||
ncclTree *tree = &ncclShmem.channel.collnetChain;
|
||||
ssize_t chunkSize = args->chunkCount;
|
||||
ssize_t chunkSize = work->collnet.chunkCount;
|
||||
const ssize_t loopSize = int(nChannels*chunkSize);
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const ssize_t size = args->count;
|
||||
const ssize_t size = work->collnet.count;
|
||||
|
||||
int nthreadsSplit = nthreads/2;
|
||||
if (nthreadsSplit >= 256) nthreadsSplit += 64;
|
||||
@@ -634,7 +630,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
|
||||
|
||||
if (tid < nthreadsSplit) {
|
||||
if (recv == -1) {
|
||||
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (groupTid == 0) {
|
||||
int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::sendPeerNotify(send, connIndex, steps);
|
||||
@@ -642,8 +638,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
|
||||
__syncwarp();
|
||||
} else {
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * int(chunkSize);
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
@@ -652,8 +648,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
|
||||
}
|
||||
} else {
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * int(chunkSize);
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
@@ -665,7 +661,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
|
||||
if (recv == nranks) {
|
||||
// I'm the first in the broadcast chain, I need to perform the division (postOp)
|
||||
if (send == -1) {
|
||||
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (groupTid == 0) {
|
||||
int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::recvPeerNotify(recv, connIndex, steps);
|
||||
@@ -673,8 +669,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
|
||||
__syncwarp();
|
||||
} else {
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * int(chunkSize);
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
@@ -683,8 +679,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
|
||||
}
|
||||
} else {
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * int(chunkSize);
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
@@ -693,8 +689,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
|
||||
}
|
||||
} else {
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
if (send == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
@@ -714,29 +710,29 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL>(args);
|
||||
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runTreeSplit<T, RedOp, ProtoLL>(args);
|
||||
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
runTreeSplit<T, RedOp, ProtoLL>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL128>(args);
|
||||
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL128> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runTreeSplit<T, RedOp, ProtoLL128>(args);
|
||||
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL128> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
runTreeSplit<T, RedOp, ProtoLL128>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
+18
-19
@@ -10,23 +10,22 @@
|
||||
|
||||
namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = (int)args->nWarps * WARP_SIZE;
|
||||
__device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
const int rank = ring->userRanks[0];
|
||||
const int nextRank = ring->userRanks[1];
|
||||
const int root = args->root;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const size_t channelCount = args->workCount;
|
||||
const size_t gridOffset = args->workOffset;
|
||||
const int root = work->root;
|
||||
size_t chunkCount;
|
||||
size_t channelCount;
|
||||
size_t gridOffset;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
T *inputBuf = (T*)args->sendbuff;
|
||||
T *outputBuf = (T*)args->recvbuff;
|
||||
T *inputBuf = (T*)work->sendbuff;
|
||||
T *outputBuf = (T*)work->recvbuff;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg);
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg);
|
||||
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
@@ -48,23 +47,23 @@ namespace {
|
||||
}
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
struct RunWorkColl<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
using Proto = ProtoSimple<BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS>;
|
||||
runRing<T, RedOp, Proto>(args);
|
||||
runRing<T, RedOp, Proto>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL>(args);
|
||||
struct RunWorkColl<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL128>(args);
|
||||
struct RunWorkColl<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -14,11 +14,11 @@ __shared__ ncclShmemData ncclShmem;
|
||||
#endif
|
||||
|
||||
struct RunWorkNop {
|
||||
__device__ void run(ncclWork *w) {}
|
||||
__device__ void run() {}
|
||||
};
|
||||
|
||||
__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) {
|
||||
ncclKernelMain<-1, RunWorkNop>(comm, channelMask, workHead);
|
||||
__global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
|
||||
ncclKernelMain<-1, RunWorkNop>(&args4K.args);
|
||||
}
|
||||
|
||||
__device__ void ncclDevFunc_Nop() {}
|
||||
|
||||
+288
-118
@@ -10,10 +10,19 @@
|
||||
#include "collectives.h"
|
||||
#include "device.h"
|
||||
#include "op128.h"
|
||||
#include "reduce_kernel.h"
|
||||
#include "network/unpack/unpack_defs.h"
|
||||
|
||||
#define COLL_UNROLL (ncclCollUnroll())
|
||||
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
// __grid_constant__ appears to break cuda-gdb
|
||||
//#define NCCL_GRID_CONSTANT __grid_constant__
|
||||
#define NCCL_GRID_CONSTANT
|
||||
#else
|
||||
#define NCCL_GRID_CONSTANT
|
||||
#endif
|
||||
|
||||
typedef void(*ncclDevFuncPtr_t)();
|
||||
extern __device__ ncclDevFuncPtr_t const ncclDevFuncTable[];
|
||||
|
||||
@@ -31,18 +40,28 @@ struct ncclShmemGroup {
|
||||
};
|
||||
|
||||
struct ncclShmemData {
|
||||
struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
|
||||
uint64_t redOpArgs[NCCL_MAX_ARITY+1];
|
||||
struct ncclDevKernelArgs args;
|
||||
int channelId;
|
||||
int aborted;
|
||||
alignas(16) struct ncclDevComm comm;
|
||||
alignas(16) struct ncclDevChannel channel;
|
||||
alignas(16) struct ncclWork work;
|
||||
|
||||
int batchIx, nextBatchIx;
|
||||
enum ncclDevWorkType workType;
|
||||
uint8_t directMode;
|
||||
uint16_t funcId;
|
||||
int nWorks;
|
||||
int workSize;
|
||||
uint32_t workConsumed;
|
||||
struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
|
||||
uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1];
|
||||
|
||||
alignas(16) char workStorage[1024];
|
||||
|
||||
alignas(16) union {
|
||||
unpackShmem unpack;
|
||||
} devicePlugin;
|
||||
};
|
||||
static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");
|
||||
|
||||
extern __shared__ ncclShmemData ncclShmem;
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
@@ -55,14 +74,62 @@ __device__ inline void* ncclScratchForWarp(int warp) {
|
||||
return (char*)ncclShmemPerWarp + warp*ncclShmemScratchWarpSize();
|
||||
}
|
||||
|
||||
__device__ inline bool barrierReduceAny(int bit) {
|
||||
uint32_t popc;
|
||||
asm ("{"
|
||||
".reg .pred barr_pred;"
|
||||
"setp.eq.u32 barr_pred, %1, 1;"
|
||||
"bar.red.popc.u32 %0, 2, barr_pred;"
|
||||
"}" : "=r"(popc) : "r"(bit));
|
||||
return popc != 0;
|
||||
// Waits on the named barrier `name` by executing the PTX instruction
// `barrier.sync.aligned`. The "memory" clobber tells the compiler that the asm
// statement may touch memory, so memory accesses are not reordered across it.
// The non-aligned `barrier.sync` form is kept under `#if 0` and is compiled
// out, so this currently emits the same instruction as
// barrier_sync_aligned(name).
__device__ inline void barrier_sync(int name) {
|
||||
#if 0
|
||||
asm volatile("barrier.sync %0;" :: "r"(name) : "memory");
|
||||
#else
|
||||
asm volatile("barrier.sync.aligned %0;" :: "r"(name) : "memory");
|
||||
#endif
|
||||
}
|
||||
// Waits on the named barrier `name` by executing the PTX instruction
// `barrier.sync.aligned` with `nThreads` passed as its second (thread-count)
// operand. The "memory" clobber tells the compiler that the asm statement may
// touch memory, so memory accesses are not reordered across it.
// The non-aligned `barrier.sync` form is kept under `#if 0` and is compiled
// out, so this currently emits the same instruction as
// barrier_sync_aligned(name, nThreads).
__device__ inline void barrier_sync(int name, int nThreads) {
|
||||
#if 0
|
||||
asm volatile("barrier.sync %0, %1;" :: "r"(name), "r"(nThreads) : "memory");
|
||||
#else
|
||||
asm volatile("barrier.sync.aligned %0, %1;" :: "r"(name), "r"(nThreads) : "memory");
|
||||
#endif
|
||||
}
|
||||
// Waits on the named barrier `name` by executing the PTX instruction
// `barrier.sync.aligned`. The "memory" clobber tells the compiler that the asm
// statement may touch memory.
__device__ inline void barrier_sync_aligned(int name) {
|
||||
asm volatile("barrier.sync.aligned %0;" :: "r"(name) : "memory");
|
||||
}
|
||||
// Waits on the named barrier `name` by executing the PTX instruction
// `barrier.sync.aligned`, passing `nThreads` as the thread-count operand.
// The "memory" clobber tells the compiler that the asm statement may touch
// memory.
__device__ inline void barrier_sync_aligned(int name, int nThreads) {
|
||||
asm volatile("barrier.sync.aligned %0, %1;" :: "r"(name), "r"(nThreads) : "memory");
|
||||
}
|
||||
|
||||
// Barrier with an OR-reduction over a per-thread predicate.
// The asm converts `vote` into a predicate (vote != 0), executes PTX
// `barrier.red.or.pred` on barrier `name`, and materializes the reduced
// predicate as 1 or 0 in `ans`, which is returned as a bool.
// NOTE(review): unlike the barrier_sync helpers, this asm statement is not
// marked `volatile`; confirm the compiler cannot drop or merge it when the
// result is unused.
__device__ inline bool barrier_red_or(bool vote, int name) {
|
||||
int ans;
|
||||
asm("{ .reg .pred p;"
|
||||
" setp.ne.s32 p, %1, 0;"
|
||||
" barrier.red.or.pred p, %2, p; "
|
||||
" selp.s32 %0, 1, 0, p; }"
|
||||
: "=r"(ans) : "r"((int)vote), "r"(name) : "memory");
|
||||
return bool(ans);
|
||||
}
|
||||
// Barrier with an OR-reduction over a per-thread predicate, with an explicit
// thread count.
// The asm converts `vote` into a predicate (vote != 0), executes PTX
// `barrier.red.or.pred` on barrier `name` with `nThreads` as the thread-count
// operand, and materializes the reduced predicate as 1 or 0 in `ans`, which
// is returned as a bool.
// NOTE(review): this asm statement is not marked `volatile`; confirm the
// compiler cannot drop or merge it when the result is unused.
__device__ inline bool barrier_red_or(bool vote, int name, int nThreads) {
|
||||
int ans;
|
||||
asm("{ .reg .pred p;"
|
||||
" setp.ne.s32 p, %1, 0;"
|
||||
" barrier.red.or.pred p, %2, %3, p; "
|
||||
" selp.s32 %0, 1, 0, p; }"
|
||||
: "=r"(ans) : "r"((int)vote), "r"(name), "r"(nThreads) : "memory");
|
||||
return bool(ans);
|
||||
}
|
||||
// Same sequence as barrier_red_or(vote, name) but using the `.aligned` form
// of the instruction: converts `vote` into a predicate (vote != 0), executes
// PTX `barrier.red.or.pred.aligned` on barrier `name`, and returns the reduced
// predicate as a bool.
// NOTE(review): this asm statement is not marked `volatile`; confirm the
// compiler cannot drop or merge it when the result is unused.
__device__ inline bool barrier_red_or_aligned(bool vote, int name) {
|
||||
int ans;
|
||||
asm("{ .reg .pred p;"
|
||||
" setp.ne.s32 p, %1, 0;"
|
||||
" barrier.red.or.pred.aligned p, %2, p; "
|
||||
" selp.s32 %0, 1, 0, p; }"
|
||||
: "=r"(ans) : "r"((int)vote), "r"(name) : "memory");
|
||||
return bool(ans);
|
||||
}
|
||||
// Same sequence as barrier_red_or(vote, name, nThreads) but using the
// `.aligned` form of the instruction: converts `vote` into a predicate
// (vote != 0), executes PTX `barrier.red.or.pred.aligned` on barrier `name`
// with `nThreads` as the thread-count operand, and returns the reduced
// predicate as a bool.
// NOTE(review): this asm statement is not marked `volatile`; confirm the
// compiler cannot drop or merge it when the result is unused.
__device__ inline bool barrier_red_or_aligned(bool vote, int name, int nThreads) {
|
||||
int ans;
|
||||
asm("{ .reg .pred p;"
|
||||
" setp.ne.s32 p, %1, 0;"
|
||||
" barrier.red.or.pred.aligned p, %2, %3, p; "
|
||||
" selp.s32 %0, 1, 0, p; }"
|
||||
: "=r"(ans) : "r"((int)vote), "r"(name), "r"(nThreads) : "memory");
|
||||
return bool(ans);
|
||||
}
|
||||
|
||||
// Copy 16-byte aligned data. You must call with at least `(bytes+15)/16` threads.
|
||||
@@ -71,158 +138,261 @@ inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int by
|
||||
if (offset < bytes) {
|
||||
uint64_t a=0, b=0;
|
||||
asm("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset));
|
||||
asm volatile("st.v2.u64 [%0],{%1,%2};" :: "l"((char*)dst + offset), "l"(a), "l"(b));
|
||||
uint32_t udst = (uint32_t)__cvta_generic_to_shared(dst);
|
||||
asm volatile("st.shared.v2.u64 [%0],{%1,%2};" :: "r"(udst + offset), "l"(a), "l"(b));
|
||||
}
|
||||
}
|
||||
|
||||
// Must run with at least 64 threads
|
||||
__device__ __forceinline__ void loadWorkBatchToShmem(
|
||||
int tid, int tn, struct ncclDevKernelArgs const* args, int batchIx
|
||||
) {
|
||||
int lane = tid%WARP_SIZE;
|
||||
int workCursor = 0; // num works written in previous loop iterations.
|
||||
while (true) {
|
||||
struct ncclDevWorkBatch batch = ((struct ncclDevWorkBatch*)(args+1))[batchIx];
|
||||
|
||||
// fnsOfBitset[n] = index of n'th set bit in batch.offsetBitset.
|
||||
// PTX has instruction "fns" (find n-th set) but it expands to a lot of SASS,
|
||||
// since we know all lanes will be querying the same bitmask we can compute
|
||||
// much faster using shared memory.
|
||||
uint8_t* fnsOfBitset = (uint8_t*)ncclScratchForWarp(threadIdx.x/WARP_SIZE);
|
||||
__syncwarp();
|
||||
if (uint32_t(batch.offsetBitset) & (1u<<lane)) {
|
||||
int nWorksBelow = __popc(uint32_t(batch.offsetBitset) & ((1u<<lane)-1));
|
||||
fnsOfBitset[nWorksBelow] = lane;
|
||||
}
|
||||
int nWorksLow32 = __popc(uint32_t(batch.offsetBitset)); // just of low 32 bits
|
||||
if (uint32_t(batch.offsetBitset>>32) & (1u<<lane)) {
|
||||
int nWorksBelow = nWorksLow32;
|
||||
nWorksBelow += __popc(uint32_t(batch.offsetBitset>>32) & ((1u<<lane)-1));
|
||||
fnsOfBitset[nWorksBelow] = 32 + lane;
|
||||
}
|
||||
int nWorks = nWorksLow32 + __popc(uint32_t(batch.offsetBitset>>32)); // add high 32 bits
|
||||
__syncwarp();
|
||||
|
||||
int workSize;
|
||||
int nPacks; // total number of packs loaded, each pack is 16 bytes
|
||||
int packInWork; // my pack index within work struct
|
||||
int dstWork; // my work index in contiguous destination shmem
|
||||
switch (batch.workType) {
|
||||
case (int)ncclDevWorkTypeP2p:
|
||||
workSize = sizeof(struct ncclDevWorkP2p);
|
||||
nPacks = nWorks*(workSize/16);
|
||||
packInWork = tid%(workSize/16);
|
||||
dstWork = tid/(workSize/16);
|
||||
break;
|
||||
case (int)ncclDevWorkTypeColl:
|
||||
workSize = sizeof(struct ncclDevWorkColl);
|
||||
nPacks = nWorks*(workSize/16);
|
||||
packInWork = tid%(workSize/16);
|
||||
dstWork = tid/(workSize/16);
|
||||
break;
|
||||
case (int)ncclDevWorkTypeCollReg:
|
||||
default:
|
||||
workSize = sizeof(struct ncclDevWorkCollReg);
|
||||
nPacks = nWorks*(workSize/16);
|
||||
packInWork = tid%(workSize/16);
|
||||
dstWork = tid/(workSize/16);
|
||||
break;
|
||||
}
|
||||
if (tid == 0) {
|
||||
ncclShmem.workSize = workSize;
|
||||
ncclShmem.workConsumed = batch.offsetBase + (64-__clzll(batch.offsetBitset))*workSize;
|
||||
}
|
||||
// We deliberately replicate these div and mod calculations into the case
|
||||
// blocks above so that they get constant divisor optimizations by the compiler.
|
||||
// packInWork = tid%(workSize/16);
|
||||
// dstWork = tid/(workSize/16);
|
||||
|
||||
// We can only assume we have 64 threads, which means we can read at most 1024 bytes
|
||||
// here which is the per batch maximum.
|
||||
if (tid < nPacks) {
|
||||
int srcWork = fnsOfBitset[dstWork]; // find n'th set bit in batch.offsetBitset
|
||||
ulong2 tmp;
|
||||
// The loads done in these two cases must be kept separate since we are
|
||||
// relying on the compiler to use "ld.param" in the first one. The parameter
|
||||
// space is not generically addressable, so any attempt to load through
|
||||
// a pointer that *might* be parameter space backed will cause the
|
||||
// compiler to spill the parameter struct (4K!) to each thread's local space
|
||||
// before creating a pointer (to the spill) and decimate perf.
|
||||
//
|
||||
// An example of what not to do would be the following:
|
||||
//
|
||||
// if (condition) {
|
||||
// // The compiler could spill parameter_variable to local space and take
|
||||
// // the address of that, since when src is loaded below it could also
|
||||
// // be global space.
|
||||
// src = &parameter_variable;
|
||||
// } else {
|
||||
// src = &global_variable;
|
||||
// }
|
||||
// memcpy(dst, src, n);
|
||||
if (ncclShmem.args.workStorageType == ncclDevWorkStorageTypeArgs) {
|
||||
char* src = (char*)args + (batch.offsetBase + srcWork*workSize + packInWork*16);
|
||||
tmp = *(ulong2*)src; // becomes ld.param.v2.u64
|
||||
} else {
|
||||
char* src = (char*)ncclShmem.args.workBuf + ((batch.offsetBase + srcWork*workSize + packInWork*16) & ncclShmem.args.workMask);
|
||||
tmp = *(ulong2*)src; // becomes ld.v2.u64
|
||||
}
|
||||
char* dst = ncclShmem.workStorage;
|
||||
dst += (workCursor + dstWork)*workSize + packInWork*16;
|
||||
*(ulong2*)dst = tmp;
|
||||
}
|
||||
workCursor += nWorks;
|
||||
|
||||
if (batch.nextExtends) {
|
||||
batchIx += batch.nextJump;
|
||||
tid -= 64; // Rotate threads so we use the next two warps for next batch struct.
|
||||
if (tid < 0) tid += tn;
|
||||
} else {
|
||||
if (tid == 0) {
|
||||
ncclShmem.batchIx = batchIx;
|
||||
ncclShmem.nextBatchIx = (batch.nextJump == 0) ? -1 : batchIx + batch.nextJump;
|
||||
ncclShmem.workType = (enum ncclDevWorkType)batch.workType;
|
||||
ncclShmem.nWorks = workCursor;
|
||||
ncclShmem.funcId = batch.funcId;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
|
||||
struct RunWorkElement {
|
||||
__device__ void run(ncclWorkElem*) {
|
||||
struct RunWorkColl {
|
||||
__device__ void run(int tid, int tn, struct ncclDevWorkColl* work) {
|
||||
// Put NOT IMPLEMENTED behavior here.
|
||||
}
|
||||
};
|
||||
|
||||
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
|
||||
struct RunWork {
|
||||
struct RunWorkBatch;
|
||||
|
||||
// Specialized for P2p in sendrecv.h
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE>;
|
||||
|
||||
// Specialized here for non-P2p (Coll and CollReg)
|
||||
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
|
||||
struct RunWorkBatch {
|
||||
// This __forceinline__ is necessary. The compiler was inserting a function call
|
||||
// here from the LL ncclKernel.
|
||||
__device__ __forceinline__ void run(ncclWork *w) {
|
||||
int wid = threadIdx.x / WARP_SIZE;
|
||||
ncclWorkElem* we = w->header.type == ncclWorkTypeRegColl ? &w->regElems[0].elem : &w->elems[0];
|
||||
int stride = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) : sizeof(ncclWorkElem);
|
||||
#pragma unroll 1
|
||||
while ((char*)we + stride <= (char*)(w+1) && we->isUsed) {
|
||||
if (wid < we->nWarps) {
|
||||
RunWorkElement<Fn, T, RedOp, Algo, Proto>().run(we);
|
||||
__device__ __forceinline__ void run() {
|
||||
int tid = threadIdx.x;
|
||||
int tn = blockDim.x;
|
||||
|
||||
if (RedOpArg<RedOp>::ArgUsed) {
|
||||
int nWorks = ncclShmem.nWorks;
|
||||
for (int w=tid; w < nWorks; w += tn) {
|
||||
struct ncclDevWorkColl* work = (ncclDevWorkColl*)(ncclShmem.workStorage + w*ncclShmem.workSize);
|
||||
if (work->redOpArgIsPtr) {
|
||||
work->redOpArg = RedOpArg<RedOp>::loadArg(reinterpret_cast<void*>(work->redOpArg));
|
||||
}
|
||||
}
|
||||
we = (ncclWorkElem*)((char*)we + stride);
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
#pragma unroll 1
|
||||
for (int w=0; w < ncclShmem.nWorks; w++) {
|
||||
struct ncclDevWorkColl* work = (struct ncclDevWorkColl*)(ncclShmem.workStorage + w*ncclShmem.workSize);
|
||||
if (w != 0) {
|
||||
struct ncclDevWorkColl* workPrev = (struct ncclDevWorkColl*)(ncclShmem.workStorage + (w-1)*ncclShmem.workSize);
|
||||
if (work->nWarps != workPrev->nWarps) __syncthreads();
|
||||
}
|
||||
int subtn = work->nWarps*WARP_SIZE;
|
||||
if (tid < subtn) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid, subtn, work);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
|
||||
if (we->isUsed && we->redOpArgIsPtr) {
|
||||
/* redOpArg is a pointer to the scalar value, so we'll dereference it
|
||||
* here so that redOpArg holds the bits of the scalar going forward.
|
||||
* The tricky thing is we don't know its type T since that's encoded in
|
||||
* the funcIndex. Because it would be difficult to get sizeof(T) from
|
||||
* funcIndex, we'll cheat and just dereference the largest possible size
|
||||
* given the alignment of the pointer. We might be reading in more bytes
|
||||
* than we need but that's harmless.
|
||||
*/
|
||||
if (we->redOpArg%2 != 0)
|
||||
we->redOpArg = *reinterpret_cast<uint8_t*>(we->redOpArg);
|
||||
else if (we->redOpArg%4 != 0)
|
||||
we->redOpArg = *reinterpret_cast<uint16_t*>(we->redOpArg);
|
||||
else if (we->redOpArg%8 != 0)
|
||||
we->redOpArg = *reinterpret_cast<uint32_t*>(we->redOpArg);
|
||||
else
|
||||
we->redOpArg = *reinterpret_cast<uint64_t*>(we->redOpArg);
|
||||
}
|
||||
}
|
||||
|
||||
template<int SpecializedFnId, typename SpecializedRunWork>
|
||||
__device__ void ncclKernelMain(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) {
|
||||
template<int SpecializedFnId, typename SpecializedRunWorkBatch>
|
||||
__device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* args) {
|
||||
int tid = threadIdx.x;
|
||||
int tn = blockDim.x;
|
||||
|
||||
// Copy kernel args to shmem and then only read those. Otherwise the compiler
|
||||
// will end up putting the args into thread local stack which is very wasteful.
|
||||
if (tid < sizeof(ncclDevKernelArgs)/sizeof(uint32_t)) {
|
||||
((uint32_t*)&ncclShmem.args)[tid] = ((uint32_t*)args)[tid];
|
||||
}
|
||||
|
||||
// To map blockId to channelId, we need the n'th set bit of channelMask which
|
||||
// is the inverse of counting the number of set bits among the first n.
|
||||
if (tid < WARP_SIZE) {
|
||||
int x = tid;
|
||||
if (channelMask & (1ull<<x)) {
|
||||
int y = __popcll(channelMask & ((1ull<<x)-1));
|
||||
if (blockIdx.x == y) ncclShmem.channelId = x;
|
||||
}
|
||||
if (32 < MAXCHANNELS) {
|
||||
x = 32 + tid;
|
||||
if (channelMask & (1ull<<x)) {
|
||||
int y = __popcll(channelMask & ((1ull<<x)-1));
|
||||
if (blockIdx.x == y) ncclShmem.channelId = x;
|
||||
}
|
||||
}
|
||||
// PTX has the fns instruction which does this but is extremely slow. We can
|
||||
// do better when we know all threads are querying the same bitmask.
|
||||
if (tid < MAXCHANNELS && (args->channelMask & (1ull<<tid))) {
|
||||
int n = __popcll(args->channelMask & ((1ull<<tid)-1));
|
||||
if (blockIdx.x == n) ncclShmem.channelId = tid;
|
||||
}
|
||||
__syncthreads(); // publish ncclShmem.channelId
|
||||
int channelId = ncclShmem.channelId;
|
||||
__syncthreads(); // publish ncclShmem.{args, channelId}
|
||||
/* set abort flag to 0 */
|
||||
if (tid == 0) ncclShmem.aborted = 0;
|
||||
|
||||
if (true) {
|
||||
void *dst, *src;
|
||||
int bytes;
|
||||
// Use first 3 warps to load comm, channel, and work into ncclShmem
|
||||
switch (tid/WARP_SIZE) {
|
||||
case 0:
|
||||
dst = &ncclShmem.comm;
|
||||
src = comm;
|
||||
bytes = sizeof(ncclDevComm);
|
||||
// Use first 2 warps to load comm and channel, and the remaining warps to load the work batch.
|
||||
switch (tid/WARP_SIZE) {
|
||||
case 0:
|
||||
{ void* dst = &ncclShmem.comm;
|
||||
void* src = ncclShmem.args.comm;
|
||||
int bytes = sizeof(ncclDevComm);
|
||||
static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn.");
|
||||
break;
|
||||
case 1:
|
||||
// Get address of channel without incurring indirect load from ncclDevComm::channels
|
||||
dst = &ncclShmem.channel;
|
||||
src = &((ncclDevCommAndChannels*)comm)->channels[channelId];
|
||||
bytes = sizeof(ncclDevChannel);
|
||||
copyToShmem16(tid, dst, src, bytes);
|
||||
} break;
|
||||
case 1:
|
||||
{ // Get address of channel without incurring indirect load from ncclDevComm::channels
|
||||
void* dst = &ncclShmem.channel;
|
||||
void* src = &((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId];
|
||||
int bytes = sizeof(ncclDevChannel);
|
||||
static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
|
||||
break;
|
||||
case 2:
|
||||
dst = &ncclShmem.work;
|
||||
src = workHead + blockIdx.x;
|
||||
bytes = sizeof(ncclWork);
|
||||
static_assert(sizeof(ncclWork) <= 16*WARP_SIZE, "ncclWork cannot be loaded by a single warp in one insn.");
|
||||
break;
|
||||
default:
|
||||
bytes = 0;
|
||||
break;
|
||||
}
|
||||
if (bytes) copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
|
||||
copyToShmem16(tid-WARP_SIZE, dst, src, bytes);
|
||||
} break;
|
||||
default:
|
||||
{ int subtid = tid - 2*WARP_SIZE;
|
||||
int subtn = tn - 2*WARP_SIZE;
|
||||
loadWorkBatchToShmem(subtid, subtn, args, /*batchIx=*/blockIdx.x);
|
||||
} break;
|
||||
}
|
||||
__syncthreads(); // publish ncclShmem
|
||||
|
||||
if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) {
|
||||
// ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads()
|
||||
ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed;
|
||||
}
|
||||
|
||||
while (true) {
|
||||
// Notify host that all fifo reads are complete.
|
||||
if (tid == 0 && ncclShmem.work.header.isLast && ncclShmem.work.header.inFifo) {
|
||||
*ncclShmem.channel.workFifoDone = ncclShmem.work.header.doneAcks;
|
||||
}
|
||||
|
||||
__syncwarp();
|
||||
if (ncclShmem.work.header.type == ncclWorkTypeColl) {
|
||||
if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem.work.elems[tid]);
|
||||
} else if (ncclShmem.work.header.type == ncclWorkTypeRegColl) {
|
||||
if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&ncclShmem.work.regElems[tid].elem);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (0 <= SpecializedFnId && ncclShmem.work.header.funcIndex == (unsigned)SpecializedFnId) {
|
||||
SpecializedRunWork().run(&ncclShmem.work);
|
||||
if (0 <= SpecializedFnId && ncclShmem.funcId == (unsigned)SpecializedFnId) {
|
||||
SpecializedRunWorkBatch().run();
|
||||
} else {
|
||||
ncclDevFuncTable[ncclShmem.work.header.funcIndex]();
|
||||
ncclDevFuncTable[ncclShmem.funcId]();
|
||||
}
|
||||
|
||||
int workIxNext = ncclShmem.work.header.workNext;
|
||||
if (ncclShmem.nextBatchIx == -1) break;
|
||||
int batchIx = ncclShmem.nextBatchIx;
|
||||
__syncthreads();
|
||||
if (ncclShmem.work.header.isLast) break;
|
||||
loadWorkBatchToShmem(tid, tn, args, batchIx);
|
||||
|
||||
copyToShmem16(tid, &ncclShmem.work, workHead + workIxNext, sizeof(ncclWork));
|
||||
|
||||
{ // Check whether the last operation was aborted and make sure all threads exit
|
||||
int aborted = tid == 0 ? *comm->abortFlag : 0;
|
||||
if (barrierReduceAny(aborted)) // publish ncclShmem.work
|
||||
break;
|
||||
// Check whether the last operation was aborted and make sure all threads exit
|
||||
bool aborted = false;
|
||||
if (tid == 0) aborted = *ncclShmem.comm.abortFlag;
|
||||
aborted = barrier_red_or_aligned(aborted, 0); // publish ncclShmem.work
|
||||
if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) {
|
||||
// ncclShmem.workConsumed written by loadWorkBatchToShmem before barrier_red_or()
|
||||
ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed;
|
||||
}
|
||||
if (aborted) break;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
|
||||
__global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
|
||||
__device__ void ncclDevFunc_Nop();
|
||||
|
||||
#define DEFINE_ncclDevKernel(suffix, coll, redop, ty, algo, proto, specializedFnId) \
|
||||
__global__ void ncclDevKernel_##suffix(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
|
||||
ncclKernelMain<specializedFnId, RunWork<coll, ty, redop<ty>, algo, proto>>(comm, channelMask, workHead); \
|
||||
__global__ void ncclDevKernel_##suffix(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { \
|
||||
ncclKernelMain<specializedFnId, RunWorkBatch<coll, ty, redop<ty>, algo, proto>>(&args4K.args); \
|
||||
}
|
||||
|
||||
#define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto) \
|
||||
__device__ void ncclDevFunc_##suffix() { \
|
||||
RunWork<coll, ty, redop<ty>, algo, proto>().run(&ncclShmem.work); \
|
||||
RunWorkBatch<coll, ty, redop<ty>, algo, proto>().run(); \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -233,6 +233,8 @@ with open(os.path.join(gensrc, "host_table.cc"), "w") as f:
|
||||
out('#include "device.h"\n')
|
||||
out("\n")
|
||||
|
||||
out("extern int const ncclDevFuncIdCount = %d;\n" % len(primary_funcs))
|
||||
|
||||
# The mapping from function rows to valid primary function ids.
|
||||
out("extern int const ncclDevFuncRowToId[] = {\n")
|
||||
index = 0
|
||||
@@ -251,7 +253,7 @@ with open(os.path.join(gensrc, "host_table.cc"), "w") as f:
|
||||
cudart, _ = required_cuda(*kfn)
|
||||
sym = paste("_", "ncclDevKernel", *kfn)
|
||||
if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
|
||||
out("__global__ void %s(struct ncclDevComm*, uint64_t, struct ncclWork*);\n" % sym)
|
||||
out("__global__ void %s(ncclDevKernelArgs4K const);\n" % sym)
|
||||
if cudart != 0: out("#endif\n")
|
||||
out("\n")
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#include "unpack_defs.h"
|
||||
|
||||
#include "op128.h"
|
||||
#include "align.h"
|
||||
#include "bitops.h"
|
||||
#include "device.h"
|
||||
#include "common.h"
|
||||
|
||||
@@ -35,16 +35,16 @@ inline __device__ void ncclNetDeviceUnpackSetup(void* ohandle, const int group,
|
||||
struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle;
|
||||
ncclShmem.groups[group].devicePlugin.unpack.g_meta[index] = handle->meta;
|
||||
ncclShmem.devicePlugin.unpack.bounce_buf = handle->bounce_buf;
|
||||
ncclShmem.groups[group].devicePlugin.unpack.head = handle->head;
|
||||
ncclShmem.groups[group].devicePlugin.unpack.head[index] = handle->head;
|
||||
}
|
||||
|
||||
inline __device__ void ncclNetDeviceIncrementHead(const int group) {
|
||||
ncclShmem.groups[group].devicePlugin.unpack.head++;
|
||||
inline __device__ void ncclNetDeviceIncrementHead(const int group, const int index) {
|
||||
ncclShmem.groups[group].devicePlugin.unpack.head[index]++;
|
||||
}
|
||||
|
||||
inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group) {
|
||||
inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group, const int index) {
|
||||
struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle;
|
||||
handle->head = ncclShmem.groups[group].devicePlugin.unpack.head;
|
||||
handle->head = ncclShmem.groups[group].devicePlugin.unpack.head[index];
|
||||
}
|
||||
|
||||
template <uint8_t sz>
|
||||
@@ -183,7 +183,7 @@ inline __device__ void ncclNetDeviceUnpack</*Recv=*/1>(
|
||||
// Pack data from the internal iovec to the supplied flat srcs buffer using all the threads
|
||||
// + Src is necessary in the case of accessing the user buffer directly
|
||||
ncclNetDeviceUnpackInner(tid, tidInBlock, nworkers, group /* in case they need to use split warps shared memory partitioning*/,
|
||||
ix, ncclShmem.groups[group].srcs[ix + Src], workSize, ncclShmem.groups[group].devicePlugin.unpack.head);
|
||||
ix, ncclShmem.groups[group].srcs[ix + Src], workSize, ncclShmem.groups[group].devicePlugin.unpack.head[ix]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -54,7 +54,7 @@ struct unpackShmem {
|
||||
|
||||
struct unpackGroupShmem {
|
||||
int unpackNetDeviceIndexMask; // We store a single unpackNetDeviceIndex because only one peer can be network recv
|
||||
uint64_t head;
|
||||
uint64_t head[NET_UNPACK_MAX_NPEERS];
|
||||
struct netUnpackMeta* g_meta[NET_UNPACK_MAX_NPEERS]; // head of handle to index into meta for meta copy
|
||||
};
|
||||
|
||||
|
||||
@@ -44,10 +44,11 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
|
||||
|
||||
inline __device__ void barrier() {
|
||||
if (nthreads == WARP_SIZE)
|
||||
if (nthreads == WARP_SIZE) {
|
||||
__syncwarp();
|
||||
else
|
||||
asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group));
|
||||
} else {
|
||||
barrier_sync(15-group, nthreads);
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t abort = 0;
|
||||
@@ -323,7 +324,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
__device__ Primitives(
|
||||
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
|
||||
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
|
||||
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, struct ncclWorkElemP2p* p2p = nullptr, int stepSize_=0
|
||||
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr,
|
||||
bool userBufReg=false, int stepSize_=0
|
||||
):
|
||||
redOp(redOpArg),
|
||||
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),
|
||||
|
||||
@@ -50,7 +50,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
|
||||
inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; }
|
||||
|
||||
inline __device__ void barrier() {
|
||||
asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group));
|
||||
barrier_sync(15-group, nthreads);
|
||||
}
|
||||
|
||||
uint32_t abort = 0;
|
||||
@@ -364,7 +364,8 @@ public:
|
||||
__device__ Primitives(
|
||||
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
|
||||
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
|
||||
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, int stepSize_=0
|
||||
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclDevWorkColl* e = nullptr,
|
||||
bool userBufReg=false, int stepSize_=0
|
||||
):
|
||||
redOp(redOpArg),
|
||||
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
|
||||
|
||||
@@ -23,7 +23,7 @@ class Primitives<
|
||||
ConnFifoEnabled = 0x100,
|
||||
DirectWrite = 0x200,
|
||||
DirectRead = 0x400,
|
||||
ThreadsSynced = 0x800,
|
||||
// 0x800 is free to use
|
||||
NvlsMinPolling = 0x1000,
|
||||
NetDeviceUnpack = 0x2000,
|
||||
AnyNetDeviceUnpack = 0x4000,
|
||||
@@ -44,53 +44,38 @@ class Primitives<
|
||||
uint64_t *connStepPtr;
|
||||
uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
|
||||
int connStepSize; // Connection step size
|
||||
void* mhandle;
|
||||
void* netDeviceHandle;
|
||||
|
||||
// Don't use barrier 0 as it's used by the final sync
|
||||
__device__ void barrier() {
|
||||
flags |= ThreadsSynced;
|
||||
if (nthreads == WARP_SIZE) __syncwarp();
|
||||
else {
|
||||
int bar = 15-group;
|
||||
asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nthreads) : "memory");
|
||||
barrier_sync(bar, nthreads);
|
||||
}
|
||||
}
|
||||
__device__ void subBarrier() {
|
||||
if (nworkers == WARP_SIZE) __syncwarp();
|
||||
else {
|
||||
int bar = (nworkers==nthreads ? 15 : 8) - group;
|
||||
asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nworkers) : "memory");
|
||||
int bar = 15-group - (nworkers!=nthreads ? 1 : 0);
|
||||
barrier_sync(bar, nworkers);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ bool barrierAny(int vote) {
|
||||
flags |= ThreadsSynced;
|
||||
if (nthreads == WARP_SIZE) {
|
||||
return __any_sync(~0u, vote);
|
||||
} else {
|
||||
int ans, bar = 15-group;
|
||||
asm volatile(
|
||||
"{ .reg .pred p;"
|
||||
" setp.ne.s32 p, %1, 0;"
|
||||
" bar.red.or.pred p, %2, %3, p; "
|
||||
" selp.s32 %0, 1, 0, p; }"
|
||||
: "=r"(ans) : "r"(vote), "r"(bar), "r"(nthreads) : "memory");
|
||||
return ans != 0;
|
||||
int name = 15-group;
|
||||
return barrier_red_or(vote, name, nthreads);
|
||||
}
|
||||
}
|
||||
__device__ bool subBarrierAny(int vote) {
|
||||
if (nworkers == WARP_SIZE) {
|
||||
return __any_sync(~0u, vote);
|
||||
} else {
|
||||
int ans, bar = (nworkers==nthreads ? 15 : 8) - group;
|
||||
asm volatile(
|
||||
"{ .reg .pred p;"
|
||||
" setp.ne.s32 p, %1, 0;"
|
||||
" bar.red.or.pred p, %2, %3, p; "
|
||||
" selp.s32 %0, 1, 0, p; }"
|
||||
: "=r"(ans) : "r"(vote), "r"(bar), "r"(nworkers) : "memory");
|
||||
return ans != 0;
|
||||
int name = 15-group - (nworkers!=nthreads ? 1 : 0);
|
||||
return barrier_red_or(vote, name, nworkers);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -164,8 +149,8 @@ class Primitives<
|
||||
else {
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
|
||||
}
|
||||
if ((flags & (AnyNetDeviceUnpack)) && (flags & (Recv*RoleWaitRecv))) {
|
||||
ncclNetDeviceIncrementHead(group);
|
||||
if (flags & NetDeviceUnpack) {
|
||||
ncclNetDeviceIncrementHead(group, index);
|
||||
}
|
||||
step += StepPerSlice;
|
||||
}
|
||||
@@ -436,7 +421,7 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
|
||||
__device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) {
|
||||
if (flags & (RoleWaitRecv|RolePostRecv)) {
|
||||
auto *conn = &peer->recv[connIndex];
|
||||
if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) {
|
||||
@@ -488,7 +473,7 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
|
||||
__device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) {
|
||||
if (flags & (RoleWaitSend|RolePostSend)) {
|
||||
auto *conn = &peer->send[connIndex];
|
||||
step = conn->step;
|
||||
@@ -538,13 +523,13 @@ private:
|
||||
__device__ Primitives(
|
||||
int tid, int nthreads, int const *recvPeers, int const *sendPeers,
|
||||
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
|
||||
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr, struct ncclWorkElemP2p* p2p = nullptr, int stepSize_=0
|
||||
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr,bool userBufReg=false, int stepSize_=0
|
||||
):
|
||||
tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
|
||||
stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {
|
||||
|
||||
// For send operations, we need an extra warp to overlap the threadfence and the copy
|
||||
this->nworkers = nthreads - (MaxSend > 0 && nthreads-WARP_SIZE >= 64 ? WARP_SIZE : 0);
|
||||
this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0);
|
||||
|
||||
int nrecv=0, nsend=0;
|
||||
while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
|
||||
@@ -572,7 +557,7 @@ private:
|
||||
loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
|
||||
loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);
|
||||
|
||||
if (p2p && p2p->reg) flags |= UserBufferMode;
|
||||
if (userBufReg) flags |= UserBufferMode;
|
||||
|
||||
if (barrierAny(flags & NetDeviceUnpack)) {
|
||||
flags |= AnyNetDeviceUnpack;
|
||||
@@ -584,13 +569,12 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
|
||||
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e);
|
||||
}
|
||||
|
||||
__device__ ~Primitives() {
|
||||
// Ensure ncclShmem.groups[].send/recvConns are available
|
||||
if (!(flags & ThreadsSynced))
|
||||
barrier();
|
||||
barrier();
|
||||
// Save steps for the next operation
|
||||
if (flags & (RolePostSend|RolePostRecv)) {
|
||||
auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
|
||||
@@ -606,8 +590,8 @@ private:
|
||||
while (*ptr != -1) if (checkAbort(spins)) break;
|
||||
}
|
||||
|
||||
if ((flags & (AnyNetDeviceUnpack)) && (flags & (RoleWaitRecv))) {
|
||||
ncclNetDeviceSaveHead(netDeviceHandle, group);
|
||||
if (flags & NetDeviceUnpack) {
|
||||
ncclNetDeviceSaveHead(netDeviceHandle, group, index);
|
||||
}
|
||||
|
||||
// Make sure all threads are done writing back conn->step and done using
|
||||
@@ -615,7 +599,7 @@ private:
|
||||
barrier();
|
||||
}
|
||||
|
||||
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) {
|
||||
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* e) {
|
||||
if (tid==0) {
|
||||
ncclShmem.groups[group].userInput = (void*)inputBuf;
|
||||
ncclShmem.groups[group].userOutput = (void*)outputBuf;
|
||||
@@ -625,7 +609,7 @@ private:
|
||||
bool sendAcceptor = (flags == (flags|RoleWaitSend|DirectWrite)) || (flags == (flags|RoleWaitSend|NvlsDirectWrite));
|
||||
bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched)
|
||||
bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead) || (flags == (flags|RoleWaitRecv|NvlsDirectRead)); // receiver accepts direct buffer
|
||||
int regUsed = e != nullptr ? e->elem.regUsed : 0;
|
||||
int regUsed = e != nullptr ? e->coll.regUsed : 0;
|
||||
|
||||
if (Direct && recvProvider) {
|
||||
int spins = 0;
|
||||
|
||||
+16
-17
@@ -10,22 +10,21 @@
|
||||
|
||||
namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = (int)args->nWarps * WARP_SIZE;
|
||||
__device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const int rank = ncclShmem.comm.rank;
|
||||
const int prevRank = ring->userRanks[nranks-1];
|
||||
const int root = args->root;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const size_t channelCount = args->workCount;
|
||||
const size_t gridOffset = args->workOffset;
|
||||
const int root = work->root;
|
||||
size_t chunkCount;
|
||||
size_t channelCount;
|
||||
size_t gridOffset;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);
|
||||
|
||||
if (prevRank == root) {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
@@ -52,23 +51,23 @@ namespace {
|
||||
}
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
struct RunWorkColl<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
using Proto = ProtoSimple<REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS>;
|
||||
runRing<T, RedOp, Proto>(args);
|
||||
runRing<T, RedOp, Proto>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL>(args);
|
||||
struct RunWorkColl<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL128>(args);
|
||||
struct RunWorkColl<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -37,6 +37,7 @@ template<typename T>
|
||||
struct FuncSum { using EltType = T; __device__ FuncSum(uint64_t opArg=0) {}; };
|
||||
template<typename T>
|
||||
struct FuncProd { using EltType = T; __device__ FuncProd(uint64_t opArg=0) {}; };
|
||||
|
||||
template<typename T>
|
||||
struct FuncMinMax {
|
||||
using EltType = T;
|
||||
@@ -47,9 +48,30 @@ struct FuncMinMax {
|
||||
isMinNotMax = (opArg&1)==0;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T> struct FuncPreMulSum;
|
||||
template<typename T> struct FuncSumPostDiv;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Trait class for handling the reduction argument.
|
||||
|
||||
// Primary RedOpArg template, used for any reduction function Fn that has no
// specialization: ArgUsed is false and loadArg ignores ptr and returns 0.
template<typename Fn>
|
||||
struct RedOpArg { // default case: no argument
|
||||
static constexpr bool ArgUsed = false;
|
||||
__device__ static uint64_t loadArg(void *ptr) { return 0; }
|
||||
};
|
||||
|
||||
// RedOpArg specialization for FuncMinMax<T>: the argument is used.
// loadArg reads one T from ptr and returns it packed into a uint64_t.
template<typename T>
|
||||
struct RedOpArg<FuncMinMax<T>> {
|
||||
static constexpr bool ArgUsed = true;
|
||||
__device__ static uint64_t loadArg(void *ptr) {
|
||||
union { uint64_t u64; T val; }; // val shares storage with u64
|
||||
u64 = 0; // clear all 8 bytes first so bytes not covered by val stay zero
|
||||
val = *(T*)ptr;
|
||||
return u64;
|
||||
}
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Trait classes for reduction functions. Given a function (FuncSum, etc.)
|
||||
// and a number of elements in a pack, will reduce, preOp, or postOp a pack
|
||||
@@ -356,6 +378,17 @@ struct Apply_PostOp<Fn, /*EltPerPack=*/0> {
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// FuncPreMulSum
|
||||
|
||||
// RedOpArg specialization for FuncPreMulSum<T>: the argument is used.
// loadArg reads one T from ptr and returns it packed into a uint64_t.
template<typename T>
|
||||
struct RedOpArg<FuncPreMulSum<T>> {
|
||||
static constexpr bool ArgUsed = true;
|
||||
__device__ static uint64_t loadArg(void *ptr) {
|
||||
union { uint64_t u64; T val; }; // val shares storage with u64
|
||||
u64 = 0; // clear all 8 bytes first so bytes not covered by val stay zero
|
||||
val = *(T*)ptr;
|
||||
return u64;
|
||||
}
|
||||
};
|
||||
|
||||
// General definition for all integral types, float, and double.
|
||||
template<typename T>
|
||||
struct FuncPreMulSum {
|
||||
@@ -486,6 +519,14 @@ struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// FuncSumPostDiv
|
||||
|
||||
// RedOpArg specialization for FuncSumPostDiv<T>: the argument is used.
// Unlike the specializations that load a T, loadArg here always reads a full
// uint64_t from ptr, independent of T.
template<typename T>
|
||||
struct RedOpArg<FuncSumPostDiv<T>> {
|
||||
static constexpr bool ArgUsed = true;
|
||||
__device__ static uint64_t loadArg(void *ptr) {
|
||||
return *(uint64_t*)ptr;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, bool IsFloating=IsFloatingPoint<T>::value>
|
||||
struct FuncSumPostDiv_IntOnly;
|
||||
|
||||
@@ -658,7 +699,7 @@ struct Apply_LoadMultimem {
|
||||
static constexpr bool IsFloat = IsFloatingPoint<T>::value;
|
||||
static constexpr int BigPackSize =
|
||||
IsFloat && IsSum && sizeof(T) < 8 ? 16 :
|
||||
IsFloat && IsSum ? 8 :
|
||||
IsFloat && IsSum ? sizeof(T) :
|
||||
IsFloat && IsMinMax && sizeof(T)==2 ? 16 :
|
||||
!IsFloat && (IsSum||IsMinMax) && sizeof(T)>=4 ? sizeof(T) :
|
||||
/*multimem.ld_reduce not supported:*/ 0;
|
||||
|
||||
@@ -10,23 +10,22 @@
|
||||
|
||||
namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const uint32_t nthreads = (uint32_t)args->nWarps * WARP_SIZE;
|
||||
__device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
int const *ringRanks = ring->userRanks;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
size_t channelCount = args->workCount;
|
||||
size_t gridOffset = args->workOffset;
|
||||
size_t count;
|
||||
size_t gridOffset;
|
||||
size_t channelCount;
|
||||
size_t chunkCount;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
|
||||
size_t offset;
|
||||
size_t dataOffset;
|
||||
size_t count = args->count;
|
||||
uint32_t nelem;
|
||||
int rankDest;
|
||||
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);
|
||||
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
@@ -54,56 +53,56 @@ namespace {
|
||||
}
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
using Proto = ProtoSimple<REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS>;
|
||||
runRing<T, RedOp, Proto>(args);
|
||||
runRing<T, RedOp, Proto>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL>(args);
|
||||
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL128>(args);
|
||||
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
|
||||
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const size_t count = args->count;
|
||||
size_t count;
|
||||
size_t gridOffset;
|
||||
size_t channelCount;
|
||||
size_t chunkCount;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
|
||||
const int rank = ncclShmem.comm.rank;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
size_t gridOffset = args->workOffset;
|
||||
size_t channelCount = args->workCount;
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
/* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync;
|
||||
* if not, based on #ranks, we allocate 7 or 5 warps to reduce to saturate bandwidth
|
||||
* and the rest are allocated to scatter. */
|
||||
const int nThreadsReduce = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE);
|
||||
const int nThreadsScatter = args->regUsed ? WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce);
|
||||
const int nThreadsReduce = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE);
|
||||
const int nThreadsScatter = work->regUsed ? WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce);
|
||||
const int tidEndScatter = nThreadsScatter;
|
||||
const int tidEndReduce = tidEndScatter + nThreadsReduce;
|
||||
|
||||
if (!args->regUsed) {
|
||||
if (!work->regUsed) {
|
||||
if (tid < tidEndScatter) {
|
||||
// Scatter
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL,
|
||||
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
@@ -113,8 +112,8 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
|
||||
// Reduce through NVLS
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff,
|
||||
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
|
||||
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, work->recvbuff,
|
||||
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
@@ -127,7 +126,7 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
prims.scatter(0, 0, 0, 0, -1, 0);
|
||||
}
|
||||
@@ -138,8 +137,8 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
|
||||
// Reduce through NVLS
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, args->recvbuff,
|
||||
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, work->recvbuff,
|
||||
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
size_t outOffset = gridOffset + elemOffset;
|
||||
size_t inpOffset = outOffset + rank * count;
|
||||
@@ -155,10 +154,10 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
|
||||
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
|
||||
template<bool ReduceSendNotRecv>
|
||||
struct Scatterer {
|
||||
struct ncclWorkElem* args;
|
||||
struct ncclDevWorkColl* work;
|
||||
int chunkSize;
|
||||
ssize_t railGridOffset;
|
||||
|
||||
@@ -173,11 +172,11 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
|
||||
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
|
||||
int nNodes = ncclShmem.comm.nNodes;
|
||||
int nRails = direct->nHeads;
|
||||
int bid = args->bid;
|
||||
void* inbuf = (void*)args->sendbuff;
|
||||
ssize_t sizePerRank = args->count;
|
||||
int part = ncclShmem.channelId - work->channelLo;
|
||||
void* inbuf = (void*)work->sendbuff;
|
||||
ssize_t sizePerRank = work->collnet.count;
|
||||
|
||||
ssize_t railAllBeg = min(railGridOffset + bid*chunkSize, nNodes*sizePerRank);
|
||||
ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank);
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
|
||||
int railAllSize = railAllEnd - railAllBeg;
|
||||
if (tid < nDsts) dstSizes[tid] = railAllSize;
|
||||
@@ -204,7 +203,7 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
|
||||
/*MultimemSrcs=*/0, 1+MinSrcs, 1+MaxSrcs,
|
||||
/*MultimemDsts,MinDsts,MaxDsts=*/0,1,1,
|
||||
/*PreOpSrcs=*/1>
|
||||
(tid, tn, args->redOpArg, &args->redOpArg, false,
|
||||
(tid, tn, work->redOpArg, &work->redOpArg, false,
|
||||
/*nSrcs=*/1+nSrcs, [=]__device__(int s) {
|
||||
return s==0 ? (T*)inbuf + userOneBeg
|
||||
: (T*)srcPtrs[s-1] + railAllOffset;
|
||||
@@ -223,23 +222,23 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
|
||||
}
|
||||
};
|
||||
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
int tid = threadIdx.x;
|
||||
const int nChannels = args->nChannels;
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
const int part = ncclShmem.channelId - work->channelLo;
|
||||
const int nChannels = work->channelHi - work->channelLo + 1;
|
||||
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
|
||||
int const &nNodes = ncclShmem.comm.nNodes;
|
||||
ssize_t chunkSize = int(args->chunkCount);
|
||||
ssize_t sizePerRank = args->count;
|
||||
ssize_t chunkSize = int(work->collnet.chunkCount);
|
||||
ssize_t sizePerRank = work->collnet.count;
|
||||
|
||||
if (direct->out == -1) __trap();
|
||||
bool isMultiRail = (direct->nHeads > 1);
|
||||
int nWarps1 = (isMultiRail ? 2 : 0);
|
||||
int nWarps2 = (isMultiRail ? 2 : 1);
|
||||
int nWarps3 = 1;
|
||||
float denom = float(args->nWarps)/float(nWarps1+nWarps2+nWarps3);
|
||||
float denom = float(work->nWarps)/float(nWarps1+nWarps2+nWarps3);
|
||||
nWarps3 = int(denom*nWarps3);
|
||||
nWarps2 = int(denom*nWarps2);
|
||||
nWarps1 = args->nWarps - (nWarps2+nWarps3);
|
||||
nWarps1 = work->nWarps - (nWarps2+nWarps3);
|
||||
|
||||
using Proto = ProtoSimple<1, 1>;
|
||||
|
||||
@@ -248,13 +247,13 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
|
||||
// Phase 1: Scatter inputs to peers
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, nullptr, direct->heads+1, nullptr, nullptr,
|
||||
args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
|
||||
work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
|
||||
Scatterer</*ReduceSendNotRecv=*/true> scat;
|
||||
scat.args = args;
|
||||
scat.work = work;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.process</*Recv=*/0, /*Send=*/1>(scat);
|
||||
prims.template process</*Recv=*/0, /*Send=*/1>(scat);
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -262,7 +261,7 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
|
||||
|
||||
tn = nWarps2*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (tid == 0) {
|
||||
int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
|
||||
@@ -272,13 +271,13 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
|
||||
// Phase 2: Reduce from peers + local input -> send to network
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, direct->heads + 1, &direct->out, nullptr, nullptr,
|
||||
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
|
||||
Scatterer</*ReduceSendNotRecv=*/false> scat;
|
||||
scat.args = args;
|
||||
scat.work = work;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.process</*Recv=*/1, /*Send=*/1>(scat);
|
||||
prims.template process</*Recv=*/1, /*Send=*/1>(scat);
|
||||
}
|
||||
}
|
||||
return;
|
||||
@@ -287,7 +286,7 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
|
||||
|
||||
tn = nWarps3*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (tid == 0) {
|
||||
int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
|
||||
@@ -296,10 +295,10 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
|
||||
} else {
|
||||
// Phase 3: recv from network
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, &direct->out, nullptr, nullptr, args->recvbuff,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0);
|
||||
prims(tid, tn, &direct->out, nullptr, nullptr, work->recvbuff,
|
||||
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
|
||||
ssize_t railAllBeg = railGridOffset + args->bid * chunkSize;
|
||||
ssize_t railAllBeg = railGridOffset + part * chunkSize;
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank);
|
||||
ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank;
|
||||
ssize_t railOneEnd = railOneBeg + sizePerRank;
|
||||
|
||||
+141
-65
@@ -9,83 +9,159 @@
|
||||
#include "primitives.h"
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
static_assert(sizeof(T)==1, "SendRecv only works on single byte types T.");
|
||||
|
||||
template<typename Proto>
|
||||
__device__ void runSend(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) {
|
||||
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
|
||||
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
|
||||
if (args->peer == ncclShmem.comm.rank) {
|
||||
struct ncclWorkElemP2p* recvArgs = args-1;
|
||||
void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
|
||||
if (buff != recvBuff) {
|
||||
reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>
|
||||
(tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count);
|
||||
}
|
||||
} else {
|
||||
int chunkSize = args->chunkSize/sizeof(T);
|
||||
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2;
|
||||
int const peer = args->peer;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1> prims
|
||||
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, args, ncclShmem.comm.p2pChunkSize/sizeof(T));
|
||||
size_t offset = 0;
|
||||
do {
|
||||
int nelem = min(size_t(chunkSize), count-offset);
|
||||
prims.directSend(offset, offset, nelem);
|
||||
offset += nelem;
|
||||
} while(offset < count && args->reg == 0);
|
||||
}
|
||||
__device__ void runSend(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
|
||||
size_t bytes = work->sendBytes;
|
||||
int chunkSize = u32fp8Decode(work->sendChunkSize_u32fp8);
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1>
|
||||
prims(tid, tn, nullptr, &work->sendRank, work->sendAddr, nullptr,
|
||||
/*redOpArg(ignored)=*/0, group, 1, 1, nullptr,
|
||||
/*userBufferMode=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize);
|
||||
size_t cursor = 0;
|
||||
do {
|
||||
int n = min(size_t(chunkSize), bytes-cursor);
|
||||
prims.directSend(cursor, cursor, n);
|
||||
cursor += n;
|
||||
} while (cursor < bytes && work->sendRegistered == 0);
|
||||
}
|
||||
|
||||
template<typename Proto>
|
||||
__device__ void runRecv(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) {
|
||||
if (args->peer != ncclShmem.comm.rank) {
|
||||
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
|
||||
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
|
||||
int chunkSize = args->chunkSize/sizeof(T);
|
||||
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize
|
||||
int const peer = args->peer;
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1> prims
|
||||
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, args, ncclShmem.comm.p2pChunkSize/sizeof(T));
|
||||
size_t offset = 0;
|
||||
do {
|
||||
int nelem = min(size_t(chunkSize), count-offset);
|
||||
prims.directRecv(offset, nelem);
|
||||
offset += nelem;
|
||||
} while(offset < count && args->reg == 0);
|
||||
}
|
||||
__device__ void runRecv(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
|
||||
size_t bytes = work->recvBytes;
|
||||
int chunkSize = u32fp8Decode(work->recvChunkSize_u32fp8);
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1>
|
||||
prims(tid, tn, &work->recvRank, nullptr, nullptr, work->recvAddr,
|
||||
/*redOpArg(ignored)=*/0, group, 1, 1, nullptr,
|
||||
/*userBufferMode=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize);
|
||||
size_t cursor = 0;
|
||||
do {
|
||||
int n = min(size_t(chunkSize), bytes-cursor);
|
||||
prims.directRecv(cursor, n);
|
||||
cursor += n;
|
||||
} while (cursor < bytes && work->recvRegistered == 0);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void run(ncclWork *work) {
|
||||
struct ncclWorkElemP2p* args = work->p2pElems;
|
||||
int ngroups = args->ngroups;
|
||||
int tid = threadIdx.x;
|
||||
int wid = tid / WARP_SIZE;
|
||||
// This has to work even for groups of 2.5 warps (which is 8 groups, and means 3
|
||||
// warps for send, 2 warps for recv).
|
||||
// warpStarts were rounded thanks to int division, but for group number we need to round the other way around
|
||||
// So we mirror wid then mirror again the group.
|
||||
#define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
|
||||
uint8_t group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
|
||||
args += group;
|
||||
tid -= args->warpStart * WARP_SIZE;
|
||||
int nthreads = args->nWarps * WARP_SIZE;
|
||||
__device__ __forceinline__ void run() {
|
||||
const int tid = threadIdx.x;
|
||||
const int tn = blockDim.x;
|
||||
const int wid = tid/WARP_SIZE;
|
||||
const int nWarps = tn/WARP_SIZE;
|
||||
const int lane = tid%WARP_SIZE;
|
||||
|
||||
if (args->p2pType == ncclWorkP2pTypeUnused) return;
|
||||
if (tid >= nthreads || args->peer == -1) return;
|
||||
struct Shared {
|
||||
uint32_t workSendMask; // bitmasks of which work indices have send/recv
|
||||
uint32_t workRecvMask;
|
||||
};
|
||||
Shared* shared = (Shared*)ncclScratchForWarp(0);
|
||||
|
||||
// Select Proto here
|
||||
// This is to allow the same kernel to run multiple primitives on different warps (thread groups)
|
||||
if ((group%2) == 0) {
|
||||
if (args->proto == NCCL_PROTO_LL) {
|
||||
runRecv<ProtoLL>(tid, nthreads, group, args);
|
||||
struct ncclDevWorkP2p* works = (ncclDevWorkP2p*)ncclShmem.workStorage;
|
||||
int nWorks = ncclShmem.nWorks;
|
||||
|
||||
if (wid == 0) {
|
||||
// Modify the memory range of each work[] to reflect this channel's
|
||||
// partition of the work. Since integer divides are very heavy it's
|
||||
// best to do them all in one warp.
|
||||
int workIx = lane%16;
|
||||
int isSend = lane < 16 ? 0 : 1;
|
||||
bool hasWork = false;
|
||||
if (workIx < nWorks) {
|
||||
struct ncclDevWorkP2p* work = &works[workIx];
|
||||
size_t bytes = isSend ? work->sendBytes : work->recvBytes;
|
||||
int nParts = isSend ? work->nSendChannels : work->nRecvChannels;
|
||||
int part = ncclP2pChannelToPart(work->nP2pChannels, work->channelBase, ncclShmem.channelId);
|
||||
hasWork = (part < nParts);
|
||||
if (nParts != 0) {
|
||||
size_t partBeg, partEnd;
|
||||
ncclP2pPartBounds(nParts, part, bytes, &partBeg, &partEnd);
|
||||
(isSend ? work->sendAddr : work->recvAddr) = (char*)(isSend ? work->sendAddr : work->recvAddr) + partBeg;
|
||||
(isSend ? work->sendBytes : work->recvBytes) = partEnd - partBeg;
|
||||
}
|
||||
}
|
||||
uint32_t mask = __ballot_sync(~0u, hasWork);
|
||||
if (lane == 0) {
|
||||
shared->workSendMask = mask>>16;
|
||||
shared->workRecvMask = mask & 0xffff;
|
||||
}
|
||||
}
|
||||
|
||||
// The fastest way to compute a warp uniform division x/y in [0,32) is to
|
||||
// use each lane to guess a solution and count the ones that don't exceed
|
||||
// the numerator:
|
||||
// __popc(__ballot_sync(~0u, y*(lane+1) <= x))
|
||||
// That takes 1/3 the time of standard division and about 3/4 the time of
|
||||
// approximate floating point division:
|
||||
// __float2int_rd(__fdividef(float(x),float(y))).
|
||||
|
||||
// nWarpPerWork = nWarps/nWorks
|
||||
int nWarpPerWork = __popc(__ballot_sync(~0u, nWorks*(lane+1) <= nWarps));
|
||||
int nRecvWarpPerWork = nWarpPerWork<=4 ? nWarpPerWork/2 : (nWarpPerWork-1)/2;
|
||||
int nSendWarpPerWork = nWarpPerWork<=4 ? nRecvWarpPerWork : nRecvWarpPerWork+1;
|
||||
// This might reduce nWarpPerWork which is probably desirable. It is better
|
||||
// to have a balanced number of reading and writing threads even if that
|
||||
// leaves warps unused.
|
||||
nWarpPerWork = nSendWarpPerWork + nRecvWarpPerWork;
|
||||
// The work index this warp belongs to: workIx = wid/nWarpPerWork
|
||||
int workIx = __popc(__ballot_sync(~0u, (lane+1)*nWarpPerWork <= wid));
|
||||
|
||||
__syncthreads(); // Wait for works[] and shared->* to be updated by warp=0
|
||||
|
||||
uint32_t workSendMask = shared->workSendMask;
|
||||
uint32_t workRecvMask = shared->workRecvMask;
|
||||
|
||||
__syncthreads(); // release scratch space used by shared->*
|
||||
if (nWorks <= workIx) return;
|
||||
|
||||
// Thread range for whole work (send & recv combined)
|
||||
int subtid = tid - workIx*nWarpPerWork*WARP_SIZE;
|
||||
int subtn = nWarpPerWork*WARP_SIZE;
|
||||
|
||||
// A send primitive of sufficient size requires 2 CUDA barrier ids.
|
||||
constexpr int nSendWarpsForExtraGroup = NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE/WARP_SIZE;
|
||||
// Count up all group ids used below this workIx:
|
||||
int group, extra;
|
||||
// Each recv gets one group id:
|
||||
group = __popc(workRecvMask & ((1<<workIx)-1));
|
||||
// Sends accompanying recvs get one and maybe an extra:
|
||||
extra = (nSendWarpPerWork >= nSendWarpsForExtraGroup) ? 1 : 0;
|
||||
group += __popc((workSendMask & workRecvMask) & ((1<<workIx)-1))*(1+extra);
|
||||
// Sends without recvs use more warps so compute extra accordingly:
|
||||
extra = (nWarpPerWork >= nSendWarpsForExtraGroup) ? 1 : 0;
|
||||
group += __popc((workSendMask & ~workRecvMask) & ((1<<workIx)-1))*(1+extra);
|
||||
|
||||
struct ncclDevWorkP2p* work = &works[workIx];
|
||||
bool hasSend = 1 & (workSendMask>>workIx);
|
||||
bool hasRecv = 1 & (workRecvMask>>workIx);
|
||||
bool isCopy = work->sendRank == ncclShmem.comm.rank;
|
||||
bool isSend = !hasRecv || (hasSend && subtid < nSendWarpPerWork*WARP_SIZE);
|
||||
|
||||
if (!isCopy && hasSend && hasRecv) {
|
||||
// Translate thread ids to reflect just this send or recv as opposed to whole work.
|
||||
if (isSend) {
|
||||
subtn = nSendWarpPerWork*WARP_SIZE;
|
||||
} else {
|
||||
runRecv<ProtoSimple<1,1>>(tid, nthreads, group, args);
|
||||
subtid -= nSendWarpPerWork*WARP_SIZE;
|
||||
subtn = nRecvWarpPerWork*WARP_SIZE;
|
||||
group += 1 + (nSendWarpPerWork >= nSendWarpsForExtraGroup ? 1 : 0);
|
||||
}
|
||||
}
|
||||
|
||||
if (isCopy) {
|
||||
reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>
|
||||
(subtid, subtn, 0, nullptr, false, 1, &work->sendAddr, 1, &work->recvAddr, (ssize_t)work->sendBytes);
|
||||
} else if (isSend) {
|
||||
if (work->sendProtoLL) {
|
||||
runSend<ProtoLL>(subtid, subtn, group, work);
|
||||
} else {
|
||||
runSend<ProtoSimple<1,1>>(subtid, subtn, group, work);
|
||||
}
|
||||
} else {
|
||||
if (args->proto == NCCL_PROTO_LL) {
|
||||
runSend<ProtoLL>(tid, nthreads, group, args);
|
||||
if (work->recvProtoLL) {
|
||||
runRecv<ProtoLL>(subtid, subtn, group, work);
|
||||
} else {
|
||||
runSend<ProtoSimple<1,1>>(tid, nthreads, group, args);
|
||||
runRecv<ProtoSimple<1,1>>(subtid, subtn, group, work);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+1338
-1295
File diff ditekan karena terlalu besar
Load Diff
+19
-4
@@ -5,7 +5,9 @@
|
||||
************************************************************************/
|
||||
|
||||
#include "comm.h"
|
||||
#include "device.h"
|
||||
#include "graph.h"
|
||||
#include "transport.h"
|
||||
#include "trees.h"
|
||||
#include "rings.h"
|
||||
#include "topo.h"
|
||||
@@ -84,6 +86,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
|
||||
topoRanks->nvlsHeads[topoRanks->nvlsHeadNum++] = nvlsIntra[0];
|
||||
}
|
||||
}
|
||||
memcpy(comm->nvlsHeads, topoRanks->nvlsHeads, sizeof(int) * topoRanks->nvlsHeadNum);
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -188,7 +191,7 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
char line[1024];
|
||||
sprintf(line, "CollNet channel %d rank %d ", c, rank);
|
||||
sprintf(line, "CollNetDirect channel %d rank %d ", c, rank);
|
||||
int nDown = 0;
|
||||
for (int i=0; i<nHeads; i++) {
|
||||
if (rank == heads[i]) { // is head
|
||||
@@ -334,10 +337,14 @@ int ncclMinNchannels() {
|
||||
if (minNchannels < 0) minNchannels = 0;
|
||||
return minNchannels;
|
||||
}
|
||||
|
||||
extern int64_t ncclParamWorkArgsBytes();
|
||||
|
||||
int ncclMaxNchannels() {
|
||||
int maxNchannels = MAXCHANNELS;
|
||||
if (ncclParamMaxNrings() != -2) maxNchannels = ncclParamMaxNrings();
|
||||
if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels();
|
||||
maxNchannels = std::min(maxNchannels, ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes()));
|
||||
if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
|
||||
if (maxNchannels < 1) {
|
||||
WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels);
|
||||
@@ -363,6 +370,8 @@ void exchangeValues(int* v0, int* v1) {
|
||||
*v0 = tmp;
|
||||
}
|
||||
|
||||
NCCL_PARAM(UnpackDoubleNChannels, "UNPACK_DOUBLE_NCHANNELS", 1);
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) {
|
||||
// Gather data from all ranks
|
||||
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
|
||||
@@ -444,13 +453,13 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
|
||||
// Setup CollNet
|
||||
if (comm->collNetSupport == 1) {
|
||||
struct ncclTopoGraph* collNetGraph = graphs[NCCL_ALGO_COLLNET_DIRECT];
|
||||
struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN];
|
||||
// Add more channels to saturate intra-node bandwidth, except the 1 PPN case
|
||||
if (collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) {
|
||||
if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) {
|
||||
int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
|
||||
nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
|
||||
}
|
||||
NCCLCHECK(connectCollNet(comm, collNetGraph));
|
||||
NCCLCHECK(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]));
|
||||
}
|
||||
|
||||
// Use 4 compute channels per search channel to reach peak BW on <8 PPN
|
||||
@@ -458,6 +467,12 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
|
||||
}
|
||||
|
||||
// Double the number of channels when using unpack networking (greater than 1 node)
|
||||
// We won't automatically double past 16 channels, users can specify 32 if they want
|
||||
if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && nChannels < 16 && ncclParamUnpackDoubleNChannels()) {
|
||||
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
|
||||
}
|
||||
|
||||
// Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
|
||||
// We permit combining max, then min, to only use the first channels, then duplicate them.
|
||||
if (comm->sharedRes->owner != comm) {
|
||||
|
||||
+10
-15
@@ -10,6 +10,8 @@
|
||||
#include "comm.h"
|
||||
#include "net.h"
|
||||
#include "channel.h"
|
||||
#include "transport.h"
|
||||
#include "device.h"
|
||||
|
||||
// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths
|
||||
|
||||
@@ -732,12 +734,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp
|
||||
|
||||
NCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 1);
|
||||
NCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS);
|
||||
|
||||
static int nextPow2(int v) {
|
||||
int pow2 = 1;
|
||||
while (pow2 < v) pow2 <<= 1;
|
||||
return pow2;
|
||||
}
|
||||
extern int64_t ncclParamWorkArgsBytes();
|
||||
|
||||
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
|
||||
/* here we already honor comm->max/minCTAs for p2pnChannels. */
|
||||
@@ -759,19 +756,17 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
|
||||
}
|
||||
}
|
||||
|
||||
// Round to next pow2 nChannelsPerPeer and nChannels
|
||||
comm->p2pnChannelsPerPeer = nextPow2(minChannels);
|
||||
comm->p2pnChannels = nextPow2(comm->p2pnChannels);
|
||||
// Make nChannelsPerPeer and nChannels powers of 2. This is relied on when
|
||||
// mapping p2p peers to channels.
|
||||
comm->p2pnChannelsPerPeer = pow2Up(minChannels);
|
||||
comm->p2pnChannels = pow2Up(comm->p2pnChannels);
|
||||
|
||||
comm->p2pnChannels = std::min(comm->p2pnChannels, pow2Down(ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes())));
|
||||
comm->p2pnChannelsPerPeer = std::min(comm->p2pnChannelsPerPeer, comm->p2pnChannels);
|
||||
|
||||
// Init channels that weren't used so far
|
||||
for (int c=comm->nChannels; c<comm->p2pnChannels; c++) NCCLCHECK(initChannel(comm, c));
|
||||
|
||||
// We want to spread channels used when there aren't many and progressively
|
||||
// fill the whole space of nChannels. To do so we mirror the bits in the
|
||||
// nChannels space.
|
||||
for (int c=0; c<comm->p2pnChannels; c++) {
|
||||
comm->p2pChannels[c] = mirrorBits(c, comm->p2pnChannels);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
+66
-6
@@ -8,6 +8,7 @@
|
||||
#include "core.h"
|
||||
#include "graph.h"
|
||||
#include "topo.h"
|
||||
#include "transport.h"
|
||||
#include "xml.h"
|
||||
#include <math.h>
|
||||
|
||||
@@ -51,6 +52,15 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm) {
|
||||
// We assume there is at least one CPU and that the CPUs have the same
|
||||
// architecture and vendor.
|
||||
const struct ncclTopoNodeSet* cpus = &comm->topo->nodes[CPU];
|
||||
comm->cpuArch = cpus->nodes[0].cpu.arch;
|
||||
comm->cpuVendor = cpus->nodes[0].cpu.vendor;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, int type, struct ncclTopoLink** revLink) {
|
||||
for (int l=0; l<node2->nlinks; l++) {
|
||||
struct ncclTopoLink* link = node2->links+l;
|
||||
@@ -104,7 +114,7 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod
|
||||
}
|
||||
|
||||
// Try to go from node type1/index1 to no type2/index2. mult indicates whether we are counting the bandwidth (1) or undoing (-1).
|
||||
static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int type1, int index1, int type2, int index2, int mult, struct ncclTopoNode** node) {
|
||||
static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int type1, int index1, int type2, int index2, float mult, struct ncclTopoNode** node) {
|
||||
// First handle easy cases
|
||||
*node = system->nodes[type2].nodes+index2;
|
||||
if (type1 == -1) return ncclSuccess;
|
||||
@@ -334,6 +344,42 @@ ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoSearchTryCollnetDirect(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int g, int ngpus, int *time) {
|
||||
int fwdg = 0;
|
||||
int bwdg = 0;
|
||||
struct ncclTopoNode* gpu = NULL;
|
||||
float mul = 1.0 / (float)(system->nodes[GPU].count - 1);
|
||||
do {
|
||||
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, fwdg, mul, &gpu));
|
||||
} while (gpu && ++fwdg < system->nodes[GPU].count);
|
||||
|
||||
if (gpu != NULL) {
|
||||
do {
|
||||
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, bwdg, GPU, g, mul, &gpu));
|
||||
} while (gpu && ++bwdg < system->nodes[GPU].count);
|
||||
if (gpu != NULL) {
|
||||
// Both directions worked. We already have the head, so populate all the other intra ranks.
|
||||
int step = 1;
|
||||
for (int index = 0; index < ngpus; ++index) {
|
||||
if (index != g) {
|
||||
graph->intra[graph->nChannels * ngpus + step] = system->nodes[GPU].nodes[index].gpu.rank;
|
||||
step++;
|
||||
}
|
||||
}
|
||||
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, NULL, ngpus, -1, -1, 0, time));
|
||||
}
|
||||
while (bwdg) {
|
||||
bwdg--;
|
||||
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, bwdg, GPU, g, -mul, &gpu));
|
||||
}
|
||||
}
|
||||
while (fwdg) {
|
||||
fwdg--;
|
||||
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, fwdg, -mul, &gpu));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoSearchTryNvls(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int g, int ngpus, int *time) {
|
||||
struct ncclTopoNode* nvs;
|
||||
struct ncclTopoNode* gpu;
|
||||
@@ -514,6 +560,8 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
}
|
||||
} else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
|
||||
NCCLCHECK(ncclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time));
|
||||
} else if (graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) {
|
||||
NCCLCHECK(ncclTopoSearchTryCollnetDirect(system, graph, saveGraph, g, ngpus, time));
|
||||
} else if (step < system->nodes[GPU].count-1) {
|
||||
// Go to next GPU
|
||||
int next[NCCL_TOPO_MAX_NODES];
|
||||
@@ -552,9 +600,10 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
int* nets;
|
||||
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
|
||||
int netCount;
|
||||
int graphFound = 0;
|
||||
NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount));
|
||||
for (int i=0; i<netCount; i++) {
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && i>0) continue;
|
||||
if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) continue;
|
||||
int n = nets[(graph->nChannels+i)%netCount];
|
||||
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||
if (graph->collNet && net->net.collSupport == 0) continue;
|
||||
@@ -571,12 +620,22 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
}
|
||||
}
|
||||
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) {
|
||||
// NVLS and COLLNET_DIRECT searches only try to find NIC:GPU combinations to compute the heads.
|
||||
if (graph->nChannels < netCount) {
|
||||
int gpu;
|
||||
int duplicate = 0;
|
||||
NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu));
|
||||
// check whether there is duplicate head when one GPU connects with multiple NICs
|
||||
for (int gc = 0; gc < graph->nChannels; gc++) {
|
||||
if (graph->intra[gc * system->nodes[GPU].count] == system->nodes[GPU].nodes[gpu].gpu.rank) {
|
||||
duplicate = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (duplicate) continue;
|
||||
if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu));
|
||||
graphFound = 1;
|
||||
}
|
||||
} else {
|
||||
if (graph->nChannels > 0) {
|
||||
@@ -891,8 +950,9 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
int ccMin;
|
||||
NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess;
|
||||
// NVLS search must have ngpus heads at most.
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) graph->maxChannels = system->nodes[GPU].count;
|
||||
// NVLS and COLLNET_DIRECT search must have ngpus heads at most.
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT)
|
||||
graph->maxChannels = system->nodes[GPU].count;
|
||||
|
||||
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
|
||||
|
||||
@@ -1104,7 +1164,7 @@ ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, i
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank);
|
||||
WARN("Could not find NIC for rank %d in NVLS graph", comm->rank);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
|
||||
+107
-35
@@ -11,6 +11,7 @@
|
||||
#include "nvmlwrap.h"
|
||||
#include "net.h"
|
||||
#include "coll_net.h"
|
||||
#include "transport.h"
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include "xml.h"
|
||||
@@ -51,7 +52,12 @@ static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode*
|
||||
return ncclSuccess;
|
||||
}
|
||||
for (int l=0; l<node->nlinks; l++) {
|
||||
if (node->links[l].type == LINK_PCI) NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu));
|
||||
// Go up the PCI tree to find the CPU. Follow only PCI switches.
|
||||
if (node->links[l].type == LINK_PCI
|
||||
&& (node->links[l].remNode->type == PCI
|
||||
|| node->links[l].remNode->type == CPU)) {
|
||||
NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu));
|
||||
}
|
||||
if (*cpu != NULL) return ncclSuccess;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -109,11 +115,6 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo
|
||||
n->type = type;
|
||||
n->id = id;
|
||||
if (type == GPU) {
|
||||
// Create link to itself (used in some corner cases)
|
||||
n->nlinks=1;
|
||||
n->links[0].type = LINK_LOC;
|
||||
n->links[0].remNode = n;
|
||||
n->links[0].bw = LOC_BW;
|
||||
n->gpu.dev = NCCL_TOPO_UNDEF;
|
||||
n->gpu.rank = NCCL_TOPO_UNDEF;
|
||||
n->gpu.cudaCompCap = NCCL_TOPO_UNDEF;
|
||||
@@ -279,8 +280,10 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
|
||||
|
||||
for (int l=0; l<node->nlinks; l++) {
|
||||
struct ncclTopoLink* link = node->links+l;
|
||||
if (link->type == LINK_LOC) continue;
|
||||
if (link->type != LINK_PCI || link->remNode != prevNode) {
|
||||
if (link->type == LINK_LOC) {
|
||||
sprintf(line+offset, "+ %s[%2.1f] - %s/%lX", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[link->remNode->type], link->remNode->id);
|
||||
INFO(NCCL_GRAPH, "%s", line);
|
||||
} else if (link->type != LINK_PCI || link->remNode != prevNode) {
|
||||
sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw);
|
||||
int nextOffset = strlen(line);
|
||||
if (link->type == LINK_PCI) {
|
||||
@@ -443,7 +446,9 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
|
||||
|
||||
for (int s=0; s<xmlPci->nSubs; s++) {
|
||||
struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
|
||||
NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId));
|
||||
if (strcmp(xmlSubPci->name, "pcilink") != 0) { // PCI links will be added later
|
||||
NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -579,6 +584,38 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoAddPciLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) {
|
||||
if (strcmp(node->name, "pcilink") == 0) {
|
||||
struct ncclTopoNode* pci = NULL;
|
||||
int64_t pBusId;
|
||||
NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
|
||||
pBusId = NCCL_TOPO_ID(systemId, pBusId);
|
||||
NCCLCHECK(ncclTopoGetNode(system, &pci, PCI, pBusId));
|
||||
if (pci == NULL) {
|
||||
WARN("Add PCI Link error : could not find PCI SW %lx", pBusId);
|
||||
return ncclInternalError;
|
||||
}
|
||||
struct ncclTopoNode* remote = NULL;
|
||||
const char* target;
|
||||
NCCLCHECK(xmlGetAttrStr(node, "target", &target));
|
||||
int64_t busId;
|
||||
NCCLCHECK(busIdToInt64(target, &busId));
|
||||
NCCLCHECK(ncclTopoGetNode(system, &remote, PCI, NCCL_TOPO_ID(systemId, busId)));
|
||||
if (remote) NCCLCHECK(ncclTopoConnectNodes(pci, remote, LINK_LOC, LOC_BW));
|
||||
} else {
|
||||
if (strcmp(node->name, "cpu") == 0) {
|
||||
NCCLCHECK(ncclGetSystemId(system, node, &systemId));
|
||||
}
|
||||
const char* busId;
|
||||
NCCLCHECK(xmlGetAttr(node, "busid", &busId));
|
||||
for (int s=0; s<node->nSubs; s++) {
|
||||
NCCLCHECK(ncclTopoAddPciLinks(node->subs[s], system, busId ? busId : parentBusId, systemId));
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) {
|
||||
if (strcmp(node->name, "c2c") == 0) {
|
||||
struct ncclTopoNode* gpu = NULL;
|
||||
@@ -626,6 +663,7 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
|
||||
|
||||
NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL, 0));
|
||||
NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL, 0));
|
||||
NCCLCHECK(ncclTopoAddPciLinks(topNode, *topoSystem, NULL, 0));
|
||||
|
||||
NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem));
|
||||
NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
|
||||
@@ -668,6 +706,18 @@ static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrN
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoRefreshBcmP2pLinks(void) {
|
||||
//refresh the switch topology by reading the link below
|
||||
FILE *fp = fopen("/sys/kernel/pci_switch_link/refresh_switch_toplogy", "r");
|
||||
if (fp != NULL) {
|
||||
int tmp;
|
||||
size_t r = fread(&tmp, sizeof(tmp), 1, fp);
|
||||
if (r != 1)
|
||||
INFO(NCCL_GRAPH, "Failed to read refresh_switch_toplogy");
|
||||
fclose(fp);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
|
||||
struct ncclXml* xml;
|
||||
@@ -687,18 +737,17 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
|
||||
}
|
||||
|
||||
// Auto-detect GPUs if needed
|
||||
for (int r=0; r<comm->nRanks; r++) {
|
||||
if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
|
||||
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
|
||||
struct ncclXmlNode* node;
|
||||
NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
|
||||
if (node == NULL) continue;
|
||||
NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
|
||||
NCCLCHECK(xmlSetAttrInt(node, "rank", r));
|
||||
NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
|
||||
}
|
||||
NCCLCHECK(ncclTopoRefreshBcmP2pLinks());
|
||||
|
||||
// Detect only the GPU managed by this process. We'll get any others through XML fusion.
|
||||
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
NCCLCHECK(int64ToBusId(comm->peerInfo[comm->rank].busId, busId));
|
||||
struct ncclXmlNode* node;
|
||||
NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
|
||||
if (node) {
|
||||
NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
|
||||
NCCLCHECK(xmlSetAttrInt(node, "rank", comm->rank));
|
||||
NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport));
|
||||
}
|
||||
// Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
|
||||
// so we start with collnet so that it has precedence.
|
||||
@@ -728,6 +777,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
for (int n=0; n<netDevCount; n++) {
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECK(comm->ncclNet->getProperties(n, &props));
|
||||
comm->netDeviceType = props.netDeviceType;
|
||||
struct ncclXmlNode* netNode;
|
||||
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
|
||||
@@ -745,24 +795,46 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
|
||||
NCCLCHECK(ncclTopoTrimXml(xml));
|
||||
|
||||
// XML topo fusion.
|
||||
int* localRanks;
|
||||
int localRank = -1, nLocalRanks = 0;
|
||||
if (comm->MNNVL) {
|
||||
// MNNVL clique support
|
||||
char* mem;
|
||||
NCCLCHECK(ncclCalloc(&mem, comm->clique.size * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
|
||||
struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*comm->cliqueRank);
|
||||
memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES));
|
||||
NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1));
|
||||
NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->clique.ranks, comm->cliqueRank, comm->clique.size, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
|
||||
struct ncclXml* cliqueXml;
|
||||
NCCLCHECK(xmlAlloc(&cliqueXml, comm->clique.size*NCCL_TOPO_XML_MAX_NODES));
|
||||
for (int i = 0; i < comm->clique.size; i++) {
|
||||
struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i);
|
||||
NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0));
|
||||
NCCLCHECK(ncclTopoFuseXml(cliqueXml, peerXml));
|
||||
nLocalRanks = comm->clique.size;
|
||||
localRank = comm->cliqueRank;
|
||||
localRanks = comm->clique.ranks;
|
||||
} else {
|
||||
// Intra-node fusion. Much of the comm is not initialized yet at this point so we need to do our own calculations.
|
||||
NCCLCHECK(ncclCalloc(&localRanks, comm->nRanks));
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
if (comm->peerInfo[i].hostHash == comm->peerInfo[comm->rank].hostHash) {
|
||||
if (i == comm->rank)
|
||||
localRank = nLocalRanks;
|
||||
localRanks[nLocalRanks++] = i;
|
||||
}
|
||||
}
|
||||
free(xml);
|
||||
xml = cliqueXml;
|
||||
}
|
||||
char* mem;
|
||||
NCCLCHECK(ncclCalloc(&mem, nLocalRanks * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
|
||||
struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*localRank);
|
||||
memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES));
|
||||
NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1));
|
||||
NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, localRanks, localRank, nLocalRanks, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
|
||||
if (comm->MNNVL) {
|
||||
// Ensure that we have enough room when fusing topos from multiple nodes.
|
||||
free(xml);
|
||||
NCCLCHECK(xmlAlloc(&xml, nLocalRanks*NCCL_TOPO_XML_MAX_NODES));
|
||||
} else {
|
||||
// In the intra-node case there's no need to enlarge the topo xml.
|
||||
xml->maxIndex = 0;
|
||||
free(localRanks);
|
||||
}
|
||||
for (int i = 0; i < nLocalRanks; i++) {
|
||||
struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i);
|
||||
NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0));
|
||||
NCCLCHECK(ncclTopoFuseXml(xml, peerXml));
|
||||
}
|
||||
free(mem);
|
||||
|
||||
xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
|
||||
if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
|
||||
|
||||
+1
-1
@@ -218,7 +218,7 @@ static ncclResult_t ncclTopoIdToNetDev(struct ncclTopoSystem* system, int64_t id
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
WARN("Could not find NET with id %lx\n", id);
|
||||
WARN("Could not find NET with id %lx", id);
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
|
||||
+15
-17
@@ -110,11 +110,9 @@ NCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2);
|
||||
|
||||
static float getNetOverhead(struct ncclComm* comm) {
|
||||
if (ncclParamNetOverhead() != -2) return ncclParamNetOverhead() * .001;
|
||||
int cpuArch, cpuVendor, cpuModel;
|
||||
NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
|
||||
if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_INTEL) return 1.0;
|
||||
if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD) return 2.0;
|
||||
else return 1.0;
|
||||
if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_X86 && comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_INTEL) return 1.0;
|
||||
if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_X86 && comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD) return 2.0;
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) {
|
||||
@@ -317,6 +315,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
}
|
||||
if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
|
||||
if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
|
||||
if (a == NCCL_ALGO_RING && pEnable == 0) comm->ringbdw[c][p] = 0;
|
||||
}
|
||||
|
||||
for (int c = 0; c < NCCL_NUM_FUNCTIONS; c++) {
|
||||
@@ -415,15 +414,15 @@ static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = {
|
||||
{ .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .6, .7, .8, .7, .7, .8, .9, .9 }
|
||||
};
|
||||
|
||||
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup) {
|
||||
float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
|
||||
float lat = info->comm->latencies[info->coll][algorithm][protocol];
|
||||
ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup) {
|
||||
float bw = comm->bandwidths[coll][algorithm][protocol];
|
||||
float lat = comm->latencies[coll][algorithm][protocol];
|
||||
|
||||
if (backup) {
|
||||
*backup = false;
|
||||
if (algorithm == NCCL_ALGO_RING && bw == 0.0f) {
|
||||
/* try back up RING algorithm */
|
||||
bw = info->comm->ringbdw[info->coll][protocol];
|
||||
bw = comm->ringbdw[coll][protocol];
|
||||
*backup = true;
|
||||
}
|
||||
}
|
||||
@@ -431,15 +430,14 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
|
||||
if (bw == 0) {
|
||||
*time = -1.0; return ncclSuccess;
|
||||
}
|
||||
int logSize = log2i(info->nBytes>>6);
|
||||
if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
|
||||
if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
|
||||
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
|
||||
&& info->coll == ncclFuncAllReduce && info->nBytes/(info->comm->nChannels*info->comm->nRanks) >= 64) {
|
||||
lat *= info->comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
|
||||
int logSize = log2i(nBytes>>6);
|
||||
if (algorithm == NCCL_ALGO_TREE && logSize >= 0 && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
|
||||
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && comm->nNodes > 1
|
||||
&& coll == ncclFuncAllReduce && nBytes/(comm->nChannels*comm->nRanks) >= 64) {
|
||||
lat *= comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
|
||||
}
|
||||
// Tree pipelining saves latency in aggregation cases
|
||||
int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS);
|
||||
*time = lat * latCount + (info->nBytes) / (1000 * bw);
|
||||
int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_DEV_WORK_BATCH_COLLS);
|
||||
*time = lat * latCount + nBytes / (1000 * bw);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
+85
-53
@@ -272,56 +272,34 @@ ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml)
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src) {
|
||||
struct ncclXmlNode* topNode;
|
||||
NCCLCHECK(xmlFindTag(dst, "system", &topNode));
|
||||
static ncclResult_t xmlTopoFuseXmlRecursive(struct ncclXml* dst, struct ncclXmlNode* dstParent, struct ncclXmlNode* srcParent) {
|
||||
for (int i = 0; i < srcParent->nSubs; i++) {
|
||||
struct ncclXmlNode* srcNode = srcParent->subs[i];
|
||||
struct ncclXmlNode* dstNode;
|
||||
NCCLCHECK(xmlFindNode(dstParent, srcNode, &dstNode));
|
||||
if (dstNode == NULL) {
|
||||
NCCLCHECK(xmlAddTree(dst, dstParent, srcNode));
|
||||
} else {
|
||||
NCCLCHECK(xmlTopoFuseXmlRecursive(dst, dstNode, srcNode));
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
if (topNode == NULL) {
|
||||
ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src) {
|
||||
struct ncclXmlNode* topNodeDst;
|
||||
NCCLCHECK(xmlFindTag(dst, "system", &topNodeDst));
|
||||
|
||||
if (topNodeDst == NULL) {
|
||||
xmlAddTree(dst, NULL, src->nodes);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Fuse the CPUs with the first XML
|
||||
struct ncclXmlNode* srcCpu;
|
||||
NCCLCHECK(xmlFindTag(src, "cpu", &srcCpu));
|
||||
while (srcCpu) {
|
||||
const char* srcNumaId;
|
||||
const char* srcHostHash;
|
||||
NCCLCHECK(xmlGetAttr(srcCpu, "numaid", &srcNumaId));
|
||||
if (srcNumaId == NULL) {
|
||||
WARN("TopoFuseXmls : could not find CPU numa ID.");
|
||||
return ncclInternalError;
|
||||
}
|
||||
xmlGetAttr(srcCpu, "host_hash", &srcHostHash);
|
||||
if (srcHostHash == NULL)
|
||||
srcHostHash = "0";
|
||||
struct ncclXmlNode* topNodeSrc;
|
||||
NCCLCHECK(xmlFindTag(src, "system", &topNodeSrc));
|
||||
|
||||
// Search through the destination for a duplicate. Note that
|
||||
// this makes the complexity of this whole function O(n^2), but n
|
||||
// is expected to be small.
|
||||
struct ncclXmlNode* dstCpu;
|
||||
NCCLCHECK(xmlFindTag(dst, "cpu", &dstCpu));
|
||||
while (dstCpu) {
|
||||
const char* dstNumaId;
|
||||
const char* dstHostHash;
|
||||
NCCLCHECK(xmlGetAttr(dstCpu, "numaid", &dstNumaId));
|
||||
if (dstNumaId == NULL) {
|
||||
WARN("TopoFuseXmls : could not find CPU numa ID.");
|
||||
return ncclInternalError;
|
||||
}
|
||||
xmlGetAttr(dstCpu, "host_hash", &dstHostHash);
|
||||
if (dstHostHash == NULL)
|
||||
dstHostHash = "0";
|
||||
if (strcmp(srcNumaId, dstNumaId) == 0 && strcmp(srcHostHash, dstHostHash) == 0)
|
||||
break;
|
||||
NCCLCHECK(xmlTopoFuseXmlRecursive(dst, topNodeDst, topNodeSrc));
|
||||
|
||||
NCCLCHECK(xmlFindNextTag(dst, "cpu", dstCpu, &dstCpu));
|
||||
}
|
||||
// Only add the CPU if no duplicate was found
|
||||
if (dstCpu == NULL)
|
||||
NCCLCHECK(xmlAddTree(dst, topNode, srcCpu));
|
||||
NCCLCHECK(xmlFindNextTag(src, "cpu", srcCpu, &srcCpu));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -335,6 +313,11 @@ ncclResult_t ncclTopoXmlLoadNvlink(FILE* file, struct ncclXml* xml, struct ncclX
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoXmlLoadPciLink(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
|
||||
NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoXmlLoadC2c(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
|
||||
NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
|
||||
return ncclSuccess;
|
||||
@@ -357,8 +340,8 @@ ncclResult_t ncclTopoXmlLoadNic(FILE* file, struct ncclXml* xml, struct ncclXmlN
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoXmlLoadPci(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
|
||||
struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "gpu", ncclTopoXmlLoadGpu }, { "nic", ncclTopoXmlLoadNic} };
|
||||
NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 3));
|
||||
struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "gpu", ncclTopoXmlLoadGpu }, { "nic", ncclTopoXmlLoadNic}, { "pcilink", ncclTopoXmlLoadPciLink} };
|
||||
NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 4));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -423,6 +406,28 @@ static ncclResult_t getPciPath(const char* busId, char** path) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#include <dirent.h>
|
||||
static ncclResult_t getBcmLinks(const char* busId, int* nlinks, char** peers) {
|
||||
*nlinks = 0;
|
||||
*peers = NULL;
|
||||
char dirPath[] = "/sys/kernel/pci_switch_link/virtual_switch_links/0000:00:00.0";
|
||||
memcpylower(dirPath+sizeof("/sys/kernel/pci_switch_link/virtual_switch_links/")-1, busId, BUSID_SIZE-1);
|
||||
DIR *dir = opendir(dirPath);
|
||||
if (dir) {
|
||||
struct dirent* file;
|
||||
while ((file = readdir(dir)) != NULL) {
|
||||
if (strlen(file->d_name) != BUSID_SIZE-1) continue;
|
||||
char* path;
|
||||
if (getPciPath(file->d_name, &path) == ncclSystemError) continue;
|
||||
free(path);
|
||||
NCCLCHECK(ncclRealloc(peers, (*nlinks)*BUSID_SIZE, ((*nlinks)+1)*BUSID_SIZE));
|
||||
memcpy((*peers)+BUSID_SIZE*(*nlinks)++, file->d_name, BUSID_SIZE);
|
||||
}
|
||||
closedir(dir);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) {
|
||||
char filePath[PATH_MAX];
|
||||
sprintf(filePath, "%s/%s", path, fileName);
|
||||
@@ -541,10 +546,11 @@ ncclResult_t ncclTopoGetPciNode(struct ncclXml* xml, const char* busId, struct n
|
||||
// There can be trailing chars.
|
||||
// Returns non-zero when c is an ASCII hexadecimal digit (0-9, a-f, A-F).
int isHex(char c) { return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')); }

// Returns 1 when bdf is a PCI address of the exact form "XXXX:XX:XX.X"
// (12 characters, X being a hexadecimal digit), 0 otherwise.
// A NULL pointer and strings of any other length are rejected.
int checkBDFFormat(char* bdf) {
  // Positions that must hold a hexadecimal digit; 4, 7 and 10 are separators.
  static const int hexDigitPos[] = { 0, 1, 2, 3, 5, 6, 8, 9, 11 };

  if (bdf == NULL || strlen(bdf) != 12) return 0;
  if ((bdf[4] != ':') || (bdf[7] != ':') || (bdf[10] != '.')) return 0;
  // Check every digit position: isHex() is applied to the character itself,
  // never to the result of a comparison.
  for (size_t i = 0; i < sizeof(hexDigitPos)/sizeof(hexDigitPos[0]); i++) {
    if (isHex(bdf[hexDigitPos[i]]) == 0) return 0;
  }
  return 1;
}
|
||||
|
||||
@@ -608,6 +614,24 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
|
||||
NCCLCHECK(xmlSetAttr(pciNode, "link_width", ""));
|
||||
}
|
||||
}
|
||||
|
||||
const char* vendor;
|
||||
NCCLCHECK(xmlGetAttr(pciNode, "vendor", &vendor));
|
||||
if (vendor != NULL && strcmp(vendor, "0x1000") == 0) { // BCM switch, look for P2P connections
|
||||
int nlinks;
|
||||
char* peers;
|
||||
NCCLCHECK(getBcmLinks(busId, &nlinks, &peers));
|
||||
for (int l=0; l<nlinks; l++) {
|
||||
char* target = peers+l*BUSID_SIZE;
|
||||
struct ncclXmlNode* linkNode;
|
||||
NCCLCHECK(xmlGetSubKv(pciNode, "pcilink", &linkNode, "target", target));
|
||||
if (linkNode == NULL) {
|
||||
NCCLCHECK(xmlAddNode(xml, pciNode, "pcilink", &linkNode));
|
||||
NCCLCHECK(xmlSetAttr(linkNode, "target", target));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ncclXmlNode* parent = pciNode->parent;
|
||||
if (parent == NULL) {
|
||||
if (path) {
|
||||
@@ -911,25 +935,33 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node) {
|
||||
ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node, int* keep) {
|
||||
const char* str;
|
||||
NCCLCHECK(xmlGetAttr(node, "keep", &str));
|
||||
if (str && strcmp(str, "1") == 0) {
|
||||
NCCLCHECK(xmlUnsetAttr(node, "keep"));
|
||||
*keep = 1;
|
||||
} else {
|
||||
// Copy nSubs and subs as they could change as we trim recursively.
|
||||
struct ncclXmlNode* subs[MAX_SUBS];
|
||||
int nSubs = node->nSubs;
|
||||
memcpy(subs, node->subs, node->nSubs*sizeof(struct ncclXmlNode*));
|
||||
*keep = 0;
|
||||
for (int s=0; s<nSubs; s++) {
|
||||
NCCLCHECK(ncclTopoTrimXmlRec(subs[s]));
|
||||
int k = 0;
|
||||
NCCLCHECK(ncclTopoTrimXmlRec(subs[s], &k));
|
||||
*keep += k;
|
||||
}
|
||||
if (*keep == 0 && // Trim PCI switches or CPU with no used GPU/NIC under them.
|
||||
(strcmp(node->name, "pci") == 0 || strcmp(node->name, "cpu") == 0)) {
|
||||
NCCLCHECK(xmlRemoveNode(node));
|
||||
}
|
||||
if (node->nSubs == 0) NCCLCHECK(xmlRemoveNode(node));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
ncclResult_t ncclTopoTrimXml(struct ncclXml* xml) {
|
||||
NCCLCHECK(ncclTopoTrimXmlRec(xml->nodes));
|
||||
int keep = 0;
|
||||
NCCLCHECK(ncclTopoTrimXmlRec(xml->nodes, &keep));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
+24
-1
@@ -55,7 +55,7 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
|
||||
/* Remove unneeded parts */
|
||||
ncclResult_t ncclTopoTrimXml(struct ncclXml* xml);
|
||||
|
||||
/* Fuse multiple system XMLs into one, skipping duplicate CPUs */
|
||||
/* Fuse multiple system XMLs into one, skipping duplicate entries */
|
||||
ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src);
|
||||
/* Relocate pointers in XML to (de-)serialize the structure */
|
||||
ncclResult_t ncclTopoConvertXml(struct ncclXml* xml, uintptr_t base, int exp);
|
||||
@@ -172,6 +172,29 @@ static ncclResult_t xmlFindTagKv(struct ncclXml* xml, const char* tagName, struc
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t xmlFindNode(struct ncclXmlNode* parentNode, struct ncclXmlNode* searchNode, struct ncclXmlNode** node) {
|
||||
*node = NULL;
|
||||
// Search for the node at the current level only.
|
||||
for (int i=0; i<parentNode->nSubs; i++) {
|
||||
struct ncclXmlNode* n = parentNode->subs[i];
|
||||
if (strcmp(n->name, searchNode->name) == 0 && n->type == searchNode->type && n->nAttrs == searchNode->nAttrs) {
|
||||
int a;
|
||||
// Ensure that all the attributes are the same.
|
||||
for (a=0; a<searchNode->nAttrs; a++) {
|
||||
const char* val;
|
||||
NCCLCHECK(xmlGetAttr(n, searchNode->attrs[a].key, &val));
|
||||
if (!val || strcmp(val, searchNode->attrs[a].value))
|
||||
break;
|
||||
}
|
||||
if (a == searchNode->nAttrs) {
|
||||
*node = n;
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, const char* value) {
|
||||
int index;
|
||||
NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
|
||||
|
||||
+195
-66
@@ -10,6 +10,7 @@
|
||||
#include "transport.h"
|
||||
#include "channel.h"
|
||||
#include <assert.h>
|
||||
#include "bootstrap.h"
|
||||
|
||||
__thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting
|
||||
__thread ncclResult_t ncclGroupError = ncclSuccess;
|
||||
@@ -31,6 +32,7 @@ ncclResult_t ncclAsyncLaunch(
|
||||
) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
job->destroyFlag = comm->destroyFlag;
|
||||
if (ncclGroupDepth == 0) {
|
||||
ret = func(job);
|
||||
if (ret != ncclSuccess && undo) undo(job);
|
||||
@@ -40,11 +42,15 @@ ncclResult_t ncclAsyncLaunch(
|
||||
job->undo = undo;
|
||||
job->destructor = destructor;
|
||||
job->abortFlag = comm->abortFlag;
|
||||
job->abortFlagDev = comm->abortFlagDev;
|
||||
job->childAbortFlag = comm->childAbortFlag;
|
||||
job->childAbortFlagDev = comm->childAbortFlagDev;
|
||||
job->state = ncclGroupJobRunning;
|
||||
job->comm = comm;
|
||||
/* check if there are blocking and nonblocking comms at the same time in group. */
|
||||
if (ncclGroupBlocking == -1) {
|
||||
if (comm->destroyFlag) {
|
||||
ncclGroupBlocking = 1;
|
||||
} else if (ncclGroupBlocking == -1) {
|
||||
/* first met communicator */
|
||||
ncclGroupBlocking = comm->config.blocking;
|
||||
} else if (ncclGroupBlocking != comm->config.blocking) {
|
||||
@@ -98,11 +104,23 @@ exit:
|
||||
return ret;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupSimulateEnd, ncclSimInfo_t* simInfo);
|
||||
ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
NCCLCHECKGOTO(ncclGroupEndInternal(simInfo), ret, exit);
|
||||
TRACE_CALL("ncclGroupSimulateEnd()");
|
||||
exit:
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct ncclPreconnectJob {
|
||||
struct ncclAsyncJob base;
|
||||
struct ncclComm* comm;
|
||||
bool* algoNeedConnect;
|
||||
};
|
||||
ncclResult_t ncclPreconnectFunc(struct ncclAsyncJob* job_) {
|
||||
|
||||
ncclResult_t ncclP2PPreconnectFunc(struct ncclAsyncJob* job_) {
|
||||
struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_;
|
||||
struct ncclComm* comm = job->comm;
|
||||
CUDACHECK(cudaSetDevice(comm->cudaDev));
|
||||
@@ -111,6 +129,57 @@ ncclResult_t ncclPreconnectFunc(struct ncclAsyncJob* job_) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) {
|
||||
struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_;
|
||||
struct ncclComm* comm = job->comm;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
CUDACHECK(cudaSetDevice(comm->cudaDev));
|
||||
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; ++i) {
|
||||
if (job->algoNeedConnect[i]) {
|
||||
switch (i) {
|
||||
case NCCL_ALGO_RING: {
|
||||
NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail);
|
||||
break;
|
||||
}
|
||||
case NCCL_ALGO_TREE: {
|
||||
NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail);
|
||||
break;
|
||||
}
|
||||
case NCCL_ALGO_NVLS: {
|
||||
/* If we are using NVLS_TREE algo, we must mark NVLS algo to set up
|
||||
* NVLS intra-node buffer */
|
||||
NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail);
|
||||
break;
|
||||
}
|
||||
case NCCL_ALGO_NVLS_TREE: {
|
||||
NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail);
|
||||
break;
|
||||
}
|
||||
case NCCL_ALGO_COLLNET_CHAIN: {
|
||||
NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail);
|
||||
break;
|
||||
}
|
||||
case NCCL_ALGO_COLLNET_DIRECT: {
|
||||
NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
ret = ncclInternalError;
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
exit:
|
||||
free(job->algoNeedConnect);
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
static ncclResult_t doLaunches(struct ncclComm* head) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
struct ncclComm* cliqueComm0 = head->intraComm0;
|
||||
@@ -124,7 +193,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
|
||||
struct ncclComm* comm = cliqueHead;
|
||||
bool capturingYes = false, capturingNo = false;
|
||||
do {
|
||||
(ncclCudaGraphValid(comm->tasks.capturingGraph) ? capturingYes : capturingNo) = true;
|
||||
(ncclCudaGraphValid(comm->planner.capturingGraph) ? capturingYes : capturingNo) = true;
|
||||
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure);
|
||||
NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure);
|
||||
if (useBarrier) ncclCommIntraBarrierIn(comm, 1);
|
||||
@@ -150,19 +219,19 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
|
||||
// Barrier reduction result tells us if this was the final round.
|
||||
moreRounds = 0 != ncclCommIntraBarrierOut(comm);
|
||||
} else {
|
||||
moreRounds |= comm->unlaunchedPlansHead != nullptr;
|
||||
moreRounds |= comm->planner.unlaunchedPlansHead != nullptr;
|
||||
}
|
||||
if (moreRounds) {
|
||||
// Pop next unlaunched kernel
|
||||
struct ncclKernelPlan* plan = comm->unlaunchedPlansHead;
|
||||
struct ncclKernelPlan* plan = comm->planner.unlaunchedPlansHead;
|
||||
if (plan != nullptr) {
|
||||
comm->unlaunchedPlansHead = plan->next;
|
||||
comm->planner.unlaunchedPlansHead = plan->next;
|
||||
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure);
|
||||
NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure);
|
||||
NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure);
|
||||
}
|
||||
// Barrier reduction input indicates if we require further rounds.
|
||||
if (useBarrier) ncclCommIntraBarrierIn(comm, comm->unlaunchedPlansHead != nullptr ? 1 : 0);
|
||||
if (useBarrier) ncclCommIntraBarrierIn(comm, comm->planner.unlaunchedPlansHead != nullptr ? 1 : 0);
|
||||
if (plan != nullptr) {
|
||||
NCCLCHECKGOTO(ncclLaunchKernelAfter_NoCuda(comm, plan), result, failure);
|
||||
}
|
||||
@@ -210,37 +279,29 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
|
||||
// is needed.
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
comm->tasks.peers[i].sendSeen = false;
|
||||
comm->tasks.peers[i].recvSeen = false;
|
||||
comm->connectSend[i] = 0UL;
|
||||
comm->connectRecv[i] = 0UL;
|
||||
}
|
||||
comm->unlaunchedPlansHead = nullptr;
|
||||
// Reclaim abandoned kernel plan memory. Note ncclWork structs were already
|
||||
// reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`.
|
||||
while (!ncclIntruQueueEmpty(&comm->planQueue)) {
|
||||
struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue);
|
||||
while (!ncclIntruQueueEmpty(&comm->planner.planQueue)) {
|
||||
struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planner.planQueue);
|
||||
// Persistent plans will be reclaimed via the callbackQueue when the
|
||||
// graph drops its UserObject reference.
|
||||
if (!plan->persistent) {
|
||||
for (int c = 0; c < MAXCHANNELS; c++) {
|
||||
while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) {
|
||||
struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue);
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop);
|
||||
}
|
||||
while (!ncclIntruQueueEmpty(&plan->proxyOpQueue)) {
|
||||
struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->proxyOpQueue);
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop);
|
||||
}
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
|
||||
}
|
||||
}
|
||||
// Reset comm->tasks to empty.
|
||||
comm->tasks.nTasksColl = 0;
|
||||
comm->tasks.nTasksP2p = 0;
|
||||
comm->tasks.workBytesTotal = 0;
|
||||
comm->tasks.streams = nullptr;
|
||||
ncclIntruQueueConstruct(&comm->tasks.collQueue);
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue);
|
||||
ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue);
|
||||
|
||||
{ // Reset comm->planner to empty.
|
||||
ncclKernelPlanner::Peer* tmp = comm->planner.peers;
|
||||
memset(&comm->planner, 0, sizeof(comm->planner));
|
||||
comm->planner.peers = tmp;
|
||||
memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0]));
|
||||
}
|
||||
|
||||
if (!comm->config.blocking)
|
||||
@@ -260,37 +321,10 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
|
||||
return;
|
||||
}
|
||||
|
||||
static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
|
||||
int savedDev;
|
||||
static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsMain, volatile bool *groupAbortFlag) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
bool jobsDone = false;
|
||||
bool errorJobAbortFlag = false;
|
||||
struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_;
|
||||
struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr;
|
||||
struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr;
|
||||
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsMain = gjob->asyncJobsPtr;
|
||||
volatile bool *groupAbortFlag = gjob->abortFlagPtr;
|
||||
|
||||
CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail);
|
||||
|
||||
if (groupCommPreconnectHeadMain != nullptr) {
|
||||
struct ncclComm* comm = groupCommPreconnectHeadMain;
|
||||
do {
|
||||
struct ncclPreconnectJob* job;
|
||||
NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
|
||||
job->base.func = ncclPreconnectFunc;
|
||||
job->base.undo = nullptr;
|
||||
job->base.destructor = free;
|
||||
job->base.state = ncclGroupJobRunning;
|
||||
job->base.abortFlag = comm->abortFlag;
|
||||
job->comm = comm;
|
||||
ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
|
||||
|
||||
struct ncclComm* next = comm->preconnectNext;
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
comm = next;
|
||||
} while (comm != nullptr);
|
||||
}
|
||||
|
||||
if (!ncclIntruQueueEmpty(asyncJobsMain)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueHead(asyncJobsMain);
|
||||
@@ -321,9 +355,13 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
|
||||
assert(state == ncclGroupJobJoined);
|
||||
}
|
||||
|
||||
if (__atomic_load_n(groupAbortFlag, __ATOMIC_RELAXED) || errorJobAbortFlag == true) {
|
||||
__atomic_store_n(job->abortFlag, 1, __ATOMIC_RELAXED);
|
||||
if (job->childAbortFlag) __atomic_store_n(job->childAbortFlag, 1, __ATOMIC_RELAXED);
|
||||
if (!job->destroyFlag && (__atomic_load_n(groupAbortFlag, __ATOMIC_ACQUIRE) || errorJobAbortFlag == true)) {
|
||||
__atomic_store_n(job->abortFlag, 1, __ATOMIC_RELEASE);
|
||||
__atomic_store_n(job->abortFlagDev, 1, __ATOMIC_RELEASE);
|
||||
if (job->childAbortFlag) {
|
||||
__atomic_store_n(job->childAbortFlag, 1, __ATOMIC_RELEASE);
|
||||
__atomic_store_n(job->childAbortFlagDev, 1, __ATOMIC_RELEASE);
|
||||
}
|
||||
}
|
||||
|
||||
job = job->next;
|
||||
@@ -335,17 +373,86 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
|
||||
if (ret != ncclSuccess) goto fail;
|
||||
}
|
||||
|
||||
if (groupCommHeadMain != nullptr) {
|
||||
NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail);
|
||||
}
|
||||
|
||||
while (!ncclIntruQueueEmpty(asyncJobsMain)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
|
||||
if (job->comm && !job->comm->config.blocking)
|
||||
if (!job->destroyFlag && job->comm && !job->comm->config.blocking)
|
||||
(void) ncclCommSetAsyncError(job->comm, ret);
|
||||
if (job->destructor) job->destructor((void*)job);
|
||||
}
|
||||
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInfo = NULL) {
|
||||
int savedDev;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_;
|
||||
struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr;
|
||||
struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr;
|
||||
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsMain = gjob->asyncJobsPtr;
|
||||
bool *groupAbortFlag = gjob->abortFlagPtr;
|
||||
|
||||
CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail);
|
||||
|
||||
if (!simInfo && groupCommPreconnectHeadMain != nullptr) {
|
||||
struct ncclComm* comm = groupCommPreconnectHeadMain;
|
||||
do {
|
||||
struct ncclPreconnectJob* job;
|
||||
NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
|
||||
job->base.func = ncclP2PPreconnectFunc;
|
||||
job->base.undo = nullptr;
|
||||
job->base.destructor = free;
|
||||
job->base.state = ncclGroupJobRunning;
|
||||
job->base.abortFlag = comm->abortFlag;
|
||||
job->base.abortFlagDev = comm->abortFlagDev;
|
||||
job->comm = comm;
|
||||
ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
|
||||
|
||||
struct ncclComm* next = comm->preconnectNext;
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
comm = next;
|
||||
} while (comm != nullptr);
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail);
|
||||
|
||||
/* Connect channels at runtime if cumem is supported */
|
||||
if (groupCommHeadMain != nullptr) {
|
||||
struct ncclComm* comm = groupCommHeadMain;
|
||||
|
||||
do {
|
||||
bool needConnect = false;
|
||||
bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
|
||||
memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);
|
||||
|
||||
NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);
|
||||
|
||||
if (comm->cuMemSupport && needConnect) {
|
||||
struct ncclPreconnectJob* job;
|
||||
NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
|
||||
job->base.func = ncclCollPreconnectFunc;
|
||||
job->base.undo = nullptr;
|
||||
job->base.destructor = free;
|
||||
job->base.state = ncclGroupJobRunning;
|
||||
job->base.abortFlag = comm->abortFlag;
|
||||
job->comm = comm;
|
||||
NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
|
||||
memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
|
||||
ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
|
||||
}
|
||||
comm = comm->groupNext;
|
||||
} while (comm);
|
||||
|
||||
NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail);
|
||||
}
|
||||
|
||||
if ((!simInfo) && (groupCommHeadMain != nullptr)) {
|
||||
NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail);
|
||||
}
|
||||
|
||||
while (groupCommHeadMain != nullptr) {
|
||||
struct ncclComm* comm = groupCommHeadMain;
|
||||
struct ncclComm* next = comm->groupNext;
|
||||
@@ -365,8 +472,17 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclGroupEndInternal() {
|
||||
// Single-argument adapter around groupLaunch(): it forwards the job and
// passes no simulation info, which is the same call the defaulted
// `simInfo = NULL` parameter of groupLaunch() would produce.
static ncclResult_t groupLaunchNonBlocking(struct ncclAsyncJob *job_) {
  return groupLaunch(job_, /* simInfo = */ NULL);
}
|
||||
|
||||
ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
ncclSimInfo_t internalSimInfo = NCCL_SIM_INFO_INITIALIZER;
|
||||
ncclSimInfo_t* internalSimInfoPtr = NULL;
|
||||
size_t realSize = 0;
|
||||
|
||||
internalSimInfo.magic = 0;
|
||||
|
||||
if (ncclGroupDepth == 0) {
|
||||
WARN("ncclGroupEnd: not in a group call.");
|
||||
@@ -378,6 +494,18 @@ ncclResult_t ncclGroupEndInternal() {
|
||||
|
||||
if ((ret = ncclGroupError) != ncclSuccess) goto fail;
|
||||
|
||||
if (simInfo) {
|
||||
memcpy((void*)&realSize, (void*)&simInfo->size, sizeof(size_t));
|
||||
realSize = realSize > sizeof(ncclSimInfo_t) ? sizeof(ncclSimInfo_t) : realSize;
|
||||
memcpy((void*)&internalSimInfo, (void*)simInfo, realSize);
|
||||
if (internalSimInfo.magic != 0x74685283) {
|
||||
WARN("ncclSimInfo_t argument not initialized via NCCL_SIM_INFO_INITIALIZER");
|
||||
ret = ncclInvalidArgument;
|
||||
goto fail;
|
||||
}
|
||||
internalSimInfoPtr = &internalSimInfo;
|
||||
}
|
||||
|
||||
if (ncclGroupCommHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs) || ncclGroupCommPreconnectHead != nullptr) {
|
||||
ncclGroupJobMain.groupCommHeadPtr = &ncclGroupCommHead;
|
||||
ncclGroupJobMain.groupCommPreconnectHeadPtr = &ncclGroupCommPreconnectHead;
|
||||
@@ -410,12 +538,13 @@ ncclResult_t ncclGroupEndInternal() {
|
||||
} while (comm);
|
||||
}
|
||||
|
||||
ncclGroupJobMainPtr->base.func = groupLaunch;
|
||||
ncclGroupJobMainPtr->base.func = groupLaunchNonBlocking;
|
||||
SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail);
|
||||
ret = ncclInProgress;
|
||||
} else {
|
||||
/* blocking group */
|
||||
NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base), ret, fail);
|
||||
NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base, internalSimInfoPtr), ret, fail);
|
||||
if (simInfo) memcpy((void*)simInfo, (void*)internalSimInfoPtr, realSize);
|
||||
groupResetJobState(ncclGroupJobMainPtr);
|
||||
}
|
||||
}
|
||||
@@ -438,7 +567,7 @@ ncclResult_t ncclGroupJobComplete(struct ncclGroupJob* groupJob) {
|
||||
|
||||
ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob) {
|
||||
if (groupJob && groupJob->initialized) {
|
||||
__atomic_store_n(groupJob->abortFlagPtr, true, __ATOMIC_RELAXED);
|
||||
__atomic_store_n(groupJob->abortFlagPtr, true, __ATOMIC_RELEASE);
|
||||
NCCLCHECK(ncclGroupJobComplete(groupJob));
|
||||
}
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -1,47 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_ALIGN_H_
|
||||
#define NCCL_ALIGN_H_
|
||||
|
||||
// DIVUP(x, y): integer division of x by y rounded up, computed as (x+y-1)/y.
// The arguments are expanded more than once, so they must be free of side effects.
#define DIVUP(x, y) \
|
||||
(((x)+(y)-1)/(y))
|
||||
|
||||
// ROUNDUP(x, y): x rounded up to a multiple of y (DIVUP(x, y) times y).
#define ROUNDUP(x, y) \
|
||||
(DIVUP((x), (y))*(y))
|
||||
|
||||
// ALIGN_POWER(x, y): ROUNDUP(x, y) when x > y, otherwise y/(y/x).
// NOTE(review): the second branch divides by x and by y/x, so x must be non-zero;
// presumably both arguments are meant to be powers of two - confirm with callers.
#define ALIGN_POWER(x, y) \
|
||||
((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x))))
|
||||
|
||||
// ALIGN_SIZE(size, align): rounds `size` up to a multiple of `align` IN PLACE
// (it expands to an assignment to `size`). The expansion already ends with ';'
// and `size` is not parenthesized, so pass a plain lvalue.
#define ALIGN_SIZE(size, align) \
|
||||
size = ((size + (align) - 1) / (align)) * (align);
|
||||
|
||||
#if !__CUDA_ARCH__
|
||||
#ifndef __host__
|
||||
#define __host__
|
||||
#endif
|
||||
#ifndef __device__
|
||||
#define __device__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
template<typename X, typename Y, typename Z = decltype(X()+Y())>
|
||||
__host__ __device__ constexpr Z divUp(X x, Y y) {
|
||||
return (x+y-1)/y;
|
||||
}
|
||||
|
||||
template<typename X, typename Y, typename Z = decltype(X()+Y())>
|
||||
__host__ __device__ constexpr Z roundUp(X x, Y y) {
|
||||
return (x+y-1) - (x+y-1)%y;
|
||||
}
|
||||
|
||||
// assumes second argument is a power of 2
|
||||
template<typename X, typename Z = decltype(X()+int())>
|
||||
__host__ __device__ constexpr Z alignUp(X x, int a) {
|
||||
return (x+a-1) & Z(-a);
|
||||
}
|
||||
|
||||
#endif
|
||||
+73
-52
@@ -9,7 +9,7 @@
|
||||
|
||||
#include "nccl.h"
|
||||
#include "checks.h"
|
||||
#include "align.h"
|
||||
#include "bitops.h"
|
||||
#include "utils.h"
|
||||
#include "p2p.h"
|
||||
#include <sys/mman.h>
|
||||
@@ -19,18 +19,25 @@
|
||||
|
||||
uint64_t clockNano(); // from utils.h with which we have a circular dependency
|
||||
|
||||
// Element size in bytes used when sizing allocations of T.
// For every complete type this is sizeof(T).
template <typename T>
constexpr size_t ncclSizeOfT() {
  return sizeof(T);
}

// sizeof(void) is ill-formed, so void is counted as one byte per element.
template <>
constexpr size_t ncclSizeOfT<void>() {
  return 1;
}
|
||||
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish);
|
||||
memset(*ptr, 0, nelem*sizeof(T));
|
||||
if (nelem > 0) {
|
||||
CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*ncclSizeOfT<T>(), cudaHostAllocMapped), result, finish);
|
||||
memset(*ptr, 0, nelem*ncclSizeOfT<T>());
|
||||
}
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem*sizeof(T));
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA host alloc %ld bytes", nelem*ncclSizeOfT<T>());
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
@@ -42,14 +49,18 @@ inline ncclResult_t ncclCudaHostFree(void* ptr) {
|
||||
|
||||
template <typename T>
|
||||
ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
void* p = malloc(nelem*sizeof(T));
|
||||
if (p == NULL) {
|
||||
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
|
||||
return ncclSystemError;
|
||||
if (nelem > 0) {
|
||||
void* p = malloc(nelem*ncclSizeOfT<T>());
|
||||
if (p == NULL) {
|
||||
WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT<T>());
|
||||
return ncclSystemError;
|
||||
}
|
||||
//INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), p);
|
||||
memset(p, 0, nelem*ncclSizeOfT<T>());
|
||||
*ptr = (T*)p;
|
||||
} else {
|
||||
*ptr = NULL;
|
||||
}
|
||||
//INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p);
|
||||
memset(p, 0, nelem*sizeof(T));
|
||||
*ptr = (T*)p;
|
||||
return ncclSuccess;
|
||||
}
|
||||
#define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
@@ -60,16 +71,16 @@ ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
|
||||
if (nelem == oldNelem) return ncclSuccess;
|
||||
|
||||
T* oldp = *ptr;
|
||||
T* p = (T*)malloc(nelem*sizeof(T));
|
||||
T* p = (T*)malloc(nelem*ncclSizeOfT<T>());
|
||||
if (p == NULL) {
|
||||
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
|
||||
WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT<T>());
|
||||
return ncclSystemError;
|
||||
}
|
||||
memcpy(p, oldp, oldNelem*sizeof(T));
|
||||
memcpy(p, oldp, oldNelem*ncclSizeOfT<T>());
|
||||
free(oldp);
|
||||
memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T));
|
||||
memset(p+oldNelem, 0, (nelem-oldNelem)*ncclSizeOfT<T>());
|
||||
*ptr = (T*)p;
|
||||
INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr);
|
||||
INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*ncclSizeOfT<T>(), nelem*ncclSizeOfT<T>(), *ptr);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -111,7 +122,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
|
||||
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
||||
CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
|
||||
if (handlep) *handlep = handle;
|
||||
TRACE(NCCL_ALLOC, "CuMem Alloc Size %zi pointer %p handle %llx", size, *ptr, handle);
|
||||
TRACE(NCCL_ALLOC, "CuMem Alloc Size %zu pointer %p handle %llx", size, *ptr, handle);
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -123,7 +134,7 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) {
|
||||
CUCHECK(cuMemRetainAllocationHandle(&handle, ptr));
|
||||
CUCHECK(cuMemRelease(handle));
|
||||
CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
|
||||
TRACE(NCCL_ALLOC, "CuMem Free Size %zi pointer %p handle 0x%llx", size, ptr, handle);
|
||||
TRACE(NCCL_ALLOC, "CuMem Free Size %zu pointer %p handle 0x%llx", size, ptr, handle);
|
||||
CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size));
|
||||
CUCHECK(cuMemRelease(handle));
|
||||
CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
|
||||
@@ -151,15 +162,17 @@ ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, in
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (ncclCuMemEnable()) {
|
||||
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish);
|
||||
} else {
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
if (nelem > 0) {
|
||||
if (ncclCuMemEnable()) {
|
||||
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT<T>()), result, finish);
|
||||
} else {
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT<T>()), result, finish);
|
||||
}
|
||||
}
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T));
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA malloc %ld bytes", nelem*ncclSizeOfT<T>());
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define ncclCudaMalloc(...) ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
@@ -170,21 +183,23 @@ ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, in
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
// Need a side stream so as not to interfere with graph capture.
|
||||
cudaStream_t stream;
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||
if (ncclCuMemEnable()) {
|
||||
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish);
|
||||
} else {
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
if (nelem > 0) {
|
||||
// Need a side stream so as not to interfere with graph capture.
|
||||
cudaStream_t stream;
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||
if (ncclCuMemEnable()) {
|
||||
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT<T>()), result, finish);
|
||||
} else {
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT<T>()), result, finish);
|
||||
}
|
||||
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT<T>(), stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
|
||||
}
|
||||
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T));
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc %ld bytes", nelem*ncclSizeOfT<T>());
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
@@ -195,16 +210,18 @@ ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (ncclCuMemEnable()) {
|
||||
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish);
|
||||
} else {
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
if (nelem > 0) {
|
||||
if (ncclCuMemEnable()) {
|
||||
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT<T>()), result, finish);
|
||||
} else {
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT<T>()), result, finish);
|
||||
}
|
||||
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT<T>(), stream), result, finish);
|
||||
}
|
||||
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem*sizeof(T));
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc async %ld bytes", nelem*ncclSizeOfT<T>());
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
@@ -230,7 +247,7 @@ ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stre
|
||||
ncclResult_t result = ncclSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*sizeof(T), cudaMemcpyDefault, stream), result, finish);
|
||||
CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*ncclSizeOfT<T>(), cudaMemcpyDefault, stream), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
@@ -256,13 +273,17 @@ finish:
|
||||
// allocated on separate pages as those pages will be marked DONTFORK
|
||||
// and if they are shared, that could cause a crash in a child process
|
||||
// Allocates `size` zeroed bytes aligned to the system page size. The allocation
// is rounded up to a whole number of pages, so it never shares a page with
// another allocation.
//
//   ptr      - receives the allocation; set to NULL when size == 0.
//              Left untouched when an error is returned.
//   size     - requested size in bytes; only the first `size` bytes are zeroed.
//   filefunc - caller location string, used only in the INFO log line.
//   line     - caller line number, used only in the INFO log line.
//
// Returns ncclSuccess, or ncclSystemError when the page size cannot be
// queried, the rounded size overflows, or posix_memalign fails.
inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
  if (size > 0) {
    // sysconf reports errors as -1; do not let that turn into a huge size_t.
    long page_size = sysconf(_SC_PAGESIZE);
    if (page_size <= 0) return ncclSystemError;
    // Keep the rounded size in size_t: an int would truncate sizes >= 2 GiB.
    size_t size_aligned = ROUNDUP(size, (size_t)page_size);
    if (size_aligned < size) return ncclSystemError;  // size + page_size - 1 wrapped around
    void* p = NULL;
    int ret = posix_memalign(&p, (size_t)page_size, size_aligned);
    if (ret != 0) return ncclSystemError;
    memset(p, 0, size);
    *ptr = p;
  } else {
    *ptr = NULL;
  }
  INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %zu pointer %p", filefunc, line, size, *ptr);
  return ncclSuccess;
}
|
||||
|
||||
@@ -0,0 +1,277 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_BITOPS_H_
|
||||
#define NCCL_BITOPS_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#if !__NVCC__
|
||||
#ifndef __host__
|
||||
#define __host__
|
||||
#endif
|
||||
#ifndef __device__
|
||||
#define __device__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// DIVUP(x, y): integer division of x by y rounded up, computed as (x+y-1)/y.
// The arguments are expanded more than once, so they must be free of side effects.
#define DIVUP(x, y) \
|
||||
(((x)+(y)-1)/(y))
|
||||
|
||||
// ROUNDUP(x, y): x rounded up to a multiple of y (DIVUP(x, y) times y).
#define ROUNDUP(x, y) \
|
||||
(DIVUP((x), (y))*(y))
|
||||
|
||||
// ALIGN_POWER(x, y): ROUNDUP(x, y) when x > y, otherwise y/(y/x).
// NOTE(review): the second branch divides by x and by y/x, so x must be non-zero;
// presumably both arguments are meant to be powers of two - confirm with callers.
#define ALIGN_POWER(x, y) \
|
||||
((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x))))
|
||||
|
||||
// ALIGN_SIZE(size, align): rounds `size` up to a multiple of `align` IN PLACE
// (it expands to an assignment to `size`). The expansion already ends with ';'
// and `size` is not parenthesized, so pass a plain lvalue.
#define ALIGN_SIZE(size, align) \
|
||||
size = ((size + (align) - 1) / (align)) * (align);
|
||||
|
||||
template<typename X, typename Y, typename Z = decltype(X()+Y())>
|
||||
__host__ __device__ constexpr Z divUp(X x, Y y) {
|
||||
return (x+y-1)/y;
|
||||
}
|
||||
|
||||
template<typename X, typename Y, typename Z = decltype(X()+Y())>
|
||||
__host__ __device__ constexpr Z roundUp(X x, Y y) {
|
||||
return (x+y-1) - (x+y-1)%y;
|
||||
}
|
||||
template<typename X, typename Y, typename Z = decltype(X()+Y())>
|
||||
__host__ __device__ constexpr Z roundDown(X x, Y y) {
|
||||
return x - x%y;
|
||||
}
|
||||
|
||||
// assumes second argument is a power of 2
|
||||
template<typename X, typename Z = decltype(X()+int())>
|
||||
__host__ __device__ constexpr Z alignUp(X x, int a) {
|
||||
return (x + a-1) & Z(-a);
|
||||
}
|
||||
// assumes second argument is a power of 2
|
||||
template<typename X, typename Z = decltype(X()+int())>
|
||||
__host__ __device__ constexpr Z alignDown(X x, int a) {
|
||||
return x & Z(-a);
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ int countOneBits(Int x) {
|
||||
#if __CUDA_ARCH__
|
||||
if (sizeof(Int) <= sizeof(unsigned int)) {
|
||||
return __popc((unsigned int)x);
|
||||
} else if (sizeof(Int) <= sizeof(unsigned long long)) {
|
||||
return __popcll((unsigned long long)x);
|
||||
} else {
|
||||
static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size.");
|
||||
return -1;
|
||||
}
|
||||
#else
|
||||
if (sizeof(Int) <= sizeof(unsigned int)) {
|
||||
return __builtin_popcount((unsigned int)x);
|
||||
} else if (sizeof(Int) <= sizeof(unsigned long)) {
|
||||
return __builtin_popcountl((unsigned long)x);
|
||||
} else if (sizeof(Int) <= sizeof(unsigned long long)) {
|
||||
return __builtin_popcountll((unsigned long long)x);
|
||||
} else {
|
||||
static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size.");
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Returns index of first one bit or returns -1 if mask is zero.
|
||||
template<typename Int>
|
||||
inline __host__ __device__ int firstOneBit(Int mask) {
|
||||
int i;
|
||||
#if __CUDA_ARCH__
|
||||
if (sizeof(Int) <= sizeof(int)) {
|
||||
i = __ffs((int)mask);
|
||||
} else if (sizeof(Int) <= sizeof(long long)) {
|
||||
i = __ffsll((long long)mask);
|
||||
} else {
|
||||
static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size.");
|
||||
}
|
||||
#else
|
||||
if (sizeof(Int) <= sizeof(int)) {
|
||||
i = __builtin_ffs((int)mask);
|
||||
} else if (sizeof(Int) <= sizeof(long)) {
|
||||
i = __builtin_ffsl((long)mask);
|
||||
} else if (sizeof(Int) <= sizeof(long long)) {
|
||||
i = __builtin_ffsll((long long)mask);
|
||||
} else {
|
||||
static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size.");
|
||||
}
|
||||
#endif
|
||||
return i-1;
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ int popFirstOneBit(Int* mask) {
|
||||
Int tmp = *mask;
|
||||
*mask &= *mask-1;
|
||||
return firstOneBit(tmp);
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ int log2Down(Int x) {
|
||||
int w, n;
|
||||
#if __CUDA_ARCH__
|
||||
if (sizeof(Int) <= sizeof(int)) {
|
||||
w = 8*sizeof(int);
|
||||
n = __clz((int)x);
|
||||
} else if (sizeof(Int) <= sizeof(long long)) {
|
||||
w = 8*sizeof(long long);
|
||||
n = __clzll((long long)x);
|
||||
} else {
|
||||
static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size.");
|
||||
}
|
||||
#else
|
||||
if (x == 0) {
|
||||
return -1;
|
||||
} else if (sizeof(Int) <= sizeof(unsigned int)) {
|
||||
w = 8*sizeof(unsigned int);
|
||||
n = __builtin_clz((unsigned int)x);
|
||||
} else if (sizeof(Int) <= sizeof(unsigned long)) {
|
||||
w = 8*sizeof(unsigned long);
|
||||
n = __builtin_clzl((unsigned long)x);
|
||||
} else if (sizeof(Int) <= sizeof(unsigned long long)) {
|
||||
w = 8*sizeof(unsigned long long);
|
||||
n = __builtin_clzll((unsigned long long)x);
|
||||
} else {
|
||||
static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size.");
|
||||
}
|
||||
#endif
|
||||
return (w-1)-n;
|
||||
}
|
||||
|
||||
// Returns the number of significant bits of (x-1), which for positive x is
// ceil(log2(x)). An input of 0 is not decremented and yields 0.
// Int may be any integer type no wider than `long long`.
template<typename Int>
inline __host__ __device__ int log2Up(Int x) {
  static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size.");
  // Rounding up to a power of two is the same as measuring the bit length of x-1.
  if (x != 0) x -= 1;
#if __CUDA_ARCH__
  // Device intrinsics are applied directly, including to a zero argument.
  if (sizeof(Int) <= sizeof(int)) {
    return int(8*sizeof(int)) - __clz((int)x);
  }
  return int(8*sizeof(long long)) - __clzll((long long)x);
#else
  // The GCC/Clang builtins are undefined for 0, so that case is answered first.
  if (x == 0) return 0;
  if (sizeof(Int) <= sizeof(unsigned int)) {
    return int(8*sizeof(unsigned int)) - __builtin_clz((unsigned int)x);
  }
  if (sizeof(Int) <= sizeof(unsigned long)) {
    return int(8*sizeof(unsigned long)) - __builtin_clzl((unsigned long)x);
  }
  return int(8*sizeof(unsigned long long)) - __builtin_clzll((unsigned long long)x);
#endif
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ Int pow2Up(Int x) {
|
||||
return Int(1)<<log2Up(x);
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ Int pow2Down(Int x) {
|
||||
return Int(1)<<log2Down(x);
|
||||
}
|
||||
|
||||
// Host-only helper: reverses the bit order inside every aligned group of
// nSubBits bits of x. Each recursion level swaps the two halves of every group
// and then recurses on groups of half the size, stopping at single bits.
template<typename UInt, int nSubBits>
inline __host__ UInt reverseSubBits(UInt x) {
  static_assert(8*sizeof(UInt) <= 64, "Unsupported integer type.");
  constexpr int wordBits = 8*sizeof(UInt);

  // A group of one bit is already reversed.
  if (nSubBits == 1) return x;

  if (nSubBits >= 16 && wordBits == nSubBits) {
    // The group is the whole word: reverse the byte order with a builtin, then
    // only the bits inside each byte remain to be reversed.
    switch (wordBits) {
    case 16: x = __builtin_bswap16(x); break;
    case 32: x = __builtin_bswap32(x); break;
    case 64: x = __builtin_bswap64(x); break;
    default: break;
    }
    return reverseSubBits<UInt, 8>(x);
  }

  // Mask selecting the low half of every nSubBits-wide group
  // (e.g. 0x0f0f... for nSubBits == 8).
  constexpr int half = nSubBits/2;
  const UInt lowHalves = UInt(-1)/((UInt(1)<<half)+1);
  x = (x & lowHalves)<<half | (x & ~lowHalves)>>half;
  return reverseSubBits<UInt, nSubBits/2>(x);
}
|
||||
|
||||
// Maps a built-in integer type to the unsigned type of the same rank through
// the nested `type` alias. Only the specializations listed here exist; the
// primary template is intentionally left undefined.
template<typename T> struct ncclToUnsigned;

// char family
template<> struct ncclToUnsigned<char>               { using type = unsigned char; };
template<> struct ncclToUnsigned<signed char>        { using type = unsigned char; };
template<> struct ncclToUnsigned<unsigned char>      { using type = unsigned char; };
// short
template<> struct ncclToUnsigned<signed short>       { using type = unsigned short; };
template<> struct ncclToUnsigned<unsigned short>     { using type = unsigned short; };
// int
template<> struct ncclToUnsigned<signed int>         { using type = unsigned int; };
template<> struct ncclToUnsigned<unsigned int>       { using type = unsigned int; };
// long
template<> struct ncclToUnsigned<signed long>        { using type = unsigned long; };
template<> struct ncclToUnsigned<unsigned long>      { using type = unsigned long; };
// long long
template<> struct ncclToUnsigned<signed long long>   { using type = unsigned long long; };
template<> struct ncclToUnsigned<unsigned long long> { using type = unsigned long long; };
|
||||
|
||||
// Reverse the bottom nBits bits of x. The top bits will be overwritten with 0's.
|
||||
template<typename Int>
|
||||
inline __host__ __device__ Int reverseBits(Int x, int nBits) {
|
||||
using UInt = typename ncclToUnsigned<Int>::type;
|
||||
union { UInt ux; Int sx; };
|
||||
sx = x;
|
||||
#if __CUDA_ARCH__
|
||||
if (sizeof(Int) <= sizeof(unsigned int)) {
|
||||
ux = __brev(ux);
|
||||
} else if (sizeof(Int) <= sizeof(unsigned long long)) {
|
||||
ux = __brevll(ux);
|
||||
} else {
|
||||
static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer type.");
|
||||
}
|
||||
#else
|
||||
ux = reverseSubBits<UInt, 8*sizeof(UInt)>(ux);
|
||||
#endif
|
||||
ux = nBits==0 ? 0 : ux>>(8*sizeof(UInt)-nBits);
|
||||
return sx;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Custom 8 bit floating point format for approximating 32 bit uints. This format
|
||||
// has nearly the full range of uint32_t except it only keeps the top 3 bits
|
||||
// beneath the leading 1 bit and thus has a max value of 0xf0000000.
|
||||
|
||||
// Encodes x as a small floating point number keeping `bitsPerPow2` bits below
// the leading one bit. Values whose leading bit index is below bitsPerPow2 are
// stored as-is with a zero exponent; larger values drop their low bits.
// The result is (exponent << bitsPerPow2) | mantissa.
inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
  // Index of the highest set bit; `x|1` makes an input of 0 behave like 1.
#if __CUDA_ARCH__
  const int msb = 31-__clz(x|1);
#else
  const int msb = 31-__builtin_clz(x|1);
#endif
  const uint32_t fracMask = (1u<<bitsPerPow2)-1;

  if (msb < bitsPerPow2) {
    // Small value: exponent field is 0 and the mantissa is x itself.
    return x & fracMask;
  }
  // Keep the bitsPerPow2 bits directly beneath the leading one bit.
  const uint32_t mantissa = (x>>(msb-bitsPerPow2)) & fracMask;
  const uint32_t exponent = msb-(bitsPerPow2-1);
  return exponent<<bitsPerPow2 | mantissa;
}
|
||||
|
||||
// Inverse of u32fpEncode: rebuilds the (approximate) 32-bit value from an
// encoding of the form (exponent << bitsPerPow2) | mantissa.
//
// x            encoded value
// bitsPerPow2  number of mantissa bits; must match the value used to encode
//
// A zero exponent means the mantissa is the value itself. Otherwise the
// implicit leading one bit sits directly above the mantissa bits and the whole
// thing is shifted left by exponent-1.
inline __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {
  uint32_t exponent = x>>bitsPerPow2;
  // The implicit leading bit is at position bitsPerPow2 (0x8 when
  // bitsPerPow2 == 3); deriving it from the parameter keeps decode consistent
  // with encode for every mantissa width.
  uint32_t leadingOne = exponent != 0 ? (1u<<bitsPerPow2) : 0;
  uint32_t mantissa = (x & ((1u<<bitsPerPow2)-1)) | leadingOne;
  if (exponent != 0) exponent -= 1;
  return mantissa<<exponent;
}
|
||||
|
||||
// Largest value returned by u32fp8Decode (decoding 0xef with 3 mantissa bits).
constexpr uint32_t u32fp8MaxValue() {
  return 0xf0000000;
}
|
||||
|
||||
inline __host__ __device__ uint8_t u32fp8Encode(uint32_t x) {
|
||||
return u32fpEncode(x, 3);
|
||||
}
|
||||
inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
|
||||
return u32fpDecode(x, 3);
|
||||
}
|
||||
|
||||
#endif
|
||||
+12
-29
@@ -7,42 +7,25 @@
|
||||
#ifndef NCCL_CHANNEL_H_
|
||||
#define NCCL_CHANNEL_H_
|
||||
#include "comm.h"
|
||||
#include "utils.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
ncclResult_t initChannel(struct ncclComm* comm, int channelid);
|
||||
ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
|
||||
ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
|
||||
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);
|
||||
static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) {
|
||||
int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
|
||||
int peerNode = comm->rankToNode[peer];
|
||||
int peerIndex = comm->rankToLocalRank[peer];
|
||||
int nsteps = comm->maxLocalRanks;
|
||||
int rankIndex = comm->rankToLocalRank[comm->rank];
|
||||
int step, delta;
|
||||
if (coll == ncclFuncSend) {
|
||||
step = (nsteps + peerIndex - rankIndex)%nsteps;
|
||||
delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes;
|
||||
} else if (coll == ncclFuncRecv) {
|
||||
step = (nsteps + rankIndex - peerIndex)%nsteps;
|
||||
delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes;
|
||||
|
||||
inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound) {
|
||||
if (comm->nNodes > 1) {
|
||||
int nodeDelta = p2pRound/comm->maxLocalRanks;
|
||||
int localDelta = p2pRound%comm->maxLocalRanks;
|
||||
int base = nodeDelta*divUp(comm->maxLocalRanks, NCCL_MAX_DEV_WORK_P2P_PER_BATCH);
|
||||
base += localDelta/NCCL_MAX_DEV_WORK_P2P_PER_BATCH;
|
||||
return base & 0xff;
|
||||
} else {
|
||||
return ncclInternalError;
|
||||
return p2pRound & 0xff;
|
||||
}
|
||||
*channelBase = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) {
|
||||
//*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels;
|
||||
*channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) {
|
||||
int base;
|
||||
NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base));
|
||||
NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -123,23 +123,23 @@
|
||||
} while (0);
|
||||
|
||||
#define NCCLWAIT(call, cond, abortFlagPtr) do { \
|
||||
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
|
||||
uint32_t* tmpAbortFlag = (abortFlagPtr); \
|
||||
ncclResult_t RES = call; \
|
||||
if (RES != ncclSuccess && RES != ncclInProgress) { \
|
||||
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
|
||||
return ncclInternalError; \
|
||||
} \
|
||||
if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \
|
||||
if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECK(*tmpAbortFlag, 0); \
|
||||
} while (!(cond));
|
||||
|
||||
#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \
|
||||
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
|
||||
uint32_t* tmpAbortFlag = (abortFlagPtr); \
|
||||
RES = call; \
|
||||
if (RES != ncclSuccess && RES != ncclInProgress) { \
|
||||
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
|
||||
goto label; \
|
||||
} \
|
||||
if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
|
||||
if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
|
||||
} while (!(cond));
|
||||
|
||||
#define NCCLCHECKTHREAD(a, args) do { \
|
||||
|
||||
@@ -8,6 +8,8 @@
|
||||
#define NCCL_COLLECTIVES_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "nccl_common.h"
|
||||
#include "device.h"
|
||||
|
||||
// CHUNKSIZE must be a multiple of SLICESIZE
|
||||
#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
|
||||
@@ -22,6 +24,12 @@
|
||||
#define REDUCE_CHUNKSTEPS 1
|
||||
#define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above
|
||||
|
||||
const char* ncclFuncToString(ncclFunc_t op);
|
||||
const char* ncclDevRedOpToString(ncclDevRedOp_t op);
|
||||
const char* ncclDatatypeToString(ncclDataType_t type);
|
||||
const char* ncclAlgoToString(int algo);
|
||||
const char* ncclProtoToString(int proto);
|
||||
|
||||
inline int ncclTypeSize(ncclDataType_t type) {
|
||||
switch (type) {
|
||||
case ncclInt8:
|
||||
|
||||
+222
-60
@@ -7,7 +7,7 @@
|
||||
#ifndef NCCL_COMM_H_
|
||||
#define NCCL_COMM_H_
|
||||
|
||||
#include "transport.h"
|
||||
//#include "transport.h"
|
||||
#include "p2p.h"
|
||||
#include "collectives.h"
|
||||
#include "nccl_tuner.h"
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "strongstream.h"
|
||||
#include "nccl_net.h"
|
||||
#include "register.h"
|
||||
#include "graph.h"
|
||||
|
||||
#if CUDART_VERSION < 9000
|
||||
struct cudaLaunchParams {
|
||||
@@ -144,7 +145,7 @@ struct ncclChannel {
|
||||
struct ncclNvls nvls;
|
||||
|
||||
int id; // index of this channel
|
||||
uint32_t workFifoSent; // last used work index+1
|
||||
uint32_t workFifoProduced; // +1 successor of last used work fifo byte
|
||||
|
||||
/* comm split sharable resources */
|
||||
struct ncclChannelPeer* collnetPeers;
|
||||
@@ -153,22 +154,15 @@ struct ncclChannel {
|
||||
struct ncclDevChannelPeer* nvlsDevPeers;
|
||||
};
|
||||
|
||||
struct ncclWorkList {
|
||||
struct ncclWorkBatchList {
|
||||
struct ncclWorkBatchList* next;
|
||||
struct ncclDevWorkBatch batch;
|
||||
};
|
||||
struct alignas(16) ncclWorkList {
|
||||
struct ncclWorkList* next;
|
||||
struct ncclWork work;
|
||||
};
|
||||
|
||||
struct ncclPointerList {
|
||||
struct ncclPointerList* next;
|
||||
void *ptr;
|
||||
};
|
||||
|
||||
struct ncclNvlsMcHandleList {
|
||||
struct ncclNvlsMcHandleList *next;
|
||||
CUmemGenericAllocationHandle mcHandle;
|
||||
CUdeviceptr ptr;
|
||||
int dev;
|
||||
size_t size;
|
||||
enum ncclDevWorkType workType;
|
||||
int size; // Size of struct following this node
|
||||
// ncclDevWorkColl, ncclDevWorkCollReg, ncclDevWorkP2p[]...
|
||||
};
|
||||
|
||||
struct ncclCollnetHandleList {
|
||||
@@ -188,33 +182,190 @@ struct ncclKernelPlan {
|
||||
struct ncclKernelPlan* next;
|
||||
|
||||
bool persistent; // aka captured in a graph
|
||||
enum ncclDevWorkStorageType workStorageType;
|
||||
bool kernelSpecialized;
|
||||
void *kernelFn;
|
||||
int channelUbound; // only channels c < channelUbound are present
|
||||
int channelCount; // number of channels present
|
||||
uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask)
|
||||
struct ncclDevKernelArgs* kernelArgs;
|
||||
size_t kernelArgsSize;
|
||||
uint64_t channelMask; // bitset of which channels are present
|
||||
bool hasProxyOps; // does any channel have a non-empty proxyOpQueue
|
||||
int threadPerBlock;
|
||||
// workHeap fields are null until uploadWorkFifo() or preparePersistentKernel()
|
||||
struct ncclWork* workHead;
|
||||
|
||||
int collOpCount; // zero based for this plan
|
||||
int collOpCount; // Number of collectives in this plan.
|
||||
int nWorkBatches; // Number of work batches.
|
||||
size_t workBytes; // Sum size of all work (in the fifo) in bytes.
|
||||
struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> workQueue;
|
||||
struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> cleanupQueue;
|
||||
void* workBufPersistent;
|
||||
|
||||
struct ncclIntruQueue<struct ncclPointerList, &ncclPointerList::next> ipcMemQueue;
|
||||
struct ncclIntruQueue<struct ncclNvlsMcHandleList, &ncclNvlsMcHandleList::next> nvlsMcHandleQueue;
|
||||
struct ncclIntruQueue<struct ncclCollnetHandleList, &ncclCollnetHandleList::next> collnetHandleQueue;
|
||||
struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
|
||||
};
|
||||
|
||||
struct Channel {
|
||||
int nWork;
|
||||
union {
|
||||
int nWorkElem; // used for coll and reg coll
|
||||
int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1
|
||||
};
|
||||
size_t collBytes;
|
||||
struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> workQueue;
|
||||
struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
|
||||
} channels[MAXCHANNELS];
|
||||
size_t maxBytesPerChannel;
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct ncclTaskColl {
|
||||
struct ncclTaskColl* next;
|
||||
ncclFunc_t func;
|
||||
void const* sendbuff;
|
||||
void* recvbuff;
|
||||
size_t count;
|
||||
int root;
|
||||
ncclDataType_t datatype;
|
||||
ncclRedOp_t opHost;
|
||||
struct ncclDevRedOpFull opDev;
|
||||
int chunkSteps, sliceSteps;
|
||||
// Computed later:
|
||||
size_t trafficBytes;
|
||||
int32_t nMaxChannels:8;
|
||||
int32_t nWarps:8;
|
||||
int32_t algorithm:8, protocol:8;
|
||||
uint32_t isCollnet:1, isNvls:1;
|
||||
uint32_t devFuncId:30;
|
||||
enum ncclRegBufferType regBufType;
|
||||
// number of elements in planner->ipcMemQueue associated with this collective
|
||||
int nCleanupQueueElts;
|
||||
|
||||
void* sendMhandle;
|
||||
void* recvMhandle;
|
||||
};
|
||||
// One queued point-to-point transfer; `next` is the intrusive list link.
struct ncclTaskP2p {
  struct ncclTaskP2p* next;
  void* buff;    // user buffer of the transfer
  size_t bytes;  // size of the transfer in bytes
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Roughly sorts ncclTaskColl's by their size descending. This structure is
// self-referential, meaning that pointers it contains internally may point
// into the structure itself (a bin may hold the address of `head`). This means
// that it is NOT memcpy-moveable.
struct ncclTaskCollSorter {
  // Sizes are bucketed in units of 1K and clamped at 1GB.
  static constexpr int UnitLog2 = 10; // 1K
  static constexpr size_t UnitSize = 1<<UnitLog2;
  static constexpr int MaxLog2 = 30; // 1GB
  static constexpr size_t MaxSize = 1ull<<MaxLog2;
  // Number of bins between powers of 2. For 4 bins, the worst case out-of-order
  // relative magnitude is (5/4)-1 = 25%
  static constexpr int BitsPerPow2 = 2;
  static constexpr int BinsPerPow2 = 1<<BitsPerPow2;
  static constexpr int BinCount = 1 + (MaxLog2-UnitLog2)*BinsPerPow2;

  // Ends of the single linked list holding every inserted task.
  struct ncclTaskColl* head;
  struct ncclTaskColl* tail;
  // Least bin such that it and all above are empty.
  int binEdge;
  // Per bin: pointer to the pointer to this bin's head node, which is either
  // the previous node's `next` field or `head`. Null while the bin is empty.
  struct ncclTaskColl** bins[BinCount];
};
|
||||
|
||||
inline void ncclTaskCollSorterInsert(
|
||||
struct ncclTaskCollSorter* me, struct ncclTaskColl* x, size_t size
|
||||
) {
|
||||
constexpr int UnitLog2 = ncclTaskCollSorter::UnitLog2;
|
||||
constexpr size_t MaxSize = ncclTaskCollSorter::MaxSize;
|
||||
constexpr int BitsPerPow2 = ncclTaskCollSorter::BitsPerPow2;
|
||||
constexpr int BinCount = ncclTaskCollSorter::BinCount;
|
||||
int bin = u32fpEncode(std::min(MaxSize, size)>>UnitLog2, BitsPerPow2);
|
||||
bin = BinCount-1 - bin; // descending bin
|
||||
|
||||
if (me->bins[bin] == nullptr) {
|
||||
if (me->binEdge <= bin) {
|
||||
me->binEdge = bin+1;
|
||||
me->bins[bin] = me->tail ? &me->tail->next : &me->head;
|
||||
me->tail = x;
|
||||
} else {
|
||||
// Find successor non-empty bin after this one.
|
||||
int succ = bin+1;
|
||||
while (me->bins[succ] == nullptr) succ++;
|
||||
// What was our successor's head's previous is now our head's previous.
|
||||
me->bins[bin] = me->bins[succ];
|
||||
// The first node we insert is our tail, so that becomes our successor's
|
||||
// head's new previous.
|
||||
me->bins[succ] = &x->next;
|
||||
}
|
||||
}
|
||||
// Push a new head for this bin.
|
||||
x->next = *me->bins[bin];
|
||||
*me->bins[bin] = x;
|
||||
}
|
||||
|
||||
inline bool ncclTaskCollSorterEmpty(struct ncclTaskCollSorter* me) {
|
||||
return me->head == nullptr;
|
||||
}
|
||||
|
||||
// Reset sorter and return sorted linked list of its coll tasks.
|
||||
inline struct ncclTaskColl* ncclTaskCollSorterDequeueAll(struct ncclTaskCollSorter* me) {
|
||||
struct ncclTaskColl* head = me->head;
|
||||
if (head != nullptr) memset(me, 0, sizeof(*me));
|
||||
return head;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct ncclCudaStreamList {
|
||||
struct ncclCudaStreamList *next;
|
||||
cudaStream_t stream;
|
||||
};
|
||||
|
||||
struct ncclKernelPlanner {
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// State for accumulating tasks between ncclGroupStart/End()
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct Peer {
|
||||
bool sendSeen, recvSeen;
|
||||
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> sendQueue;
|
||||
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> recvQueue;
|
||||
};
|
||||
struct ncclTaskCollSorter collSorter;
|
||||
struct Peer* peers/*[nRanks]*/;
|
||||
int nTasksColl, nTasksP2p;
|
||||
bool persistent;
|
||||
|
||||
// The list of user streams aggregated over all tasks present.
|
||||
struct ncclCudaStreamList* streams;
|
||||
// The most recent user stream. Ignored if streams==nullptr
|
||||
cudaStream_t streamRecent;
|
||||
// The graph capturing all user streams or invalid if none. Thus we restrict the
|
||||
// user that all streams must be captured in the same graph or not captured
|
||||
// at all. Technically we could probably relax this, but that would mean
|
||||
// collecting a different `ncclTasks` per graph and one for non-graph.
|
||||
struct ncclCudaGraph capturingGraph;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Lists of tasks to be assembled into plans.
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct ncclIntruQueue<struct ncclTaskColl, &ncclTaskColl::next> collTaskQueue;
|
||||
struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> collWorkQueue;
|
||||
struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> collCleanupQueue;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// State for building current (Work-In-Progress) plan:
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct WipPlan {
|
||||
struct Channel {
|
||||
struct {
|
||||
int workBytes; // Sum size of work metadata referenced by this batch.
|
||||
int nP2ps; // Number of p2p works in this batch
|
||||
int p2pRounds[NCCL_MAX_DEV_WORK_P2P_PER_BATCH]; // which rounds are present in this batch.
|
||||
} wipBatch; // work-in-progress batch which will be next tail of workBatchQueue
|
||||
int nWorkBatchesP2p; // number of p2p batches for this channel.
|
||||
struct ncclIntruQueue<struct ncclWorkBatchList, &ncclWorkBatchList::next> workBatchQueue;
|
||||
struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
|
||||
} channels[MAXCHANNELS];
|
||||
} wipPlan;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// State for launching built plans:
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// List of kernel plans built form tasks.
|
||||
struct ncclIntruQueue<struct ncclKernelPlan, &ncclKernelPlan::next> planQueue;
|
||||
// First of the unlaunched kernels in `planQueue`
|
||||
struct ncclKernelPlan* unlaunchedPlansHead;
|
||||
};
|
||||
|
||||
#define NCCL_MAGIC 0x0280028002800280 // Nickel atomic number is 28.
|
||||
@@ -233,12 +384,18 @@ struct ncclComm {
|
||||
struct ncclPeerInfo* peerInfo;
|
||||
struct ncclTopoSystem* topo;
|
||||
|
||||
int netPluginLoaded;
|
||||
ncclNet_t* ncclNet;
|
||||
ncclNetDeviceType netDeviceType;
|
||||
ncclCollNet_t* ncclCollNet;
|
||||
void* bootstrap;
|
||||
// Bitmasks for ncclTransportP2pSetup
|
||||
uint64_t* connectSend;
|
||||
uint64_t* connectRecv;
|
||||
struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS];
|
||||
bool initAlgoChannels[NCCL_NUM_ALGORITHMS];
|
||||
bool runtimeConn; // if dynamic connection is supported
|
||||
int cuMemSupport;
|
||||
|
||||
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
|
||||
|
||||
@@ -253,6 +410,9 @@ struct ncclComm {
|
||||
cpu_set_t cpuAffinity; // CPU affinity of the GPU
|
||||
int cudaArch; // matches __CUDA_ARCH__ of device
|
||||
|
||||
int cpuArch; // architecture - As defined in src/include/graph.h, e.g. x86/arm/ppc/mixed
|
||||
int cpuVendor; // vendor - As defined in src/include/graph.h
|
||||
|
||||
int node;
|
||||
int nNodes;
|
||||
int localRank;
|
||||
@@ -278,10 +438,11 @@ struct ncclComm {
|
||||
int nChannels; // connection nChannels
|
||||
int collChannels; // enqueue nChannels
|
||||
int nvlsChannels; // enqueue nChannels
|
||||
// all nvls heads stored to check if we can splitShare
|
||||
int nvlsHeads[MAXCHANNELS];
|
||||
// Channels (per peer) for p2p
|
||||
int p2pnChannels;
|
||||
int p2pnChannelsPerPeer;
|
||||
int p2pChannels[MAXCHANNELS];
|
||||
|
||||
// Should this comm allocate LL buffers for network P2P connections?
|
||||
bool allocP2pNetLLBuffers;
|
||||
@@ -303,23 +464,28 @@ struct ncclComm {
|
||||
ncclResult_t asyncResult;
|
||||
|
||||
// Flag to ask NCCL kernels to abort
|
||||
volatile uint32_t *abortFlag;
|
||||
volatile uint32_t *childAbortFlag;
|
||||
uint32_t *abortFlagRefCount;
|
||||
uint32_t* abortFlag;
|
||||
uint32_t* abortFlagDev;
|
||||
int* abortFlagRefCount;
|
||||
uint32_t* childAbortFlag;
|
||||
uint32_t* childAbortFlagDev;
|
||||
uint32_t destroyFlag;
|
||||
|
||||
// Device side of the communicator (for cudaFree's)
|
||||
struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
|
||||
|
||||
// Operation pool.
|
||||
int workFifoDepth; // size of workFifoHeap[], power of 2
|
||||
struct ncclWork* workFifoHeap;
|
||||
struct ncclWork* devWorkFifoHeap;
|
||||
void* workFifoHeapGdrHandle;
|
||||
uint32_t workArgsBytes; // max size of kernel args
|
||||
uint32_t workFifoBytes; // size of workFifoBuf, power of 2
|
||||
void* workFifoBuf;
|
||||
void* workFifoBufDev;
|
||||
void* workFifoBufGdrHandle;
|
||||
|
||||
// Work completion notification
|
||||
uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory
|
||||
uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot.
|
||||
uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels.
|
||||
// Monotonic number of bytes (mod 1<<32) consumed per channel. In cudaHost memory.
|
||||
uint32_t* workFifoConsumed/*[MAXCHANNELS]*/;
|
||||
// Last observed value of: min(workFifoConsumed[c] for c < MAXCHANNELS)
|
||||
uint32_t workFifoConsumedLeast;
|
||||
// Monotonic number of bytes (mod 1<<32) sent to fifo.
|
||||
uint32_t workFifoProduced;
|
||||
|
||||
// Intra-process sync
|
||||
struct ncclComm* intraComm0; // leader of intra-process comms (self possible)
|
||||
@@ -337,7 +503,7 @@ struct ncclComm {
|
||||
// Whether this communicator uses collNet
|
||||
int collNetSupport;
|
||||
bool collNetRegSupport;
|
||||
uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes];
|
||||
uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes];
|
||||
int intraHighestTransportType;
|
||||
int* collNetHeads;
|
||||
int collNetHeadsNum;
|
||||
@@ -355,16 +521,16 @@ struct ncclComm {
|
||||
// pools backed by comm->memPermanent
|
||||
struct ncclMemoryPool memPool_ncclProxyOp;
|
||||
struct ncclMemoryPool memPool_ncclKernelPlan;
|
||||
struct ncclMemoryPool memPool_ncclPointerList;
|
||||
struct ncclMemoryPool memPool_ncclNvlsHandleList;
|
||||
struct ncclMemoryPool memPool_ncclCollnetHandleList;
|
||||
|
||||
// Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
|
||||
// this comm is not yet in a group.
|
||||
struct ncclComm* groupNext;
|
||||
// Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
|
||||
struct ncclComm* preconnectNext;
|
||||
int persistentRefs; // number of persistent plan-lists capturing this comm
|
||||
struct ncclTasks tasks;
|
||||
struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule;
|
||||
|
||||
struct ncclKernelPlanner planner;
|
||||
|
||||
// user-created reduction ops
|
||||
int userRedOpCapacity, userRedOpFreeHead;
|
||||
@@ -373,11 +539,6 @@ struct ncclComm {
|
||||
// Queue of things for the main thread to do
|
||||
struct ncclIntruQueueMpsc<struct ncclCommCallback, &ncclCommCallback::next> callbackQueue;
|
||||
|
||||
// List of kernel plans built from tasks.
|
||||
struct ncclIntruQueue<struct ncclKernelPlan, &ncclKernelPlan::next> planQueue;
|
||||
// First of the unlaunched kernels in `planQueue`
|
||||
struct ncclKernelPlan* unlaunchedPlansHead;
|
||||
|
||||
ncclConfig_t config;
|
||||
// initState is to more conveniently reclaim resources when errors happen.
|
||||
ncclResult_t initState;
|
||||
@@ -389,6 +550,7 @@ struct ncclComm {
|
||||
struct ncclGroupJob *groupJob;
|
||||
|
||||
// Tuning plugin
|
||||
int tunerPluginLoaded;
|
||||
ncclTuner_t* tuner;
|
||||
void *tunerContext;
|
||||
// buffer registration cache
|
||||
|
||||
@@ -80,6 +80,10 @@ DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel);
|
||||
#if CUDART_VERSION >= 11080
|
||||
DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx);
|
||||
#endif
|
||||
// cuMem API support
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree);
|
||||
|
||||
@@ -10,21 +10,14 @@
|
||||
#include "nccl.h"
|
||||
#include "nccl_common.h"
|
||||
#include <stdio.h>
|
||||
#include <chrono>
|
||||
#include <type_traits>
|
||||
|
||||
#include <limits.h>
|
||||
#include <string.h>
|
||||
#include <pthread.h>
|
||||
|
||||
// Conform to pthread and NVTX standard
|
||||
#define NCCL_THREAD_NAMELEN 16
|
||||
|
||||
extern int ncclDebugLevel;
|
||||
extern uint64_t ncclDebugMask;
|
||||
extern pthread_mutex_t ncclDebugLock;
|
||||
extern FILE *ncclDebugFile;
|
||||
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
|
||||
|
||||
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
|
||||
|
||||
@@ -32,13 +25,13 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
||||
extern thread_local int ncclDebugNoWarn;
|
||||
extern char ncclLastError[];
|
||||
|
||||
#define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
|
||||
#define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
|
||||
#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
|
||||
#define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__)
|
||||
|
||||
#ifdef ENABLE_TRACE
|
||||
#define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
|
||||
extern std::chrono::steady_clock::time_point ncclEpoch;
|
||||
#else
|
||||
#define TRACE(...)
|
||||
#endif
|
||||
|
||||
+182
-95
@@ -9,8 +9,10 @@
|
||||
|
||||
#include "nccl.h"
|
||||
#include "nccl_common.h"
|
||||
#include "align.h"
|
||||
#include "bitops.h"
|
||||
#include <algorithm>
|
||||
#include <stdint.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
|
||||
|
||||
@@ -21,6 +23,12 @@ extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];
|
||||
#define NCCL_MAX_OPS 2048
|
||||
#define NCCL_STEPS 8
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
#define NCCL_CUDA_ARCH __CUDA_ARCH__
|
||||
#else
|
||||
#define NCCL_CUDA_ARCH 0
|
||||
#endif
|
||||
|
||||
#include "net_device.h"
|
||||
|
||||
enum ncclDevRedOp_t {
|
||||
@@ -52,8 +60,11 @@ union ncclLLFifoLine {
|
||||
|
||||
#define WARP_SIZE 32
|
||||
#define MAXCHANNELS 32
|
||||
#define NCCL_MAX_LOCAL_RANKS 64
|
||||
#define NCCL_MAX_NTHREADS 640
|
||||
#define NCCL_MIN_NTHREADS (4*WARP_SIZE)
|
||||
#define NCCL_SIMPLE_MAX_NTHREADS 512
|
||||
#define NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE (3*WARP_SIZE)
|
||||
#define NCCL_LL_MAX_NTHREADS 512
|
||||
#define NCCL_LL_LINES_PER_THREAD 8
|
||||
#ifdef TEST_LL_CLEANUP
|
||||
@@ -84,6 +95,9 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
|
||||
#define NCCL_IPC_READ 0x10
|
||||
#define NCCL_NVLS_MIN_POLL 0x20
|
||||
|
||||
// Number of named barriers supported by CUDA
|
||||
#define NCCL_MAX_GROUPS 16
|
||||
|
||||
#define NCCL_MAX_COLLNET_SIZE (1L << 29)
|
||||
|
||||
enum ncclRegBufferType {
|
||||
@@ -196,112 +210,155 @@ struct ncclChannelPeer {
|
||||
|
||||
struct ncclDevComm;
|
||||
|
||||
/* ncclWork is to be a power of two, currently 8x64 bytes, */
|
||||
/* to make sure reads to host from the CUDA kernel are aligned. */
|
||||
/* Make sure to adjust padding at the end of ncclWorkElem. */
|
||||
#define NCCL_WORK_SIZE 512
|
||||
struct alignas(16) ncclDevWorkP2p {
|
||||
void *sendAddr, *recvAddr;
|
||||
size_t sendBytes, recvBytes;
|
||||
int sendRank, recvRank;
|
||||
// From the part index, nP2pChannels, and channelBase the device code can
|
||||
// calculate which part of the transfer a channel is responsible for.
|
||||
uint8_t nP2pChannels; // Always equal to comm->p2pnChannels
|
||||
uint8_t channelBase; // Channel owning first part.
|
||||
// Zero channels indicates no work in that direction.
|
||||
uint8_t nSendChannels, nRecvChannels;
|
||||
// Chunk size stored in 8 bits via u32fp8Encode/Decode.
|
||||
uint8_t sendChunkSize_u32fp8, recvChunkSize_u32fp8;
|
||||
|
||||
enum ncclWorkType : uint8_t {
|
||||
ncclWorkTypeUnused=0,
|
||||
ncclWorkTypeColl=1,
|
||||
ncclWorkTypeP2p=2,
|
||||
ncclWorkTypeRegColl=3
|
||||
};
|
||||
enum ncclWorkP2PType : uint8_t {
|
||||
ncclWorkP2pTypeUnused=0,
|
||||
ncclWorkP2pTypeSend,
|
||||
ncclWorkP2pTypeRecv
|
||||
uint8_t sendProtoLL:1, recvProtoLL:1;
|
||||
uint8_t sendRegistered:1, recvRegistered:1;
|
||||
};
|
||||
|
||||
struct ncclWorkHeader {
|
||||
union {
|
||||
int32_t workNext; // when isLast=0: Offset from kernel argument workHead
|
||||
uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back.
|
||||
};
|
||||
uint16_t funcIndex;
|
||||
uint8_t isLast:1; // last work for this kernel
|
||||
uint8_t inFifo:1; // is this work in the fifo
|
||||
enum ncclWorkType type;
|
||||
};
|
||||
// Compute the subset of the data transfer corresponding to the given part index.
|
||||
inline __host__ __device__ void ncclP2pPartBounds(int nParts, int part, size_t bytes, size_t* partBeg, size_t* partEnd) {
|
||||
size_t partBytes = alignUp(divUp(bytes, nParts), 4<<10);
|
||||
#if __CUDA_ARCH__
|
||||
*partBeg = min((part+0)*partBytes, bytes);
|
||||
*partEnd = min((part+1)*partBytes, bytes);
|
||||
#else
|
||||
*partBeg = std::min<size_t>((part+0)*partBytes, bytes);
|
||||
*partEnd = std::min<size_t>((part+1)*partBytes, bytes);
|
||||
#endif
|
||||
}
|
||||
|
||||
struct ncclWorkElem {
|
||||
union {
|
||||
uint8_t flagBits;
|
||||
struct {
|
||||
uint8_t isUsed:1, redOpArgIsPtr:1, oneNode:1;
|
||||
};
|
||||
};
|
||||
uint8_t regUsed;
|
||||
uint8_t nWarps;
|
||||
uint8_t direct;
|
||||
// implemented in channel.h
|
||||
inline __host__ uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound);
|
||||
|
||||
// ncclP2pChannelToPart and ncclP2pChannelForPart are inverses. The device code
|
||||
// uses ncclP2pChannelToPart to determine which part "this" channel is responsible for.
|
||||
inline __host__ int ncclP2pChannelForPart(int nP2pChannels, int base, int part) {
  // Relies on nP2pChannels being a power of two: nP2pChannels-1 is then an
  // all-ones mask whose population count is log2(nP2pChannels).
  int mask = nP2pChannels-1;
  int log2Channels = countOneBits(mask);
  // Bit-reverse the part index within log2Channels bits, then offset from
  // `base` and wrap onto the channel range.
  return (base + reverseBits(part, log2Channels)) & mask;
}
|
||||
inline __device__ int ncclP2pChannelToPart(int nP2pChannels, int base, int channel) {
  // Relies on nP2pChannels being a power of two: nP2pChannels-1 is then an
  // all-ones mask whose population count is log2(nP2pChannels).
  int mask = nP2pChannels-1;
  int log2Channels = countOneBits(mask);
  // Distance of `channel` from `base` (wrapped onto the channel range),
  // bit-reversed within log2Channels bits.
  return reverseBits((channel-base) & mask, log2Channels);
}
|
||||
|
||||
struct alignas(16) ncclDevWorkColl {
|
||||
// Running on channels [channelLo..channelHi], hi is inclusive.
|
||||
// nChannels == (channelHi - channelLo) + 1
|
||||
uint32_t channelLo:8, channelHi:8;
|
||||
uint32_t nWarps:8;
|
||||
uint32_t redOpArgIsPtr:1, regUsed:2, oneNode:1, direct:4;
|
||||
uint32_t root;
|
||||
const void *sendbuff;
|
||||
void *recvbuff;
|
||||
|
||||
size_t count;
|
||||
uint64_t redOpArg;
|
||||
uint64_t chunkCount:25, workCount:39;
|
||||
void* recvbuff;
|
||||
void* sendbuff;
|
||||
union {
|
||||
// Continuous-byte-distribution scheduling. The lo and hi channels are of
|
||||
// different size than the channels in the middle.
|
||||
struct {
|
||||
uint64_t lastChunkCount:25;
|
||||
uint64_t workOffset:39;
|
||||
};
|
||||
size_t countLo, countMid, countHi;
|
||||
// Chunk counts where units are ncclProtoGrainSize(protocol) bytes
|
||||
uint64_t chunkGrainsLo:21, chunkGrainsMid:21, chunkGrainsHi:21;
|
||||
} cbd;
|
||||
// Collnet scheduling. All channels divide work evenly.
|
||||
struct {
|
||||
uint64_t bid:32;
|
||||
uint64_t nChannels:32;
|
||||
};
|
||||
size_t count; // Total size, not divided per channel.
|
||||
uint32_t chunkCount;
|
||||
} collnet;
|
||||
};
|
||||
uint64_t redOpArg;
|
||||
};
|
||||
|
||||
#define NCCL_MAX_WORK_ELEMENTS ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem))
|
||||
static_assert(NCCL_MAX_WORK_ELEMENTS == 9, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 9");
|
||||
|
||||
struct ncclWorkElemP2p {
|
||||
int peer : 30;
|
||||
int proto : 2;
|
||||
// Grain size in bytes for protocol `proto`; chunk grain counts are expressed
// in these units. Returns -1 when `proto` is not one of the known protocols.
__host__ __device__ constexpr int ncclProtoGrainSize(int proto) {
  return (proto == NCCL_PROTO_LL)     ? 16
       : (proto == NCCL_PROTO_LL128)  ? static_cast<int>(WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD/NCCL_LL128_LINEELEMS*NCCL_LL128_DATAELEMS*sizeof(uint64_t))
       : (proto == NCCL_PROTO_SIMPLE) ? 512
       : -1;
}
|
||||
|
||||
enum ncclWorkP2PType p2pType;
|
||||
uint8_t reg:1;
|
||||
uint8_t nWarps:5;
|
||||
uint8_t warpStart;
|
||||
uint8_t ngroups;
|
||||
// Important not to use any fields with greater than 4-byte alignment since
|
||||
// we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if
|
||||
// there were 8-byte fields.
|
||||
//void* buff;
|
||||
uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32;
|
||||
//size_t count;
|
||||
uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32;
|
||||
int chunkSize;
|
||||
};
|
||||
template<typename Int>
|
||||
__host__ __device__ inline void ncclCollCbdPart(
|
||||
struct ncclDevWorkColl* work, uint32_t channelId, int proto, int eltSize,
|
||||
Int* count, Int* partOffset, Int* partCount, Int* chunkCount
|
||||
) {
|
||||
int eltPerGrain = ncclProtoGrainSize(proto)/eltSize;
|
||||
int nMidChannels = work->channelHi - work->channelLo - 1;
|
||||
// We can assum that nMidChannels<0 implies countMid==0, which let's us assume
|
||||
// that countMid*nMidChannels == 0.
|
||||
if (count != nullptr) {
|
||||
*count = work->cbd.countLo + work->cbd.countMid*nMidChannels + work->cbd.countHi;
|
||||
}
|
||||
if (channelId == work->channelLo) {
|
||||
*partOffset = 0;
|
||||
*partCount = work->cbd.countLo;
|
||||
*chunkCount = work->cbd.chunkGrainsLo*eltPerGrain;
|
||||
} else if (channelId == work->channelHi) {
|
||||
*partOffset = work->cbd.countLo + nMidChannels*work->cbd.countMid;
|
||||
*partCount = work->cbd.countHi;
|
||||
*chunkCount = work->cbd.chunkGrainsHi*eltPerGrain;
|
||||
} else {
|
||||
int mid = channelId - work->channelLo - 1;
|
||||
*partOffset = work->cbd.countLo + mid*work->cbd.countMid;
|
||||
*partCount = work->cbd.countMid;
|
||||
*chunkCount = work->cbd.chunkGrainsMid*eltPerGrain;
|
||||
}
|
||||
}
|
||||
|
||||
static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) >= 16, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 16");
|
||||
#define NCCL_MAX_WORK_ELEMENTS_P2P 16
|
||||
|
||||
struct ncclWorkElemReg {
|
||||
struct ncclWorkElem elem;
|
||||
struct alignas(16) ncclDevWorkCollReg {
|
||||
struct ncclDevWorkColl coll;
|
||||
void* dnInputs[NCCL_MAX_DIRECT_ARITY+1];
|
||||
void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1];
|
||||
void* upOutputs[NCCL_MAX_DIRECT_ARITY+1];
|
||||
};
|
||||
|
||||
#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg))
|
||||
static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 2, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 2");
|
||||
|
||||
// Number of named barriers supported by CUDA
|
||||
#define NCCL_MAX_GROUPS 16
|
||||
|
||||
struct ncclWork {
|
||||
struct ncclWorkHeader header;
|
||||
union {
|
||||
char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)];
|
||||
struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
|
||||
struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P];
|
||||
struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG];
|
||||
};
|
||||
enum ncclDevWorkType: uint8_t {
|
||||
ncclDevWorkTypeP2p,
|
||||
ncclDevWorkTypeColl,
|
||||
ncclDevWorkTypeCollReg
|
||||
};
|
||||
|
||||
constexpr size_t ncclDevWorkSize(enum ncclDevWorkType type) {
|
||||
return type == ncclDevWorkTypeP2p ? sizeof(ncclDevWorkP2p) :
|
||||
type == ncclDevWorkTypeColl ? sizeof(ncclDevWorkColl) : sizeof(ncclDevWorkCollReg);
|
||||
}
|
||||
|
||||
#define NCCL_MAX_DEV_WORK_BATCH_BYTES 1024
|
||||
#define NCCL_MAX_DEV_WORK_BATCH_COLLS (NCCL_MAX_DEV_WORK_BATCH_BYTES/sizeof(ncclDevWorkColl))
|
||||
#define NCCL_MAX_DEV_WORK_P2P_PER_BATCH 8
|
||||
// Descriptor for one batch of work structs. A channel's batches form a list
// chained through `nextJump`.
struct alignas(16) ncclDevWorkBatch {
  union {
    struct {
      // nextJump==0: end of this channel's batch list.
      // nextJump>0:  batches[thisIndex+nextJump] is the next batch in this list.
      uint32_t nextJump:14;
      // Whether the next batch should be merged into this one.
      uint32_t nextExtends:1;
      uint32_t workType:2;
      uint32_t funcId:15;
    };
    // Overlaying the bitfields with their underlying type hints the compiler
    // to emit the best SASS LD/ST accesses.
    uint32_t flags;
  };
  // Rolling offset in the fifo where this batch's work structs begin.
  uint32_t offsetBase;
  // Relative offsets from offsetBase for this channel's subset of the batch:
  // for each bit index i in offsetBitset, work is found at fifo offset
  // offsetBase + i*sizeof(WorkStructType).
  uint64_t offsetBitset;
};
|
||||
static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE");
|
||||
static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0");
|
||||
|
||||
struct ncclDevChannelPeer {
|
||||
// Stripped version of ncclChannelPeer where we only keep the ncclConnInfo
|
||||
@@ -328,9 +385,8 @@ struct ncclDevComm {
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
int p2pChunkSize;
|
||||
|
||||
// Operation list for aggregation
|
||||
int workFifoDepth;
|
||||
struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory
|
||||
// Work fifo return credits
|
||||
uint32_t* workConsumed/*[MAXCHANNELS]*/;
|
||||
|
||||
int* collNetDenseToUserRank;
|
||||
|
||||
@@ -346,11 +402,37 @@ struct alignas(16) ncclDevCommAndChannels {
|
||||
struct ncclDevChannel channels[MAXCHANNELS];
|
||||
};
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
#define NCCL_CUDA_ARCH __CUDA_ARCH__
|
||||
#else
|
||||
#define NCCL_CUDA_ARCH 0
|
||||
#endif
|
||||
// Where a kernel's work structs are stored; carried in
// ncclDevKernelArgs::workStorageType.
enum ncclDevWorkStorageType: uint8_t {
  ncclDevWorkStorageTypeArgs       = 0, // kernel argument space
  ncclDevWorkStorageTypeFifo       = 1, // work fifo
  ncclDevWorkStorageTypePersistent = 2  // NOTE(review): presumably a persistent buffer; confirm against the planner
};
|
||||
|
||||
struct alignas(16) ncclDevKernelArgs {
|
||||
struct ncclDevComm* comm;
|
||||
uint64_t channelMask;
|
||||
enum ncclDevWorkStorageType workStorageType;
|
||||
uint32_t workMask;
|
||||
void* workBuf;
|
||||
// A channel's first batch is at `blockIdx.x`. Use `nextJump` to follow rest of list.
|
||||
// struct ncclDevWorkBatch batches[];
|
||||
};
|
||||
|
||||
// Maximum kernel argument size in bytes. The limit is currently fixed at 4KiB
// and `cudaArch` is not consulted. The arch/driver-dependent formula is kept
// here for reference:
//   (cudaArch < 700 || cudaDriver < 12010) ? 4<<10 : (32<<10)-4
__host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) {
  return 4*1024;
}
|
||||
|
||||
template<size_t capacity>
|
||||
struct alignas(16) ncclDevKernelArgsStorage {
|
||||
union {
|
||||
struct ncclDevKernelArgs args;
|
||||
ulong2 storage[capacity/sizeof(ulong2)];
|
||||
};
|
||||
};
|
||||
|
||||
typedef ncclDevKernelArgsStorage<(4<<10)> ncclDevKernelArgs4K;
|
||||
//typedef ncclDevKernelArgsStorage<(32<<10)-4> ncclDevKernelArgs31K;
|
||||
|
||||
template<typename T>
|
||||
__host__ __device__ constexpr T min_constexpr(T a) { return a; }
|
||||
@@ -366,6 +448,10 @@ __host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) {
|
||||
return max_constexpr<T>((a > b ? a : b), c...);
|
||||
}
|
||||
|
||||
constexpr int ncclDevMaxChannelsForArgsBytes(size_t argsBytes) {
|
||||
return min_constexpr<size_t>(MAXCHANNELS, (argsBytes - sizeof(struct ncclDevKernelArgs))/sizeof(struct ncclDevWorkBatch));
|
||||
}
|
||||
|
||||
// Calculate the unroll factor given:
|
||||
// * bytePerPack: number of bytes accessed per instruction
|
||||
// * insns: max permissible unroll value
|
||||
@@ -412,6 +498,7 @@ extern int const ncclDevKernelCount;
|
||||
extern void* const ncclDevKernelList[/*ncclDevKernelCount*/];
|
||||
|
||||
// Table of most specialized kernel function to run given func index.
|
||||
extern int const ncclDevFuncIdCount;
|
||||
extern int const ncclDevFuncRowToId[];
|
||||
extern void* const ncclDevKernelForFunc[/*funcIndex*/];
|
||||
extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/];
|
||||
|
||||
@@ -24,5 +24,6 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru
|
||||
ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan);
|
||||
ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
|
||||
ncclResult_t ncclLaunchFinish(struct ncclComm* comm);
|
||||
ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo);
|
||||
|
||||
#endif // End include guard
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#define NCCL_GDRWRAP_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "alloc.h"
|
||||
#include <stdint.h> // for standard [u]intX_t types
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
@@ -194,7 +195,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
|
||||
char *devMem;
|
||||
void *gdrMap;
|
||||
|
||||
mapSize = sizeof(T)*nelem;
|
||||
mapSize = ncclSizeOfT<T>()*nelem;
|
||||
|
||||
// GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE
|
||||
ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
|
||||
@@ -203,7 +204,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
|
||||
uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;
|
||||
size_t align = alignedAddr - (uint64_t)devMem;
|
||||
|
||||
//TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize);
|
||||
//TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zu size %zu", alignedAddr, devMem, align, mapSize);
|
||||
NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh));
|
||||
|
||||
NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize));
|
||||
@@ -226,7 +227,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
|
||||
*ptr = (T *)((char *)gdrMap+off);
|
||||
if (devPtr) *devPtr = (T *)(devMem+off+align);
|
||||
|
||||
TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p",
|
||||
TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zu at %p",
|
||||
md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);
|
||||
|
||||
return ncclSuccess;
|
||||
@@ -235,7 +236,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
|
||||
template <typename T>
|
||||
static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
|
||||
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
|
||||
NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T)));
|
||||
NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*ncclSizeOfT<T>()));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -29,6 +29,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
|
||||
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
|
||||
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
|
||||
int ncclTopoPathAllNVLink(struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm);
|
||||
|
||||
// Query topology
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank);
|
||||
@@ -46,9 +47,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
|
||||
#define NCCL_TOPO_CPU_ARCH_X86 1
|
||||
#define NCCL_TOPO_CPU_ARCH_POWER 2
|
||||
#define NCCL_TOPO_CPU_ARCH_ARM 3
|
||||
#define NCCL_TOPO_CPU_ARCH_MIXED 4
|
||||
#define NCCL_TOPO_CPU_VENDOR_INTEL 1
|
||||
#define NCCL_TOPO_CPU_VENDOR_AMD 2
|
||||
#define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3
|
||||
#define NCCL_TOPO_CPU_VENDOR_MIXED 4
|
||||
#define NCCL_TOPO_CPU_TYPE_BDW 1
|
||||
#define NCCL_TOPO_CPU_TYPE_SKL 2
|
||||
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
|
||||
@@ -70,6 +73,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
|
||||
#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU
|
||||
#define NCCL_TOPO_PATTERN_RING 4 // Ring
|
||||
#define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree
|
||||
#define NCCL_TOPO_PATTERN_COLLNET_DIRECT 6 // Collnet Direct
|
||||
struct ncclTopoGraph {
|
||||
// Input / output
|
||||
int id; // ring : 0, tree : 1, collnet : 2
|
||||
@@ -113,7 +117,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent);
|
||||
|
||||
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
|
||||
#include "info.h"
|
||||
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup = NULL);
|
||||
ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup=nullptr);
|
||||
|
||||
#endif
|
||||
|
||||
+11
-4
@@ -35,9 +35,12 @@ struct ncclAsyncJob {
|
||||
void(*undo)(struct ncclAsyncJob*);
|
||||
void(*destructor)(void*);
|
||||
ncclGroupJobState_t state;
|
||||
volatile uint32_t *abortFlag; /* point to comm abortFlag */
|
||||
volatile uint32_t *childAbortFlag; /* point to child abortFlag */
|
||||
uint32_t* abortFlag; /* point to comm abortFlag */
|
||||
uint32_t* abortFlagDev; /* point to comm abortFlagDev */
|
||||
uint32_t* childAbortFlag; /* point to child abortFlag */
|
||||
uint32_t* childAbortFlagDev; /* point to child abortFlagDev */
|
||||
ncclComm_t comm;
|
||||
int destroyFlag;
|
||||
};
|
||||
|
||||
ncclResult_t ncclAsyncLaunch(
|
||||
@@ -52,14 +55,14 @@ struct ncclGroupJob {
|
||||
struct ncclComm **groupCommHeadPtr;
|
||||
struct ncclComm **groupCommPreconnectHeadPtr;
|
||||
ncclResult_t *groupErrorPtr;
|
||||
volatile bool *abortFlagPtr;
|
||||
bool *abortFlagPtr;
|
||||
int *groupBlockingPtr;
|
||||
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsPtr;
|
||||
bool initialized;
|
||||
};
|
||||
|
||||
ncclResult_t ncclGroupStartInternal();
|
||||
ncclResult_t ncclGroupEndInternal();
|
||||
ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo = NULL);
|
||||
ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -114,6 +117,10 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) {
|
||||
// Comms gets a new memory stack scope upon joining. Each task batched for
|
||||
// this comm is allocated there.
|
||||
ncclMemoryStackPush(&comm->memScoped);
|
||||
// Initialize planner
|
||||
ncclKernelPlanner::Peer* tmp = comm->planner.peers;
|
||||
memset(&comm->planner, 0, sizeof(comm->planner));
|
||||
comm->planner.peers = tmp;
|
||||
}
|
||||
|
||||
ncclGroupBlocking = comm->config.blocking;
|
||||
|
||||
@@ -8,28 +8,9 @@
|
||||
#define NCCL_INFO_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "device.h"
|
||||
#include "collectives.h"
|
||||
#include "core.h"
|
||||
#include "utils.h"
|
||||
#include "strongstream.h"
|
||||
#define NCCL_MAX_LOCAL_RANKS 64
|
||||
|
||||
// Communication pattern of an operation; stored in ncclInfo::pattern.
// Enumerator order defines the numeric values and must not change.
typedef enum : uint8_t {
  // Ring / pipeline patterns
  ncclPatternRing,
  ncclPatternRingTwice,
  ncclPatternPipelineFrom,
  ncclPatternPipelineTo,
  // Tree patterns
  ncclPatternTreeUp,
  ncclPatternTreeDown,
  ncclPatternTreeUpDown,
  // CollNet patterns
  ncclPatternCollnetChain,
  ncclPatternCollnetDirect,
  // NVLS patterns
  ncclPatternNvls,
  ncclPatternNvlsTree,
  // Point-to-point
  ncclPatternSend,
  ncclPatternRecv
} ncclPattern_t;
|
||||
|
||||
// Used to pass NCCL call information between functions
|
||||
struct ncclInfo {
|
||||
@@ -47,110 +28,6 @@ struct ncclInfo {
|
||||
// Algorithm details
|
||||
int chunkSteps;
|
||||
int sliceSteps;
|
||||
// Computed later
|
||||
ncclDevRedOpFull opFull;
|
||||
ncclPattern_t pattern;
|
||||
size_t nBytes;
|
||||
size_t aggnBytes;
|
||||
size_t workBytes;
|
||||
size_t sendbuffSize;
|
||||
size_t recvbuffSize;
|
||||
int stepSize;
|
||||
int chunkCount;
|
||||
int chunkSize;
|
||||
int channelId;
|
||||
int workFuncIndex;
|
||||
ncclRegBufferType regBufType;
|
||||
void* regBufSend[NCCL_MAX_LOCAL_RANKS];
|
||||
void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
|
||||
// collnet buffer reg handles
|
||||
void* sendMhandle;
|
||||
void* recvMhandle;
|
||||
// Need to initialize
|
||||
int nThreads;
|
||||
int nChannels;
|
||||
int algorithm;
|
||||
int protocol;
|
||||
bool userTuned;
|
||||
struct ncclInfo *next;
|
||||
};
|
||||
|
||||
// Fill in the size fields of `info` that follow from its coll, count and
// datatype, and from the number of ranks. Always returns ncclSuccess.
//
// Writes: workBytes, nBytes, sendbuffSize, recvbuffSize; for AllGather and
// Broadcast also rewrites count/datatype so the operation is expressed in bytes.
inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) {
  bool isAllGather = (info->coll == ncclFuncAllGather);
  bool isReduceScatter = (info->coll == ncclFuncReduceScatter);
  bool isBroadcast = (info->coll == ncclFuncBroadcast);

  size_t bytes = info->count * ncclTypeSize(info->datatype);
  info->workBytes = bytes;
  info->nBytes = bytes;

  if (isAllGather || isBroadcast) {
    // Re-express the operation as a transfer of workBytes int8 elements.
    info->count = info->workBytes;
    info->datatype = ncclInt8;
  }

  // count is per rank for these collectives
  if (isAllGather || isReduceScatter) info->nBytes *= nRanks;

  /* compute buffer size for NVLS buffer registration */
  if (isAllGather) {
    info->sendbuffSize = info->workBytes;
    info->recvbuffSize = info->sendbuffSize * nRanks;
  } else if (isReduceScatter) {
    info->recvbuffSize = info->workBytes;
    info->sendbuffSize = info->recvbuffSize * nRanks;
  } else {
    info->recvbuffSize = info->workBytes;
    info->sendbuffSize = info->recvbuffSize;
  }
  return ncclSuccess;
}
|
||||
|
||||
struct ncclTaskColl {
|
||||
struct ncclTaskColl* next;
|
||||
ncclFunc_t func;
|
||||
void const* sendbuff;
|
||||
void* recvbuff;
|
||||
size_t count;
|
||||
int root;
|
||||
ncclDataType_t datatype;
|
||||
ncclDevRedOpFull op;
|
||||
int chunkSteps, sliceSteps;
|
||||
struct ncclInfo info;
|
||||
};
|
||||
// One queued send or recv task; linked through `next` into the per-peer
// sendQueue/recvQueue of ncclTasks::Peer.
struct ncclTaskP2p {
  ncclTaskP2p *next;
  void *buff;
  size_t bytes;
  // Stateful chunk index: when a p2p is "cut" over two plans, this records
  // where it left off.
  int chunk;
};
|
||||
|
||||
struct ncclCudaStreamList {
|
||||
struct ncclCudaStreamList *next;
|
||||
cudaStream_t stream;
|
||||
};
|
||||
struct ncclTasks {
|
||||
struct Peer {
|
||||
bool sendSeen, recvSeen;
|
||||
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> sendQueue;
|
||||
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> recvQueue;
|
||||
};
|
||||
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collQueue;
|
||||
// Queue for user-tuned executed collectives
|
||||
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collTunedQueue;
|
||||
// Queue for continuous bytes distribution (CBD) collectives
|
||||
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collCBDQueue;
|
||||
// Queue for collnet
|
||||
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collnetQueue;
|
||||
size_t workBytesTotal;
|
||||
int usableChannels;
|
||||
bool sorted;
|
||||
struct Peer* peers/*[nRanks]*/;
|
||||
int *p2pSendOrder, *p2pRecvOrder;
|
||||
int p2pOrderSteps;
|
||||
int nTasksColl, nTasksP2p;
|
||||
|
||||
// The list of user streams aggregated over all tasks present.
|
||||
struct ncclCudaStreamList* streams;
|
||||
// The most recent user stream. Ignored if streams==nullptr
|
||||
cudaStream_t streamRecent;
|
||||
// The graph capturing all user streams or invalid if none. Thus we restrict the
|
||||
// user that all streams must be captured in the same graph or not captured
|
||||
// at all. Technically we could probably relax this, but that would mean
|
||||
// collecting a different `ncclTasks` per graph and one for non-graph.
|
||||
struct ncclCudaGraph capturingGraph;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -7,8 +7,33 @@
|
||||
#ifndef NCCL_DEBUG_H_
|
||||
#define NCCL_DEBUG_H_
|
||||
|
||||
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
/* Log levels passed as the first argument of ncclDebugLogger_t.
 * The numeric values are explicit and must stay stable. */
typedef enum {
  NCCL_LOG_NONE    = 0,
  NCCL_LOG_VERSION = 1,
  NCCL_LOG_WARN    = 2,
  NCCL_LOG_INFO    = 3,
  NCCL_LOG_ABORT   = 4,
  NCCL_LOG_TRACE   = 5
} ncclDebugLogLevel;
|
||||
|
||||
/* Debug subsystem categories. Each category occupies one bit so categories
 * can be OR-ed together; NCCL_ALL has every bit set. */
typedef enum {
  NCCL_INIT      = 1 << 0,
  NCCL_COLL      = 1 << 1,
  NCCL_P2P       = 1 << 2,
  NCCL_SHM       = 1 << 3,
  NCCL_NET       = 1 << 4,
  NCCL_GRAPH     = 1 << 5,
  NCCL_TUNING    = 1 << 6,
  NCCL_ENV       = 1 << 7,
  NCCL_ALLOC     = 1 << 8,
  NCCL_CALL      = 1 << 9,
  NCCL_PROXY     = 1 << 10,
  NCCL_NVLS      = 1 << 11,
  NCCL_BOOTSTRAP = 1 << 12,
  NCCL_REG       = 1 << 13,
  NCCL_PROFILE   = 1 << 14,
  NCCL_ALL       = ~0
} ncclDebugLogSubSys;
|
||||
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
@@ -40,4 +65,5 @@ typedef enum {
|
||||
#define NCCL_PROTO_LL128 1
|
||||
#define NCCL_PROTO_SIMPLE 2
|
||||
|
||||
#define NCCL_ALGO_PROTO_IGNORE -1.0
|
||||
#endif
|
||||
|
||||
@@ -11,6 +11,54 @@
|
||||
#include "nccl.h"
|
||||
#include "nccl_common.h"
|
||||
|
||||
// API to be implemented by external tuner
|
||||
typedef struct {
|
||||
// Name of the tuner
|
||||
const char* name;
|
||||
|
||||
// Initializes tuner states.
|
||||
// Inputs:
|
||||
// - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner.
|
||||
// - nNodes: number of nodes in current communicator.
|
||||
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
|
||||
// Outputs:
|
||||
// - context: tuner context object
|
||||
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
|
||||
|
||||
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
|
||||
// Inputs:
|
||||
// - context: tuner context object
|
||||
// - collType: collective type , e.g., allreduce, allgather…
|
||||
// - nBytes: collective size in bytes
|
||||
// - numPipeOps: number of operations in the group
|
||||
// - numAlgo: number of algorithms in collCostTable
|
||||
// - numProto: number of protocols in collCostTable
|
||||
//
|
||||
// Outputs:
|
||||
// - nChannels: number of channels (hence SMs) to be used.
|
||||
//
|
||||
// InOut:
|
||||
// - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
|
||||
// NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
|
||||
//
|
||||
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
|
||||
// default tuning for the given collective.
|
||||
// Also, the plugin is allowed to not set any output, or set only the
|
||||
// algorithm and protocol, but not only the algorithm or only the protocol.
|
||||
// Unset fields will be set automatically by NCCL.
|
||||
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
|
||||
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
|
||||
int* nChannels);
|
||||
|
||||
// Terminates the plugin and cleans up any resources that the plugin allocated.
|
||||
// context: tuner context object
|
||||
ncclResult_t (*destroy)(void* context);
|
||||
} ncclTuner_v3_t;
|
||||
|
||||
typedef ncclTuner_v3_t ncclTuner_t;
|
||||
|
||||
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"
|
||||
|
||||
// API to be implemented by external tuner
|
||||
typedef struct {
|
||||
// Name of the tuner
|
||||
@@ -36,7 +84,7 @@ typedef struct {
|
||||
//
|
||||
// Outputs:
|
||||
// - algorithm: selected algorithm to be used for the given collective
|
||||
// - protocol: selected protocol to be used for the given collective
|
||||
// - protocol: selected protocol to be used for the given collective
|
||||
// - nChannels: number of channels (hence SMs) to be used.
|
||||
//
|
||||
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
|
||||
@@ -46,15 +94,11 @@ typedef struct {
|
||||
// Unset fields will be set automatically by NCCL.
|
||||
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps,
|
||||
int *algorithm, int *protocol, int* nChannels);
|
||||
int* algorithm, int* protocol, int* nChannels);
|
||||
|
||||
// Terminates the plugin and cleans up any resources that the plugin allocated.
|
||||
// context: tuner context object
|
||||
ncclResult_t (*destroy)(void* context);
|
||||
} ncclTuner_v2_t;
|
||||
|
||||
typedef ncclTuner_v2_t ncclTuner_t;
|
||||
|
||||
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
|
||||
|
||||
#endif
|
||||
|
||||
@@ -14,8 +14,10 @@
|
||||
|
||||
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
|
||||
|
||||
ncclResult_t ncclNetPluginInit();
|
||||
ncclResult_t ncclNetPluginLoad(struct ncclComm* comm);
|
||||
ncclResult_t ncclNetPluginUnload(struct ncclComm* comm);
|
||||
ncclResult_t ncclNetInit(struct ncclComm* comm);
|
||||
ncclResult_t ncclNetFinalize(struct ncclComm* comm);
|
||||
int ncclNetVersion(struct ncclComm* comm);
|
||||
|
||||
// Test whether the current GPU support GPU Direct RDMA.
|
||||
|
||||
@@ -253,6 +253,38 @@ typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t;
|
||||
*/
|
||||
#define nvmlGpuFabricInfo_v2 NVML_STRUCT_VERSION(GpuFabricInfo, 2)
|
||||
|
||||
/**
|
||||
* Confidential Compute Feature Status values
|
||||
*/
|
||||
#define NVML_CC_SYSTEM_FEATURE_DISABLED 0
|
||||
#define NVML_CC_SYSTEM_FEATURE_ENABLED 1
|
||||
|
||||
/* Confidential Compute system state: three unsigned fields. */
typedef struct nvmlConfComputeSystemState_st {
  unsigned int environment;
  unsigned int ccFeature;    /* presumably one of NVML_CC_SYSTEM_FEATURE_*; confirm against nvml.h */
  unsigned int devToolsMode;
} nvmlConfComputeSystemState_t;
|
||||
|
||||
/**
|
||||
* Confidential Compute Multigpu mode values
|
||||
*/
|
||||
#define NVML_CC_SYSTEM_MULTIGPU_NONE 0
|
||||
#define NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE 1
|
||||
|
||||
/**
|
||||
* Confidential Compute System settings
|
||||
*/
|
||||
typedef struct {
  unsigned int version;      /* struct version; see nvmlSystemConfComputeSettings_v1 */
  unsigned int environment;
  unsigned int ccFeature;    /* presumably one of NVML_CC_SYSTEM_FEATURE_*; confirm against nvml.h */
  unsigned int devToolsMode;
  unsigned int multiGpuMode; /* presumably one of NVML_CC_SYSTEM_MULTIGPU_*; confirm against nvml.h */
} nvmlSystemConfComputeSettings_v1_t;
|
||||
|
||||
typedef nvmlSystemConfComputeSettings_v1_t nvmlSystemConfComputeSettings_t;
|
||||
#define nvmlSystemConfComputeSettings_v1 NVML_STRUCT_VERSION(SystemConfComputeSettings, 1)
|
||||
|
||||
/* End of nvml.h */
|
||||
#endif // NCCL_NVML_DIRECT
|
||||
|
||||
@@ -268,6 +300,11 @@ extern int ncclNvmlDeviceCount;
|
||||
extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices];
|
||||
extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices];
|
||||
|
||||
// Confidential Compute status as reported through ncclNvmlGetCCStatus().
struct ncclNvmlCCStatus {
  bool CCEnabled;
  bool multiGpuCCEnabled;
};
|
||||
|
||||
// All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly.
|
||||
// Outsiders need only call it if they want to inspect the ncclNvml global
|
||||
// tables above.
|
||||
@@ -283,5 +320,6 @@ ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* ma
|
||||
ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus);
|
||||
ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values);
|
||||
ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo);
|
||||
ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status);
|
||||
|
||||
#endif // End include guard
|
||||
|
||||
@@ -63,7 +63,7 @@ class payload_schema {
|
||||
nullptr,
|
||||
NVTX_PAYLOAD_SCHEMA_TYPE_STATIC,
|
||||
NVTX_PAYLOAD_SCHEMA_FLAG_NONE,
|
||||
nullptr, 0, 0, 0};
|
||||
nullptr, 0, 0, 0, 0, nullptr};
|
||||
};
|
||||
|
||||
// Create NVTX push/pop range with parameters
|
||||
|
||||
@@ -25,9 +25,9 @@
|
||||
*
|
||||
* \section INITIALIZATION_SECTION Initialization
|
||||
*
|
||||
* Typically the tool's library that plugs into NVTX is indirectly
|
||||
* loaded via environmental properties that are platform specific.
|
||||
* For some platform or special cases, the user may be required
|
||||
* Typically the tool's library that plugs into NVTX is indirectly
|
||||
* loaded via environmental properties that are platform specific.
|
||||
* For some platform or special cases, the user may be required
|
||||
* to explicitly initialize instead. This can also
|
||||
* be helpful to control when the API loads a tool's library instead
|
||||
* of what would typically be the first function call to emit info.
|
||||
@@ -37,16 +37,16 @@
|
||||
*
|
||||
* Markers and ranges are used to describe events at a specific time (markers)
|
||||
* or over a time span (ranges) during the execution of the application
|
||||
* respectively.
|
||||
* respectively.
|
||||
*
|
||||
* \subsection MARKERS Markers
|
||||
*
|
||||
*
|
||||
* Markers denote specific moments in time.
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
* See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on
|
||||
* how to specify the domain.
|
||||
*
|
||||
*
|
||||
* \subsection THREAD_RANGES Thread Ranges
|
||||
*
|
||||
* Thread ranges denote nested time ranges. Nesting is maintained per thread
|
||||
@@ -59,9 +59,9 @@
|
||||
*
|
||||
* \subsection PROCESS_RANGES Process Ranges
|
||||
*
|
||||
* Process ranges denote a time span that can expose arbitrary concurrency, as
|
||||
* Process ranges denote a time span that can expose arbitrary concurrency, as
|
||||
* opposed to thread ranges that only support nesting. In addition the range
|
||||
* start event can happen on a different thread than the end marker. For the
|
||||
* start event can happen on a different thread than the end marker. For the
|
||||
* correlation of a start/end pair an unique correlation ID is used that is
|
||||
* returned from the start API call and needs to be passed into the end API
|
||||
* call.
|
||||
@@ -87,15 +87,15 @@
|
||||
*
|
||||
* The function ::nvtxDomainCreateA or ::nvtxDomainCreateW is used to create
|
||||
* a named domain.
|
||||
*
|
||||
*
|
||||
* Each domain maintains its own
|
||||
* - categories
|
||||
* - thread range stacks
|
||||
* - registered strings
|
||||
*
|
||||
* The function ::nvtxDomainDestroy marks the end of the domain. Destroying
|
||||
* a domain unregisters and destroys all objects associated with it such as
|
||||
* registered strings, resource objects, named categories, and started ranges.
|
||||
* The function ::nvtxDomainDestroy marks the end of the domain. Destroying
|
||||
* a domain unregisters and destroys all objects associated with it such as
|
||||
* registered strings, resource objects, named categories, and started ranges.
|
||||
*
|
||||
* \section RESOURCE_NAMING Resource Naming
|
||||
*
|
||||
@@ -105,41 +105,41 @@
|
||||
* The functions can be called multiple times during the execution of an
|
||||
* application, however, in that case it is implementation dependent which
|
||||
* name will be reported by the tool.
|
||||
*
|
||||
*
|
||||
* \subsection CATEGORY_NAMING Category Naming
|
||||
*
|
||||
* Some function in this library support associating an integer category
|
||||
* to enable filtering and sorting. The category naming functions allow
|
||||
* the application to associate a user friendly name with the integer
|
||||
* category. Support for domains have been added in NVTX_VERSION_2 to
|
||||
* avoid collisions when domains are developed independently.
|
||||
* Some function in this library support associating an integer category
|
||||
* to enable filtering and sorting. The category naming functions allow
|
||||
* the application to associate a user friendly name with the integer
|
||||
* category. Support for domains have been added in NVTX_VERSION_2 to
|
||||
* avoid collisions when domains are developed independently.
|
||||
*
|
||||
* \subsection RESOURCE_OBJECTS Resource Objects
|
||||
*
|
||||
* Resource objects are a generic mechanism for attaching data to an application
|
||||
* resource. The identifier field makes the association to a pointer or handle,
|
||||
* while the type field helps provide deeper understanding of the identifier as
|
||||
* Resource objects are a generic mechanism for attaching data to an application
|
||||
* resource. The identifier field makes the association to a pointer or handle,
|
||||
* while the type field helps provide deeper understanding of the identifier as
|
||||
* well as enabling differentiation in cases where handles generated by different
|
||||
* APIs may collide. The resource object may also have an associated message to
|
||||
* associate with the application resource, enabling further annotation of this
|
||||
* associate with the application resource, enabling further annotation of this
|
||||
* object and how it is used.
|
||||
*
|
||||
*
|
||||
* The resource object was introduced in NVTX_VERSION_2 to supersede existing naming
|
||||
* functions and allow the application resource identified by those functions to be
|
||||
* associated to a domain. The other naming functions are still supported for backward
|
||||
* compatibility but will be associated only to the default domain.
|
||||
*
|
||||
* \subsection RESOURCE_NAMING_OS Resource Naming
|
||||
*
|
||||
* Some operating system resources creation APIs do not support providing a user friendly
|
||||
* name, such as some OS thread creation APIs. This API supports resource naming
|
||||
* both through resource objects and functions following the pattern
|
||||
* nvtxName[RESOURCE_TYPE][A|W](identifier, name). Resource objects introduced in NVTX_VERSION 2
|
||||
*
|
||||
* Some operating system resources creation APIs do not support providing a user friendly
|
||||
* name, such as some OS thread creation APIs. This API supports resource naming
|
||||
* both through resource objects and functions following the pattern
|
||||
* nvtxName[RESOURCE_TYPE][A|W](identifier, name). Resource objects introduced in NVTX_VERSION 2
|
||||
* supersede the other functions with a more general method of assigning names to OS resources,
|
||||
* along with associating them to domains too. The older nvtxName* functions are only associated
|
||||
* along with associating them to domains too. The older nvtxName* functions are only associated
|
||||
* with the default domain.
|
||||
* \section EXTENSIONS Optional Extensions
|
||||
* Optional extensions will either appear within the existing sections they extend or appear
|
||||
* Optional extensions will either appear within the existing sections they extend or appear
|
||||
* in the "Related Pages" when they introduce new concepts.
|
||||
*/
|
||||
|
||||
@@ -159,7 +159,11 @@
|
||||
#define NVTX_INLINE_STATIC __inline static
|
||||
#else /*defined(__GNUC__)*/
|
||||
#define NVTX_API
|
||||
#if defined(__cplusplus) || (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
|
||||
#define NVTX_INLINE_STATIC inline static
|
||||
#else
|
||||
#define NVTX_INLINE_STATIC __inline__ static
|
||||
#endif
|
||||
#endif /* Platform */
|
||||
|
||||
#if defined(NVTX_NO_IMPL)
|
||||
@@ -212,7 +216,7 @@
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/**
|
||||
/**
|
||||
* Result Codes
|
||||
*/
|
||||
|
||||
@@ -281,12 +285,12 @@ typedef enum nvtxColorType_t
|
||||
* ------------------------------------------------------------------------- */
|
||||
typedef enum nvtxMessageType_t
|
||||
{
|
||||
NVTX_MESSAGE_UNKNOWN = 0, /**< Message payload is unused. */
|
||||
NVTX_MESSAGE_UNKNOWN = 0, /**< Message attribute is unused. */
|
||||
NVTX_MESSAGE_TYPE_ASCII = 1, /**< A character sequence is used as payload. */
|
||||
NVTX_MESSAGE_TYPE_UNICODE = 2, /**< A wide character sequence is used as payload. */
|
||||
/* NVTX_VERSION_2 */
|
||||
NVTX_MESSAGE_TYPE_REGISTERED = 3, /**< A unique string handle that was registered
|
||||
with \ref nvtxDomainRegisterStringA() or
|
||||
with \ref nvtxDomainRegisterStringA() or
|
||||
\ref nvtxDomainRegisterStringW(). */
|
||||
} nvtxMessageType_t;
|
||||
|
||||
@@ -338,7 +342,7 @@ NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved);
|
||||
* ------------------------------------------------------------------------- */
|
||||
typedef enum nvtxPayloadType_t
|
||||
{
|
||||
NVTX_PAYLOAD_UNKNOWN = 0, /**< Color payload is unused. */
|
||||
NVTX_PAYLOAD_UNKNOWN = 0, /**< Payload attribute is unused. */
|
||||
NVTX_PAYLOAD_TYPE_UNSIGNED_INT64 = 1, /**< A 64 bit unsigned integer value is used as payload. */
|
||||
NVTX_PAYLOAD_TYPE_INT64 = 2, /**< A 64 bit signed integer value is used as payload. */
|
||||
NVTX_PAYLOAD_TYPE_DOUBLE = 3, /**< A 64 bit floating point value is used as payload. */
|
||||
@@ -714,10 +718,10 @@ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message);
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Ends a process range.
|
||||
*
|
||||
* \param domain - The domain
|
||||
* \param domain - The domain
|
||||
* \param id - The correlation ID returned from a nvtxRangeStart call.
|
||||
*
|
||||
* \remarks This function is offered for completeness but is an alias for ::nvtxRangeEnd.
|
||||
* \remarks This function is offered for completeness but is an alias for ::nvtxRangeEnd.
|
||||
* It does not need a domain param since that is associated with the range ID at ::nvtxDomainRangeStartEx
|
||||
*
|
||||
* \par Example:
|
||||
@@ -929,10 +933,10 @@ NVTX_DECLSPEC int NVTX_API nvtxRangePop(void);
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \cond SHOW_HIDDEN
|
||||
* \brief Resource typing helpers.
|
||||
* \brief Resource typing helpers.
|
||||
*
|
||||
* Classes are used to make it easy to create a series of resource types
|
||||
* per API without collisions
|
||||
* Classes are used to make it easy to create a series of resource types
|
||||
* per API without collisions
|
||||
*/
|
||||
#define NVTX_RESOURCE_MAKE_TYPE(CLASS, INDEX) ((((uint32_t)(NVTX_RESOURCE_CLASS_ ## CLASS))<<16)|((uint32_t)(INDEX)))
|
||||
#define NVTX_RESOURCE_CLASS_GENERIC 1
|
||||
@@ -1062,7 +1066,7 @@ typedef struct nvtxResourceAttributes_v0
|
||||
int32_t identifierType; /* values from enums following the pattern nvtxResource[name]Type_t */
|
||||
|
||||
/**
|
||||
* \brief Identifier for the resource.
|
||||
* \brief Identifier for the resource.
|
||||
* \anchor RESOURCE_IDENTIFIER_FIELD
|
||||
*
|
||||
* An identifier may be a pointer or a handle to an OS or middleware API object.
|
||||
@@ -1093,7 +1097,7 @@ typedef struct nvtxResourceAttributes_v0
|
||||
|
||||
typedef struct nvtxResourceAttributes_v0 nvtxResourceAttributes_t;
|
||||
|
||||
/* \cond SHOW_HIDDEN
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
#define NVTX_RESOURCE_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxResourceAttributes_v0) ) )
|
||||
@@ -1106,7 +1110,7 @@ typedef struct nvtxResourceHandle* nvtxResourceHandle_t;
|
||||
/** \brief Create a resource object to track and associate data with OS and middleware objects
|
||||
*
|
||||
* Allows users to associate an API handle or pointer with a user-provided name.
|
||||
*
|
||||
*
|
||||
*
|
||||
* \param domain - Domain to own the resource object
|
||||
* \param attribs - Attributes to be associated with the resource
|
||||
@@ -1240,7 +1244,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t*
|
||||
* POSIX pthread_t type returned by pthread_self() may not comply with these
|
||||
* expectations. Please use OS-specific thread ID instead of pthread_t.
|
||||
*
|
||||
* The thread name is associated to the default domain. To support domains
|
||||
* The thread name is associated to the default domain. To support domains
|
||||
* use resource objects via ::nvtxDomainResourceCreate.
|
||||
*
|
||||
* \param threadId - The ID of the thread to name.
|
||||
@@ -1457,7 +1461,7 @@ NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain);
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#define NVTX_IMPL_GUARD /* Ensure other headers cannot included directly */
|
||||
#define NVTX_IMPL_GUARD /* Ensure other headers cannot be included directly */
|
||||
|
||||
#include "nvtxDetail/nvtxTypes.h"
|
||||
|
||||
|
||||
@@ -0,0 +1,335 @@
|
||||
/**
|
||||
* The NVTX counters extension is intended to collect counter values of various
|
||||
* sources. It uses the NVTX payload extension to specify the data layout of a
|
||||
* counter group.
|
||||
*
|
||||
* A counter group is a set of counters that are collected together (at the same
|
||||
* time). Counters are always registered as a group. Hence, a single counter is
|
||||
* represented by a group with one counter.
|
||||
*
|
||||
* A sample refers to all values for a given timestamp. These values must
|
||||
* include counter values and may include multiple instances of a counter group.
|
||||
*
|
||||
* The NVTX domain handle is the first argument to all counter collect
|
||||
* functions. 0/NULL/nullptr represents the default domain (no domain).
|
||||
*/
|
||||
|
||||
#include "nvToolsExtPayload.h"
|
||||
|
||||
#ifndef NVTOOLSEXT_COUNTERS_H
|
||||
#define NVTOOLSEXT_COUNTERS_H
|
||||
|
||||
/**
|
||||
* \brief The compatibility ID is used for versioning of this extension.
|
||||
*/
|
||||
#ifndef NVTX_EXT_COUNTERS_COMPATID
|
||||
#define NVTX_EXT_COUNTERS_COMPATID 0x0101
|
||||
#endif
|
||||
|
||||
/**
|
||||
* \brief The module ID identifies the payload extension. It has to be unique
|
||||
* among the extension modules.
|
||||
*/
|
||||
#ifndef NVTX_EXT_COUNTERS_MODULEID
|
||||
#define NVTX_EXT_COUNTERS_MODULEID 4
|
||||
#endif
|
||||
|
||||
|
||||
/** Identifies an invalid scope and indicates an error if returned by `nvtxScopeRegister`. */
|
||||
#define NVTX_SCOPE_NONE 0 /* no scope */
|
||||
|
||||
#define NVTX_SCOPE_ROOT 1
|
||||
|
||||
#define NVTX_SCOPE_CURRENT_HW_MACHINE 2 /* Node/machine name, Device? */
|
||||
#define NVTX_SCOPE_CURRENT_HW_SOCKET 3
|
||||
#define NVTX_SCOPE_CURRENT_HW_CPU 4
|
||||
#define NVTX_SCOPE_CURRENT_HW_CPU_LOGICAL 5
|
||||
/* Innermost HW execution context at registration time */
|
||||
#define NVTX_SCOPE_CURRENT_HW_INNERMOST 6
|
||||
|
||||
/* Virtualized hardware, virtual machines, OS (if you don't know any better) */
|
||||
#define NVTX_SCOPE_CURRENT_HYPERVISOR 7
|
||||
#define NVTX_SCOPE_CURRENT_VM 8
|
||||
#define NVTX_SCOPE_CURRENT_KERNEL 9
|
||||
#define NVTX_SCOPE_CURRENT_CONTAINER 10
|
||||
#define NVTX_SCOPE_CURRENT_OS 11
|
||||
|
||||
/* Software scopes */
|
||||
#define NVTX_SCOPE_CURRENT_SW_PROCESS 12 /* Process scope */
|
||||
#define NVTX_SCOPE_CURRENT_SW_THREAD 13 /* Thread scope */
|
||||
#define NVTX_SCOPE_CURRENT_SW_FIBER 14
|
||||
/* Innermost SW execution context at registration time */
|
||||
#define NVTX_SCOPE_CURRENT_SW_INNERMOST 15
|
||||
|
||||
/** Static (user-provided) scope IDs (feed forward) */
|
||||
#define NVTX_SCOPE_ID_STATIC_START (1 << 24)
|
||||
|
||||
/** Dynamically (tool) generated scope IDs */
|
||||
#define NVTX_SCOPE_ID_DYNAMIC_START 4294967296 /* 1 << 32 */
|
||||
|
||||
|
||||
/** Identifier of the semantic extension for counters. */
|
||||
#define NVTX_SEMANTIC_ID_COUNTERS_V1 5
|
||||
|
||||
/*** Flags to augment the counter value. ***/
|
||||
#define NVTX_COUNTERS_FLAG_NONE 0
|
||||
|
||||
/**
|
||||
* Convert the fixed point value to a normalized floating point.
|
||||
* Use the sign/unsign from the underlying type this flag is applied to.
|
||||
* Unsigned [0f : 1f] or signed [-1f : 1f]
|
||||
*/
|
||||
#define NVTX_COUNTERS_FLAG_NORM (1 << 1)
|
||||
|
||||
/**
|
||||
* Tools should apply scale and limits when graphing, ideally in a "soft" way to
|
||||
* see when limits are exceeded.
|
||||
*/
|
||||
#define NVTX_COUNTERS_FLAG_LIMIT_MIN (1 << 2)
|
||||
#define NVTX_COUNTERS_FLAG_LIMIT_MAX (1 << 3)
|
||||
#define NVTX_COUNTERS_FLAG_LIMITS \
|
||||
(NVTX_COUNTERS_FLAG_LIMIT_MIN | NVTX_COUNTERS_FLAG_LIMIT_MAX)
|
||||
|
||||
/** Counter time scope **/
|
||||
#define NVTX_COUNTERS_FLAG_TIME_POINT (1 << 5)
|
||||
#define NVTX_COUNTERS_FLAG_TIME_SINCE_LAST (2 << 5)
|
||||
#define NVTX_COUNTERS_FLAG_TIME_UNTIL_NEXT (3 << 5)
|
||||
#define NVTX_COUNTERS_FLAG_TIME_SINCE_START (4 << 5)
|
||||
|
||||
/** Counter value type **/
|
||||
#define NVTX_COUNTERS_FLAG_VALUE_ABSOLUTE (1 << 10)
|
||||
#define NVTX_COUNTERS_FLAG_VALUE_DELTA (2 << 10) // delta to previous counter sample
|
||||
|
||||
/** Counter visualization hints **/
|
||||
#define NVTX_COUNTERS_FLAG_INTERPOLATE (1 << 14)
|
||||
|
||||
/** Datatypes for limits union (value of `limitType`). */
|
||||
#define NVTX_COUNTERS_LIMIT_I64 0
|
||||
#define NVTX_COUNTERS_LIMIT_U64 1
|
||||
#define NVTX_COUNTERS_LIMIT_F64 2
|
||||
|
||||
/** Reasons for the missing sample value. */
|
||||
#define NVTX_COUNTERS_SAMPLE_ZERO 0
|
||||
#define NVTX_COUNTERS_SAMPLE_UNCHANGED 1
|
||||
#define NVTX_COUNTERS_SAMPLE_UNAVAILABLE 2
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/**
|
||||
* \brief Specify additional properties of a counter or counter group.
|
||||
*/
|
||||
typedef struct nvtxSemanticsCounter_v1
|
||||
{
|
||||
/** Header of the semantic extension (with identifier, version, etc.). */
|
||||
struct nvtxSemanticsHeader_v1 header;
|
||||
|
||||
/**
|
||||
* Flag if normalization, scale limits, etc. should be applied to counter
|
||||
* values.
|
||||
*/
|
||||
uint64_t flags;
|
||||
|
||||
/** Unit of the counter value (case insensitive) */
|
||||
const char* unit;
|
||||
|
||||
/** Should be 1 if not used. */
|
||||
uint64_t unitScaleNumerator;
|
||||
|
||||
/** Should be 1 if not used. */
|
||||
uint64_t unitScaleDenominator;
|
||||
|
||||
/** Determines the used union member. Use defines `NVTX_COUNTERS_LIMIT_*`. */
|
||||
int64_t limitType;
|
||||
|
||||
/** Soft graph limit. */
|
||||
union limits_t {
|
||||
int64_t i64[2];
|
||||
uint64_t u64[2];
|
||||
double d[2];
|
||||
} limits;
|
||||
} nvtxSemanticsCounter_t;
|
||||
|
||||
typedef struct nvtxCountersAttr_v1
|
||||
{
|
||||
size_t structSize;
|
||||
|
||||
/**
|
||||
* A schema ID referring to the data layout of the counter group or a
|
||||
* predefined NVTX payloads number type.
|
||||
*/
|
||||
uint64_t schemaId;
|
||||
|
||||
/** Name of the counter group. */
|
||||
const char* name;
|
||||
|
||||
/** Identifier of the scope of the counters. */
|
||||
uint64_t scopeId;
|
||||
|
||||
/**
|
||||
* (Optional) Specify additional semantics for a counter (group). The
|
||||
* semantics provided are applied to all counters in a group. If the
|
||||
* semantics should only refer to a single counter in a group, the semantics
|
||||
* field of the payload entry has to be used. Accepted semantics are
|
||||
* `nvtxSemanticsCounter_t` and `nvtxSemanticsTime_t`.
|
||||
*/
|
||||
const nvtxSemanticsHeader_t* semantics;
|
||||
} nvtxCountersAttr_t;
|
||||
|
||||
/* Forward declaration of opaque counter group registration structure */
|
||||
struct nvtxCountersRegistration_st;
|
||||
typedef struct nvtxCountersRegistration_st nvtxCountersRegistration;
|
||||
|
||||
/* \brief Counters Handle Structure.
|
||||
* \anchor COUNTERS_HANDLE_STRUCTURE
|
||||
*
|
||||
* This structure is opaque to the user and is used as a handle to reference a counter group.
|
||||
* This type is returned from tools when using the NVTX API to create a counters group.
|
||||
*/
|
||||
typedef nvtxCountersRegistration* nvtxCountersHandle_t;
|
||||
|
||||
typedef struct nvtxCountersBatch_v1
|
||||
{
|
||||
/** Handle to attributes (data layout, scope, etc.) of a counter (group). */
|
||||
nvtxCountersHandle_t hCounter;
|
||||
|
||||
/** Array of counter samples. */
|
||||
const void* counters;
|
||||
|
||||
/** Size of the `counters` array (in bytes). */
|
||||
size_t cntArrSize;
|
||||
|
||||
/** Array of timestamps or reference-time plus delta pair. `NULL` is used, if
|
||||
timestamps are part of the counter (group) layout. */
|
||||
const void* timestamps;
|
||||
|
||||
/** Size of the `timestamps` array or definition (in bytes). */
|
||||
size_t tsSize;
|
||||
} nvtxCountersBatch_t;
|
||||
|
||||
/**
|
||||
* \brief Register a counter group.
|
||||
*
|
||||
* @param hDomain NVTX domain handle.
|
||||
* @param attr Pointer to the attributes of the counter (group).
|
||||
*
|
||||
* @return Counter handle identifying a counter or counter (group).
|
||||
* The counter handle is unique within the NVTX domain.
|
||||
*/
|
||||
NVTX_DECLSPEC nvtxCountersHandle_t NVTX_API nvtxCountersRegister(
|
||||
nvtxDomainHandle_t hDomain,
|
||||
const nvtxCountersAttr_t* attr);
|
||||
|
||||
/**
|
||||
* \brief Sample one integer counter by value immediately (the NVTX tool determines the timestamp).
|
||||
*
|
||||
* @param hDomain handle of the NVTX domain.
|
||||
* @param hCounter handle of the NVTX counter (group).
|
||||
* @param value 64-bit integer counter value.
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxCountersSampleInt64(
|
||||
nvtxDomainHandle_t hDomain,
|
||||
nvtxCountersHandle_t hCounter,
|
||||
int64_t value);
|
||||
|
||||
/**
|
||||
* \brief Sample one floating point counter by value immediately (the NVTX tool determines the timestamp).
|
||||
*
|
||||
* @param hDomain handle of the NVTX domain.
|
||||
* @param hCounter handle of the NVTX counter (group).
|
||||
* @param value 64-bit floating-point counter value.
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxCountersSampleFloat64(
|
||||
nvtxDomainHandle_t hDomain,
|
||||
nvtxCountersHandle_t hCounter,
|
||||
double value);
|
||||
|
||||
/**
|
||||
* \brief Sample a counter group by reference immediately (the NVTX tool determines the timestamp).
|
||||
*
|
||||
* @param hDomain handle of the NVTX domain.
|
||||
* @param hCounter handle of the NVTX counter (group).
|
||||
* @param values pointer to one or more counter values.
|
||||
* @param size size of the counter value(s) in bytes.
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxCountersSample(
|
||||
nvtxDomainHandle_t hDomain,
|
||||
nvtxCountersHandle_t hCounter,
|
||||
void* values,
|
||||
size_t size);
|
||||
|
||||
/**
|
||||
* \brief Sample without value.
|
||||
*
|
||||
* @param hDomain handle of the NVTX domain.
|
||||
* @param hCounter handle of the NVTX counter (group).
|
||||
* @param reason reason for the missing sample value.
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxCountersSampleNoValue(
|
||||
nvtxDomainHandle_t hDomain,
|
||||
nvtxCountersHandle_t hCounter,
|
||||
uint8_t reason);
|
||||
|
||||
/**
|
||||
* \brief Submit a batch of counters in the given domain.
|
||||
* Timestamps are part of the counter sample data.
|
||||
*
|
||||
* The size of a data sampling point is defined by the `staticSize` field of the
|
||||
* payload schema. An NVTX tool can assume that the counter samples are stored
|
||||
* as an array with each entry being `staticSize` bytes.
|
||||
*
|
||||
* @param hDomain handle of the NVTX domain
|
||||
* @param hCounter handle of the counter group (includes counter data decoding schema)
|
||||
* @param counters blob containing counter data and timestamps
|
||||
* @param size size of the counter data blob in bytes
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxCountersSubmitBatch(
|
||||
nvtxDomainHandle_t hDomain,
|
||||
nvtxCountersHandle_t hCounter,
|
||||
const void* counters,
|
||||
size_t size);
|
||||
|
||||
/**
|
||||
* \brief Submit a batch of counters in the given domain.
|
||||
* Timestamps are separated from the counter data.
|
||||
*
|
||||
* @param hDomain handle of the NVTX domain
|
||||
* @param counterBatch Pointer to the counter data to be submitted.
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxCountersSubmitBatchEx(
|
||||
nvtxDomainHandle_t hDomain,
|
||||
const nvtxCountersBatch_t* counterBatch);
|
||||
|
||||
|
||||
#define NVTX3EXT_CBID_nvtxCountersRegister 0
|
||||
#define NVTX3EXT_CBID_nvtxCountersSampleInt64 1
|
||||
#define NVTX3EXT_CBID_nvtxCountersSampleFloat64 2
|
||||
#define NVTX3EXT_CBID_nvtxCountersSample 3
|
||||
#define NVTX3EXT_CBID_nvtxCountersSampleNoValue 4
|
||||
#define NVTX3EXT_CBID_nvtxCountersSubmitBatch 5
|
||||
#define NVTX3EXT_CBID_nvtxCountersSubmitBatchEx 6
|
||||
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC visibility push(internal)
|
||||
#endif
|
||||
|
||||
#define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot be included directly */
|
||||
#include "nvtxDetail/nvtxExtTypes.h"
|
||||
#undef NVTX_EXT_TYPES_GUARD
|
||||
|
||||
#ifndef NVTX_NO_IMPL
|
||||
#define NVTX_EXT_IMPL_COUNTERS_GUARD /* Ensure other headers cannot be included directly */
|
||||
#include "nvtxDetail/nvtxExtImplCounters_v1.h"
|
||||
#undef NVTX_EXT_IMPL_COUNTERS_GUARD
|
||||
#endif /*NVTX_NO_IMPL*/
|
||||
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC visibility pop
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#endif /* NVTOOLSEXT_COUNTERS_H */
|
||||
@@ -30,7 +30,7 @@ extern "C" {
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* \cond SHOW_HIDDEN
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief Used to build a non-colliding value for resource types separated class
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
@@ -133,7 +133,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name)
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#ifndef NVTX_NO_IMPL
|
||||
#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot included directly */
|
||||
#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot be included directly */
|
||||
#include "nvtxDetail/nvtxImplCuda_v3.h"
|
||||
#undef NVTX_IMPL_GUARD_CUDA
|
||||
#endif /*NVTX_NO_IMPL*/
|
||||
|
||||
@@ -31,7 +31,7 @@ extern "C" {
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* \cond SHOW_HIDDEN
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief Used to build a non-colliding value for resource types separated class
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
@@ -109,7 +109,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t*
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#ifndef NVTX_NO_IMPL
|
||||
#define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot included directly */
|
||||
#define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot be included directly */
|
||||
#include "nvtxDetail/nvtxImplCudaRt_v3.h"
|
||||
#undef NVTX_IMPL_GUARD_CUDART
|
||||
#endif /*NVTX_NO_IMPL*/
|
||||
|
||||
@@ -0,0 +1,694 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "nvToolsExt.h"
|
||||
|
||||
#ifndef NVTOOLSEXTV3_MEM_V1
|
||||
#define NVTOOLSEXTV3_MEM_V1
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#define NVTX_EXT_MODULEID_MEM 1
|
||||
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief A compatibility ID value used in structures and initialization to
|
||||
* identify version differences.
|
||||
*/
|
||||
#define NVTX_EXT_COMPATID_MEM 0x0102
|
||||
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief This value is returned by functions that return `nvtxMemHeapHandle_t`,
|
||||
* if a tool is not attached.
|
||||
*/
|
||||
#define NVTX_MEM_HEAP_HANDLE_NO_TOOL ((nvtxMemHeapHandle_t)(intptr_t)-1)
|
||||
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief This value is returned by functions that return `nvtxMemRegionHandle_t`
|
||||
* if a tool is not attached.
|
||||
*/
|
||||
#define NVTX_MEM_REGION_HANDLE_NO_TOOL ((nvtxMemRegionHandle_t)(intptr_t)-1)
|
||||
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief This value is returned by functions that return `nvtxMemPermissionsHandle_t`
|
||||
* if a tool is not attached.
|
||||
*/
|
||||
#define NVTX_MEM_PERMISSIONS_HANDLE_NO_TOOL ((nvtxMemPermissionsHandle_t)-1)
|
||||
|
||||
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief This should not be used and is considered an error but defined to
|
||||
* detect an accidental use of zero or NULL.
|
||||
*/
|
||||
#define NVTX_MEM_HEAP_USAGE_UNKNOWN 0x0
|
||||
|
||||
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief This should not be used and is considered an error but defined to
|
||||
* detect an accidental use of zero or NULL.
|
||||
*/
|
||||
#define NVTX_MEM_TYPE_UNKNOWN 0x0
|
||||
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \defgroup MEMORY Memory
|
||||
* See page \ref PAGE_MEMORY.
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* \brief To indicate the full process virtual address space as a heap for
|
||||
* functions where a nvtxMemHeapHandle_t is accepted.
|
||||
*
|
||||
* The heap by default is always read-write-execute permissions without creating regions.
|
||||
* Regions created in this heap have read-write access by default but not execute.
|
||||
*/
|
||||
#define NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE ((nvtxMemHeapHandle_t)0)
|
||||
|
||||
/** \brief This heap is a sub-allocator.
|
||||
*
|
||||
* Heap created with this usage should not be accessed by the user until regions are registered.
|
||||
* Regions from a heap with this usage have read-write access by default but not execute.
|
||||
*/
|
||||
#define NVTX_MEM_HEAP_USAGE_TYPE_SUB_ALLOCATOR 0x1
|
||||
|
||||
/**
|
||||
* \brief This is a heap of memory that has an explicit layout.
|
||||
*
|
||||
* The layout could be static or dynamic (calculated). This often represents an algorithm's
|
||||
* structures that are packed together. By default this heap is assumed to be accessible for
|
||||
* scopes where the memory is naturally accessible by hardware. Regions may be used to further
|
||||
* annotate or restrict access. A tool may have an option to be more strict, but special
|
||||
* consideration must be made for `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`.
|
||||
*
|
||||
* The behavior of this usage is similar to NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE but
|
||||
* a tool can use it to track special behaviors and reservation.
|
||||
*
|
||||
* Memory in a heap with this usage has read-write permissions by default but not execute without
|
||||
* creating regions. Regions created in this heap have the same default permission access.
|
||||
*/
|
||||
#define NVTX_MEM_HEAP_USAGE_TYPE_LAYOUT 0x2
|
||||
|
||||
|
||||
/**
|
||||
* \brief Standard process userspace virtual addresses for linear allocations.
|
||||
*
|
||||
* APIs that map into this space, such as CUDA UVA should use this type.
|
||||
*
|
||||
* Relevant functions: cudaMalloc, cudaMallocManaged, cudaHostAlloc, cudaMallocHost
|
||||
* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is supported
|
||||
*
|
||||
* nvtxMemHeapRegister receives a heapDesc of type nvtxMemVirtualRangeDesc_t
|
||||
*/
|
||||
#define NVTX_MEM_TYPE_VIRTUAL_ADDRESS 0x1
|
||||
|
||||
|
||||
/**
|
||||
* \brief To indicate you are modifying permissions to the process-wide
|
||||
* full virtual address space.
|
||||
*
|
||||
* This is a companion object to `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`.
|
||||
*/
|
||||
#define NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE ((nvtxMemPermissionsHandle_t)0)
|
||||
|
||||
#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_NONE 0x0
|
||||
#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_READ 0x1
|
||||
#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE 0x2
|
||||
#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_ATOMIC 0x4
|
||||
|
||||
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief Forward declaration of opaque memory heap structure.
|
||||
*/
|
||||
struct nvtxMemHeap_v1;
|
||||
typedef struct nvtxMemHeap_v1 nvtxMemHeap_t;
|
||||
|
||||
/** \brief A handle returned by a tool to represent a memory heap. */
|
||||
typedef nvtxMemHeap_t* nvtxMemHeapHandle_t;
|
||||
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief Forward declaration of opaque memory region structure.
|
||||
*/
|
||||
struct nvtxMemRegion_v1;
|
||||
typedef struct nvtxMemRegion_v1 nvtxMemRegion_t;
|
||||
|
||||
/** \brief A handle returned by a tool to represent a memory region. */
|
||||
typedef nvtxMemRegion_t* nvtxMemRegionHandle_t;
|
||||
|
||||
/** \brief A reference to a memory region (by pointer or handle).
|
||||
* Which member of the union will be determined by a type or flag field outside.
|
||||
*/
|
||||
typedef union nvtxMemRegionRef_t
|
||||
{
|
||||
void const* pointer;
|
||||
nvtxMemRegionHandle_t handle;
|
||||
} nvtxMemRegionRef_t;
|
||||
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief Forward declaration of opaque memory permissions structure
|
||||
*/
|
||||
struct nvtxMemPermissions_v1;
|
||||
typedef struct nvtxMemPermissions_v1 nvtxMemPermissions_t;
|
||||
|
||||
/** \brief A handle returned by a tool to represent a memory permissions mask. */
|
||||
typedef nvtxMemPermissions_t* nvtxMemPermissionsHandle_t;
|
||||
|
||||
|
||||
typedef struct nvtxMemVirtualRangeDesc_v1
|
||||
{
|
||||
size_t size;
|
||||
void const* ptr;
|
||||
} nvtxMemVirtualRangeDesc_v1 ;
|
||||
typedef nvtxMemVirtualRangeDesc_v1 nvtxMemVirtualRangeDesc_t;
|
||||
|
||||
|
||||
/** \brief structure to describe a heap in process virtual memory. */
|
||||
typedef struct nvtxMemHeapDesc_v1
|
||||
{
|
||||
uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
|
||||
uint16_t structSize; /* Size of the structure. */
|
||||
uint32_t reserved0;
|
||||
|
||||
/** \brief Usage characteristics of the heap
|
||||
*
|
||||
* Usage characteristics help tools like memcheckers, sanitizers,
|
||||
* as well as other debugging and profiling tools to determine some
|
||||
* special behaviors they should apply to the heap and its regions.
|
||||
* The value follows the convention NVTX_MEM_HEAP_USAGE_*
|
||||
*
|
||||
* Default Value is 0, which is invalid.
|
||||
*/
|
||||
uint32_t usage;
|
||||
|
||||
/** \brief Memory type characteristics of the heap
|
||||
*
|
||||
* The 'type' indicates how to interpret the ptr field of the heapDesc.
|
||||
* This is intended to support many additional types of memory, beyond
|
||||
* standard process virtual memory, such as API specific memory only
|
||||
* addressed by handles or multi-dimensional memory requiring more complex
|
||||
* descriptions to handle features like strides, tiling, or interlace.
|
||||
*
|
||||
* The value conforms to NVTX_MEM_TYPE_*
|
||||
*
|
||||
* The value in the field 'type' identifies the descriptor type that will
|
||||
* be in the field 'typeSpecificDesc'. 'typeSpecificDesc' is void* because
|
||||
* it is extensible. Example usage is if type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS,
|
||||
* then typeSpecificDesc points to a nvtxMemVirtualRangeDesc_t.
|
||||
*
|
||||
* Default Value is 0, which is invalid.
|
||||
*/
|
||||
uint32_t type;
|
||||
|
||||
/** \brief size of the heap memory descriptor pointed to by typeSpecificDesc
|
||||
*
|
||||
* Default Value is 0 which is invalid.
|
||||
*/
|
||||
size_t typeSpecificDescSize;
|
||||
|
||||
/** \brief Pointer to the heap memory descriptor
|
||||
*
|
||||
* The value in the field 'type' identifies the descriptor type that will
|
||||
* be in the field 'typeSpecificDesc'. 'typeSpecificDesc' is void* because
|
||||
* it is extensible. Example usage is if type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS,
|
||||
* then typeSpecificDesc points to a nvtxMemVirtualRangeDesc_t.
|
||||
*
|
||||
* Default Value is 0, which is invalid.
|
||||
*/
|
||||
void const* typeSpecificDesc;
|
||||
|
||||
/** \brief ID of the category the event is assigned to.
|
||||
*
|
||||
* A category is a user-controlled ID that can be used to group
|
||||
* events. The tool may use category IDs to improve filtering or
|
||||
* enable grouping of events in the same category. The functions
|
||||
* \ref ::nvtxNameCategoryA or \ref ::nvtxNameCategoryW can be used
|
||||
* to name a category.
|
||||
*
|
||||
* Default Value is 0.
|
||||
*/
|
||||
uint32_t category;
|
||||
|
||||
/** \brief Message type specified in this attribute structure.
|
||||
*
|
||||
* Defines the message format of the attribute structure's \ref MESSAGE_FIELD
|
||||
* "message" field.
|
||||
*
|
||||
* Default Value is `NVTX_MESSAGE_UNKNOWN`.
|
||||
*/
|
||||
uint32_t messageType; /* nvtxMessageType_t */
|
||||
|
||||
/** \brief Message assigned to this attribute structure. \anchor MESSAGE_FIELD
|
||||
*
|
||||
* The text message that is attached to an event.
|
||||
*/
|
||||
nvtxMessageValue_t message;
|
||||
|
||||
} nvtxMemHeapDesc_v1 ;
|
||||
typedef nvtxMemHeapDesc_v1 nvtxMemHeapDesc_t;
|
||||
|
||||
/**
|
||||
* \brief Create a memory heap to represent an object or range of memory that will be further
|
||||
* sub-divided into regions.
|
||||
*
|
||||
* The handle used to address the heap will depend on the heap's type. Where the heap is virtual
|
||||
* memory accessible, the address of the heap's memory itself is its handle. This will likewise
|
||||
* be returned from the function.
|
||||
*
|
||||
* For more advanced types, where the heap is not virtual memory accessible the tools may be
|
||||
* responsible for returning a void const * that uniquely identifies the object. Please see
|
||||
* the description of each heap type for more details on whether this is expected to be uniquely
|
||||
* generated by the tool or otherwise.
|
||||
*/
|
||||
NVTX_DECLSPEC nvtxMemHeapHandle_t NVTX_API nvtxMemHeapRegister(
|
||||
nvtxDomainHandle_t domain,
|
||||
nvtxMemHeapDesc_t const* desc);
|
||||
|
||||
/** \brief Destroy a memory heap. */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxMemHeapUnregister(
|
||||
nvtxDomainHandle_t domain,
|
||||
nvtxMemHeapHandle_t heap);/* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported */
|
||||
|
||||
/**
|
||||
* \brief Resetting the memory heap wipes out any changes, as if it were a fresh heap.
|
||||
*
|
||||
* This includes invalidating all regions and their handles.
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxMemHeapReset(
|
||||
nvtxDomainHandle_t domain,
|
||||
nvtxMemHeapHandle_t heap); /* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is supported */
|
||||
|
||||
/**
|
||||
* \brief Register a region of memory inside of a heap.
|
||||
*
|
||||
* The heap refers to the heap within which the region resides. This can be from
|
||||
* `nvtxMemHeapRegister`, `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`, or one provided
|
||||
* from other extension API.
|
||||
*
|
||||
* The regionType arg will define which type is used in regionDescArray.
|
||||
* The most commonly used type is `NVTX_MEM_TYPE_VIRTUAL_ADDRESS`.
|
||||
* In this case regionDescElements is an array of `nvtxMemVirtualRangeDesc_t`.
|
||||
*
|
||||
* The regionCount arg is how many elements are in regionDescArray and regionHandleArrayOut.
|
||||
*
|
||||
* The regionHandleArrayOut arg points to an array where the tool will provide region handles. If
|
||||
* a pointer is provided, it is expected to have regionCount elements. This pointer can be NULL if
|
||||
* regionType is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. In this case, the user can use the pointer to the
|
||||
* virtual memory to reference the region in other related functions which accept nvtxMemRegionRef_t.
|
||||
*/
|
||||
typedef struct nvtxMemRegionsRegisterBatch_v1
|
||||
{
|
||||
uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
|
||||
uint16_t structSize; /* Size of the structure. */
|
||||
|
||||
uint32_t regionType; /* NVTX_MEM_TYPE_* */
|
||||
|
||||
nvtxMemHeapHandle_t heap;
|
||||
|
||||
size_t regionCount;
|
||||
size_t regionDescElementSize;
|
||||
void const* regionDescElements; /* This will also become the handle for this region. */
|
||||
nvtxMemRegionHandle_t* regionHandleElementsOut; /* This will also become the handle for this region. */
|
||||
|
||||
} nvtxMemRegionsRegisterBatch_v1;
|
||||
typedef nvtxMemRegionsRegisterBatch_v1 nvtxMemRegionsRegisterBatch_t;
|
||||
|
||||
/** \brief Register a region of memory inside of a heap of linear process virtual memory
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxMemRegionsRegister(
|
||||
nvtxDomainHandle_t domain,
|
||||
nvtxMemRegionsRegisterBatch_t const* desc);
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* \brief Register a region of memory inside of a heap.
|
||||
*
|
||||
* The heap refers to the heap within which the region resides.
|
||||
* This can be from nvtxMemHeapRegister, NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE, or
|
||||
* one provided from other extension API.
|
||||
*
|
||||
* The regionType arg will define which type is used in regionDescArray.
|
||||
* The most commonly used type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS.
|
||||
*
|
||||
* The regionCount arg is how many elements are in regionDescArray and regionHandleArrayOut.
|
||||
*
|
||||
* The regionHandleArrayOut arg points to an array where the tool will provide region handles. If
|
||||
* a pointer is provided, it is expected to have regionCount elements. This pointer can be NULL if
|
||||
* regionType is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. In this case, the user can use the pointer to the
|
||||
* virtual memory to reference the region in other related functions which accept nvtxMemRegionRef_t.
|
||||
*/
|
||||
typedef struct nvtxMemRegionsResizeBatch_v1
|
||||
{
|
||||
uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
|
||||
uint16_t structSize; /* Size of the structure. */
|
||||
|
||||
uint32_t regionType; /* NVTX_MEM_TYPE_* */
|
||||
|
||||
size_t regionDescCount;
|
||||
size_t regionDescElementSize;
|
||||
void const* regionDescElements; /* This will also become the handle for this region. */
|
||||
|
||||
} nvtxMemRegionsResizeBatch_v1;
|
||||
typedef nvtxMemRegionsResizeBatch_v1 nvtxMemRegionsResizeBatch_t;
|
||||
|
||||
/** \brief Resize a region of memory inside of a heap of linear process virtual memory
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxMemRegionsResize(
|
||||
nvtxDomainHandle_t domain,
|
||||
nvtxMemRegionsResizeBatch_t const* desc);
|
||||
|
||||
|
||||
#define NVTX_MEM_REGION_REF_TYPE_UNKNOWN 0x0
|
||||
#define NVTX_MEM_REGION_REF_TYPE_POINTER 0x1
|
||||
#define NVTX_MEM_REGION_REF_TYPE_HANDLE 0x2
|
||||
|
||||
/**
|
||||
* \brief Register a region of memory inside of a heap.
|
||||
*
|
||||
* The heap refers to the heap within which the region resides.
|
||||
* This can be from nvtxMemHeapRegister, `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`, or
|
||||
* one provided from other extension API.
|
||||
*
|
||||
* The regionType arg will define which type is used in `regionDescArray`.
|
||||
* The most commonly used type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS.
|
||||
*
|
||||
* The regionCount arg is how many elements are in regionDescArray and regionHandleArrayOut.
|
||||
*
|
||||
* The regionHandleArrayOut arg points to an array where the tool will provide region handles.
|
||||
* If a pointer is provided, it is expected to have regionCount elements.
|
||||
* This pointer can be NULL if regionType is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. In this case,
|
||||
* the user can use the pointer to the virtual memory to reference the region in other
|
||||
* related functions which accept a nvtxMemRegionRef_t.
|
||||
*/
|
||||
typedef struct nvtxMemRegionsUnregisterBatch_v1
|
||||
{
|
||||
uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
|
||||
uint16_t structSize; /* Size of the structure. */
|
||||
|
||||
uint32_t refType; /* NVTX_MEM_REGION_REF_TYPE_* */
|
||||
|
||||
size_t refCount; /* count of elements in refArray */
|
||||
size_t refElementSize;
|
||||
nvtxMemRegionRef_t const* refElements; /* This will also become the handle for this region. */
|
||||
|
||||
} nvtxMemRegionsUnregisterBatch_v1;
|
||||
typedef nvtxMemRegionsUnregisterBatch_v1 nvtxMemRegionsUnregisterBatch_t;
|
||||
|
||||
/**
|
||||
* \brief Unregistration for regions of process virtual memory
|
||||
*
|
||||
* This is not necessary if the nvtx heap destroy function has been called that
|
||||
* contains this object.
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxMemRegionsUnregister(
|
||||
nvtxDomainHandle_t domain,
|
||||
nvtxMemRegionsUnregisterBatch_t const* desc);
|
||||
|
||||
typedef struct nvtxMemRegionNameDesc_v1
|
||||
{
|
||||
uint32_t regionRefType; /* NVTX_MEM_REGION_REF_TYPE_* */
|
||||
uint32_t nameType; /* nvtxMessageType_t */
|
||||
|
||||
nvtxMemRegionRef_t region;
|
||||
nvtxMessageValue_t name;
|
||||
|
||||
uint32_t category;
|
||||
uint32_t reserved0;
|
||||
} nvtxMemRegionNameDesc_v1;
|
||||
typedef nvtxMemRegionNameDesc_v1 nvtxMemRegionNameDesc_t;
|
||||
|
||||
|
||||
typedef struct nvtxMemRegionsNameBatch_v1
|
||||
{
|
||||
uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
|
||||
uint16_t structSize; /* Size of the structure. */
|
||||
|
||||
uint32_t reserved0;
|
||||
|
||||
size_t regionCount;
|
||||
size_t regionElementSize;
|
||||
nvtxMemRegionNameDesc_t const* regionElements;
|
||||
size_t reserved1;
|
||||
} nvtxMemRegionsNameBatch_v1 ;
|
||||
typedef nvtxMemRegionsNameBatch_v1 nvtxMemRegionsNameBatch_t;
|
||||
|
||||
|
||||
/** \brief Name or rename a region. */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxMemRegionsName(
|
||||
nvtxDomainHandle_t domain,
|
||||
nvtxMemRegionsNameBatch_t const* desc);
|
||||
|
||||
/** \brief There are no permissions for this memory. */
|
||||
#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_NONE 0x0
|
||||
|
||||
/** \brief The memory is readable. */
|
||||
#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_READ 0x1
|
||||
|
||||
/** \brief The memory is writable. */
|
||||
#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_WRITE 0x2
|
||||
|
||||
/** \brief The memory is for atomic RW. */
|
||||
#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_ATOMIC 0x4
|
||||
|
||||
/**
|
||||
* \brief The memory access permissions are reset for a region.
|
||||
*
|
||||
* This is as if never set, rather than documented defaults. As a result, any flags
|
||||
* indicating how unspecified regions are handled will affect this area.
|
||||
*
|
||||
* This should not be used with READ, WRITE, nor ATOMIC, as those flags would have no effect.
|
||||
*/
|
||||
#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_RESET 0x8
|
||||
|
||||
|
||||
typedef struct nvtxMemPermissionsAssignRegionDesc_v1
|
||||
{
|
||||
uint32_t flags; /* NVTX_MEM_PERMISSIONS_REGION_FLAGS_* */
|
||||
uint32_t regionRefType; /* NVTX_MEM_REGION_REF_TYPE_* */
|
||||
nvtxMemRegionRef_t region;
|
||||
|
||||
} nvtxMemPermissionsAssignRegionDesc_v1 ;
|
||||
typedef nvtxMemPermissionsAssignRegionDesc_v1 nvtxMemPermissionsAssignRegionDesc_t;
|
||||
|
||||
|
||||
typedef struct nvtxMemPermissionsAssignBatch_v1
|
||||
{
|
||||
uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
|
||||
uint16_t structSize; /* Size of the structure. */
|
||||
|
||||
uint32_t reserved0;
|
||||
|
||||
nvtxMemPermissionsHandle_t permissions;
|
||||
|
||||
size_t regionCount;
|
||||
size_t regionElementSize;
|
||||
nvtxMemPermissionsAssignRegionDesc_t const* regionElements;
|
||||
|
||||
size_t reserved1;
|
||||
} nvtxMemPermissionsAssignBatch_v1 ;
|
||||
typedef nvtxMemPermissionsAssignBatch_v1 nvtxMemPermissionsAssignBatch_t;
|
||||
|
||||
|
||||
/** \brief Change the permissions of a region of process virtual memory. */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsAssign(
|
||||
nvtxDomainHandle_t domain,
|
||||
nvtxMemPermissionsAssignBatch_t const* desc);
|
||||
|
||||
|
||||
/**
|
||||
* \brief Create a permissions object for fine grain thread-local control in
|
||||
* multi-threading scenarios
|
||||
*
|
||||
* Unlike the global permissions object (NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE), a new
|
||||
* permissions object is empty. There are no regions registered to it, so more memory is accessible
|
||||
* if bound(bind) without calls to nvtxMemPermissionsSetAccess* first. The permissions are not
|
||||
* active until nvtxMemPermissionsBind. See `nvtxMemPermissionsBind` for more details.
|
||||
*
|
||||
* Use the flags NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_* to control how the regions in
|
||||
* this permission object will interact with global permissions when bound. You may choose to
|
||||
* either replace global memory regions setting or overlay on top of them. The most common uses are
|
||||
* as follows:
|
||||
* * To limit tools to validate writing exclusively specified in this object but inherit all
|
||||
* global read access regions use `NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE`
|
||||
* * To limit tools to validate both read & write permissions exclusively specified in this
|
||||
* object use NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_READ
|
||||
* & NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE
|
||||
*
|
||||
* Also see `nvtxMemPermissionsBind` & `nvtxMemPermissionsSetAccess*`.
|
||||
*/
|
||||
NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemPermissionsCreate(
|
||||
nvtxDomainHandle_t domain,
|
||||
int32_t creationflags); /* NVTX_MEM_PERMISSIONS_CREATE_FLAGS_* */
|
||||
|
||||
/**
|
||||
* \brief Destroy the permissions object.
|
||||
*
|
||||
* If bound(bind), destroy will also unbind it.
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsDestroy(
|
||||
nvtxDomainHandle_t domain,
|
||||
nvtxMemPermissionsHandle_t permissionsHandle); /* only supported on objects from nvtxMemPermissionsCreate */
|
||||
|
||||
/** \brief Reset the permissions object back to its created state. */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsReset(
|
||||
nvtxDomainHandle_t domain,
|
||||
nvtxMemPermissionsHandle_t permissionsHandle);
|
||||
/* NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE and other special handles are supported */
|
||||
|
||||
|
||||
#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_NONE 0x0
|
||||
|
||||
/** \brief Upon binding, with the thread, exclude parent scope write regions instead of overlaying on top of them.
|
||||
*
|
||||
* EX A developer may choose to first prevent all writes except the ones specified to avoid
|
||||
* OOB writes, since there are typically fewer regions written to than read from.
|
||||
**/
|
||||
#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_WRITE 0x2
|
||||
|
||||
/** \brief Upon binding, with the thread, exclude parent scope read regions instead of overlaying on top of them.
|
||||
*
|
||||
* EX After eliminating any errors when applying strict writes, a developer may then choose to
|
||||
* annotate and enforce strict reads behaviors in segments of code.
|
||||
**/
|
||||
#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_READ 0x1
|
||||
|
||||
/** \brief Upon binding, with the thread, exclude parent scope atomic RW regions instead of overlaying on top of them.
|
||||
*
|
||||
* EX After eliminating any errors from read and write, a developer may choose to ensure
|
||||
* that atomics are in their own region, removing standard read/write, and replacing with
|
||||
* this strict atomic only access. This way they know that conventional reads or writes
|
||||
* will not cause unexpected issues.
|
||||
**/
|
||||
#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_ATOMIC 0x4
|
||||
|
||||
|
||||
#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_UNKNOWN 0x0
|
||||
|
||||
/** \brief Bind to thread scope. In this case, tools should validate that local thread's
|
||||
* execution is honoring the permissions as well as the state of NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE
|
||||
* at the time of binding. If this is not bound then NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE should be
|
||||
* used to validate the memory.
|
||||
*
|
||||
* Not all tools will support every scope, such as a GPU sanitizer.
|
||||
**/
|
||||
#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_CPU_THREAD 0x1
|
||||
|
||||
/**
|
||||
* \brief Bind to CUDA stream scope.
|
||||
*
|
||||
* In this case, work enqueued to a CUDA stream should be validated by the tool,
|
||||
* when it executes, that it respects the permissions of the permissions object at the point
|
||||
* of binding, as well as the appropriate nvtxMemCudaGetDevicePermissions at the
|
||||
* time of binding. If this is not bound then nvtxMemCudaGetDevicePermissions at
|
||||
* the time of stream enqueue should be used to validate the memory.
|
||||
*
|
||||
* This could apply to work done either on the GPU like a kernel launch or to
|
||||
* CPU based callbacks like cudaStreamAddCallback if the tool supports it.
|
||||
*
|
||||
* Binding applies locally to a CPU thread so that if N CPU threads are enqueuing
|
||||
* work to the same stream (like the default stream) that there cannot be a race
|
||||
* condition between thread binding vs launching their work. IE users should
|
||||
* expect the permissions bound in the thread to be honored by the proceeding
|
||||
* work (launches, copies, etc) invoked from in the CPU thread until unbound.
|
||||
*/
|
||||
#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM 0x2
|
||||
|
||||
|
||||
/**
|
||||
* \brief Bind the permissions object into a particular scope on the caller thread
|
||||
*
|
||||
* Permissions do not take effect until binding. Binding permissions is a thread local
|
||||
* activity that overrides global behaviors. This is to avoid multi-threaded race conditions.
|
||||
*
|
||||
* The scope dictates what type of processing it applies to, and when in some cases.
|
||||
* EX1: NVTX_MEM_PERMISSIONS_BIND_SCOPE_CPU_THREAD applies to CPU code accessing memory while bound.
|
||||
* EX2: NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM applies to CUDA streams, and the permissions
|
||||
* must be recorded and applied when the work in the stream dequeues to execute. In this case
|
||||
* it could be GPU or CPU, if the tool supports both.
|
||||
*
|
||||
* Bind can be called again on the same object and thread to take any updates to the
|
||||
* specified permission object or the inherited properties.
|
||||
*
|
||||
* Bind flags support changing how the binding process inherits region access control.
|
||||
* In the case of thread scope this is NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE and from CUDA_STREAM
|
||||
* this is nvtxMemCudaGetDevicePermissions. Choosing stricter modes allows the user to
|
||||
* further reduce the access with less work, since memory by default, behaves as natural
|
||||
* until the NVTX annotations instructs a tool to treat it another way. See strict flags
|
||||
* for more details.
|
||||
*
|
||||
* Also see nvtxMemPermissionsUnbind
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsBind(
|
||||
nvtxDomainHandle_t domain,
|
||||
nvtxMemPermissionsHandle_t permissions, /* special object like NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE are not supported */
|
||||
uint32_t bindScope, /* NVTX_MEM_PERMISSIONS_BIND_SCOPE_* */
|
||||
uint32_t bindFlags); /* NVTX_MEM_PERMISSIONS_BIND_FLAGS_* */
|
||||
|
||||
/**
|
||||
* \brief Unbind the permissions object bound to the caller thread.
|
||||
*
|
||||
* Upon unbind, the thread local permissions for a scope are restored to the default
|
||||
* behavior defined by the scope.
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsUnbind(
|
||||
nvtxDomainHandle_t domain,
|
||||
uint32_t bindScope);
|
||||
|
||||
/** @} */ /*END defgroup*/
|
||||
|
||||
typedef enum NvtxExtMemCallbackId
|
||||
{
|
||||
/* CBID 0 is invalid */
|
||||
NVTX3EXT_CBID_nvtxMemHeapRegister = 1,
|
||||
NVTX3EXT_CBID_nvtxMemHeapUnregister = 2,
|
||||
NVTX3EXT_CBID_nvtxMemHeapReset = 3,
|
||||
NVTX3EXT_CBID_nvtxMemRegionsRegister = 4,
|
||||
NVTX3EXT_CBID_nvtxMemRegionsResize = 5,
|
||||
NVTX3EXT_CBID_nvtxMemRegionsUnregister = 6,
|
||||
NVTX3EXT_CBID_nvtxMemRegionsName = 7,
|
||||
NVTX3EXT_CBID_nvtxMemPermissionsAssign = 8,
|
||||
NVTX3EXT_CBID_nvtxMemPermissionsCreate = 9,
|
||||
NVTX3EXT_CBID_nvtxMemPermissionsDestroy = 10,
|
||||
NVTX3EXT_CBID_nvtxMemPermissionsReset = 11,
|
||||
NVTX3EXT_CBID_nvtxMemPermissionsBind = 12,
|
||||
NVTX3EXT_CBID_nvtxMemPermissionsUnbind = 13,
|
||||
|
||||
/* 14-16 in nvtxExtImplMemCudaRt_v1.h */
|
||||
NVTX3EXT_CBID_nvtxMemCudaGetProcessWidePermissions = 14,
|
||||
NVTX3EXT_CBID_nvtxMemCudaGetDeviceWidePermissions = 15,
|
||||
NVTX3EXT_CBID_nvtxMemCudaSetPeerAccess = 16,
|
||||
|
||||
NVTX3EXT_CBID_MEM_FN_NUM = 17
|
||||
} NvtxExtMemCallbackId;
|
||||
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC visibility push(internal)
|
||||
#endif
|
||||
|
||||
/* Extension types are required for the implementation and the NVTX handler. */
|
||||
#define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot be included directly */
|
||||
#include "nvtxDetail/nvtxExtTypes.h"
|
||||
#undef NVTX_EXT_TYPES_GUARD
|
||||
|
||||
#ifndef NVTX_NO_IMPL
|
||||
/* Ensure other headers cannot be included directly */
|
||||
#define NVTX_EXT_IMPL_MEM_GUARD
|
||||
#include "nvtxDetail/nvtxExtImplMem_v1.h"
|
||||
#undef NVTX_EXT_IMPL_MEM_GUARD
|
||||
#endif /*NVTX_NO_IMPL*/
|
||||
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC visibility pop
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#endif /* NVTOOLSEXTV3_MEM_V1 */
|
||||
@@ -0,0 +1,150 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
#ifndef NVTOOLSEXTV3_MEM_CUDART_V1
|
||||
#define NVTOOLSEXTV3_MEM_CUDART_V1
|
||||
|
||||
#include "nvToolsExtMem.h"
|
||||
|
||||
#include "cuda.h"
|
||||
#include "cuda_runtime.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
|
||||
/** \brief The memory is from a CUDA runtime array.
|
||||
*
|
||||
* Relevant functions: cudaMallocArray, cudaMalloc3DArray
|
||||
* Also cudaArray_t from other types such as cudaMipmappedArray_t
|
||||
*
|
||||
* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported
|
||||
*
|
||||
* nvtxMemHeapRegister receives a heapDesc of type cudaArray_t because the description can be retrieved by tools through cudaArrayGetInfo()
|
||||
* nvtxMemRegionRegisterEx receives a regionDesc of type nvtxMemCudaArrayRangeDesc_t
|
||||
*/
|
||||
#define NVTX_MEM_TYPE_CUDA_ARRAY 0x11
|
||||
|
||||
/** \brief structure to describe memory in a CUDA array object
|
||||
*/
|
||||
typedef struct nvtxMemCudaArrayRangeDesc_v1
|
||||
{
|
||||
uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
|
||||
uint16_t structSize; /* Size of the structure. */
|
||||
uint32_t reserved0;
|
||||
cudaArray_t src;
|
||||
size_t offset[3];
|
||||
size_t extent[3];
|
||||
} nvtxMemCudaArrayRangeDesc_v1;
|
||||
typedef nvtxMemCudaArrayRangeDesc_v1 nvtxMemCudaArrayRangeDesc_t;
|
||||
|
||||
|
||||
/** \brief The memory is from a CUDA device array.
|
||||
*
|
||||
* Relevant functions: cuArrayCreate, cuArray3DCreate
|
||||
* Also CUarray from other types such as CUmipmappedArray
|
||||
*
|
||||
* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported
|
||||
*
|
||||
* nvtxMemHeapRegister receives a heapDesc of type cudaArray_t because the description can be retrieved by tools through cudaArrayGetInfo()
|
||||
* nvtxMemRegionRegisterEx receives a regionDesc of type nvtxMemCuArrayRangeDesc_t
|
||||
*/
|
||||
#define NVTX_MEM_TYPE_CU_ARRAY 0x12
|
||||
|
||||
/** \brief structure to describe memory in a CUDA array object
|
||||
*/
|
||||
typedef struct nvtxMemCuArrayRangeDesc_v1
|
||||
{
|
||||
uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
|
||||
uint16_t structSize; /* Size of the structure. */
|
||||
uint32_t reserved0;
|
||||
CUarray src;
|
||||
size_t offset[3];
|
||||
size_t extent[3];
|
||||
} nvtxMemCuArrayRangeDesc_v1;
|
||||
typedef nvtxMemCuArrayRangeDesc_v1 nvtxMemCuArrayRangeDesc_t;
|
||||
|
||||
/* Reserving 0x2-0xF for more common types */
|
||||
|
||||
#define NVTX_MEM_CUDA_PEER_ALL_DEVICES -1
|
||||
|
||||
/** \brief Get the permission object that represents the CUDA runtime device
|
||||
* or cuda driver context
|
||||
*
|
||||
* This object will allow developers to adjust permissions applied to work executed
|
||||
* on the GPU. It may be inherited or overridden by permissions object bound
|
||||
* with NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM, depending on the binding flags.
|
||||
*
|
||||
* Ex. change the peer to peer access permissions between devices in entirety
|
||||
* or punch through special holes
|
||||
*
|
||||
* By default, all memory is accessible that naturally would be to a CUDA kernel until
|
||||
* modified otherwise by nvtxMemCudaSetPeerAccess or changing regions.
|
||||
*
|
||||
* This object should also represent the CUDA driver API level context.
|
||||
*/
|
||||
NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemCudaGetProcessWidePermissions(
|
||||
nvtxDomainHandle_t domain);
|
||||
|
||||
/** \brief Get the permission object that represents the CUDA runtime device
|
||||
* or cuda driver context
|
||||
*
|
||||
* This object will allow developers to adjust permissions applied to work executed
|
||||
* on the GPU. It may be inherited or overridden by permissions object bound
|
||||
* with NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM, depending on the binding flags.
|
||||
*
|
||||
* Ex. change the peer to peer access permissions between devices in entirety
|
||||
* or punch through special holes
|
||||
*
|
||||
* By default, all memory is accessible that naturally would be to a CUDA kernel until
|
||||
* modified otherwise by nvtxMemCudaSetPeerAccess or changing regions.
|
||||
*
|
||||
* This object should also represent the CUDA driver API level context.
|
||||
*/
|
||||
NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemCudaGetDeviceWidePermissions(
|
||||
nvtxDomainHandle_t domain,
|
||||
int device);
|
||||
|
||||
/** \brief Change the default behavior for all memory mapped in from a particular device.
|
||||
*
|
||||
* While typically all memory defaults to readable and writable, users may desire to limit
|
||||
* access to reduced default permissions such as read-only on a per-device basis.
|
||||
*
|
||||
* Regions can be used to further override smaller windows of memory.
|
||||
*
|
||||
* devicePeer can be NVTX_MEM_CUDA_PEER_ALL_DEVICES
|
||||
*
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxMemCudaSetPeerAccess(
|
||||
nvtxDomainHandle_t domain,
|
||||
nvtxMemPermissionsHandle_t permissions,
|
||||
int devicePeer, /* device number such as from cudaGetDevice() or NVTX_MEM_CUDA_PEER_ALL_DEVICES */
|
||||
uint32_t flags); /* NVTX_MEM_PERMISSIONS_REGION_FLAGS_* */
|
||||
|
||||
/** @} */ /*END defgroup*/
|
||||
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC visibility push(internal)
|
||||
#endif
|
||||
|
||||
#ifndef NVTX_NO_IMPL
|
||||
#define NVTX_EXT_IMPL_MEM_CUDART_GUARD /* Ensure other headers cannot be included directly */
|
||||
#include "nvtxDetail/nvtxExtImplMemCudaRt_v1.h"
|
||||
#undef NVTX_EXT_IMPL_MEM_CUDART_GUARD
|
||||
#endif /*NVTX_NO_IMPL*/
|
||||
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC visibility pop
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#endif /* NVTOOLSEXTV3_MEM_CUDART_V1 */
|
||||
@@ -30,11 +30,11 @@ extern "C" {
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* \cond SHOW_HIDDEN
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief Used to build a non-colliding value for resource types separated class
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
#define NVTX_RESOURCE_CLASS_OPENCL 6
|
||||
#define NVTX_RESOURCE_CLASS_OPENCL 6
|
||||
/** \endcond */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
@@ -183,7 +183,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name)
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#ifndef NVTX_NO_IMPL
|
||||
#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot included directly */
|
||||
#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot be included directly */
|
||||
#include "nvtxDetail/nvtxImplOpenCL_v3.h"
|
||||
#undef NVTX_IMPL_GUARD_OPENCL
|
||||
#endif /*NVTX_NO_IMPL*/
|
||||
|
||||
File diff ditekan karena terlalu besar
Load Diff
@@ -0,0 +1,170 @@
|
||||
/*
|
||||
* Copyright 2023 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "nvtxDetail/nvtxExtPayloadHelperInternal.h"
|
||||
|
||||
|
||||
/* This is just an empty marker (for readability), which can be omitted. */
|
||||
/* TODO: Fix issue with trailing comma at end of entry list. */
|
||||
#define NVTX_PAYLOAD_ENTRIES
|
||||
|
||||
|
||||
/**
|
||||
* Use this macro for payload entries that are defined by a schema (nested
|
||||
* payload schema).
|
||||
*/
|
||||
#define NVTX_PAYLOAD_NESTED(schemaId) _NVTX_PAYLOAD_NESTED(schemaId)
|
||||
|
||||
|
||||
/**
|
||||
* \brief Define a payload schema for an existing C `struct` definition.
|
||||
*
|
||||
* This macro does
|
||||
* 1) create schema description (array of schema entries).
|
||||
* 2) set the schema attributes for a static data layout.
|
||||
*
|
||||
* It can be used in static code or within a function context.
|
||||
*
|
||||
* Example:
|
||||
* NVTX_DEFINE_SCHEMA_FOR_STRUCT(your_struct, "SchemaName",
|
||||
* NVTX_PAYLOAD_ENTRIES(
|
||||
* (index, TYPE_INT, "integer value"),
|
||||
* (dpfloat, TYPE_DOUBLE, "fp64 value"),
|
||||
* (text, TYPE_CSTRING, "text", NULL, 24)
|
||||
* )
|
||||
* )
|
||||
*
|
||||
* It is required to at least provide the struct name and the payload entries.
|
||||
* The first two fields (member name and NVTX entry type) of each payload entry
|
||||
* are required.
|
||||
*
|
||||
* The optional parameters are only allowed to be passed in the predefined order.
|
||||
* Hence, `prefix` requires `schema_name` to be given and
|
||||
* `schema_flags` requires `schema_name` and `prefix` to be given.
|
||||
* The payload entries are always the last parameter. A maximum of 16 schema
|
||||
* entries is supported.
|
||||
*
|
||||
* It is recommended to use `NVTX_PAYLOAD_SCHEMA_REGISTER` to register the schema.
|
||||
*
|
||||
* @param struct_id The name of the struct.
|
||||
* @param schema_name (Optional 1) name of the payload schema. Default is `NULL`.
|
||||
* @param prefix (Optional 2) prefix before the schema and attributes variables,
|
||||
* e.g. `static const`. Leave this empty, if no prefix is desired.
|
||||
* @param schema_flags (Optional 3) flags to augment the payload schema.
|
||||
* Default is `NVTX_PAYLOAD_SCHEMA_FLAG_NONE`.
|
||||
* @param schema_id (Optional 4) User-defined payload schema ID.
|
||||
* @param entries (Mandatory) Payload schema entries. This is always the last
|
||||
* parameter to the macro.
|
||||
*/
|
||||
#define NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, ...) \
|
||||
_NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, __VA_ARGS__)
|
||||
|
||||
|
||||
/**
|
||||
* \brief Define a C struct together with a matching schema.
|
||||
*
|
||||
* This macro does
|
||||
* 1) define the payload type (typedef struct).
|
||||
* 2) create schema description (array of schema entries).
|
||||
* 3) set the schema attributes for a static data layout.
|
||||
*
|
||||
* The macro can be used in static code or within a function context.
|
||||
*
|
||||
* It defines the schema attributes in `struct_id##Attr`. Thus, it is recommended
|
||||
* to use `NVTX_PAYLOAD_SCHEMA_REGISTER(domain, struct_id)` to register the schema.
|
||||
*
|
||||
* Example:
|
||||
* NVTX_DEFINE_STRUCT_WITH_SCHEMA(your_struct_name, "Your schema name",
|
||||
* NVTX_PAYLOAD_ENTRIES(
|
||||
* (int, index, TYPE_INT, "integer value"),
|
||||
* (double, dpfloat, TYPE_DOUBLE, "fp64 value"),
|
||||
* (const char, (text, 24), TYPE_CSTRING, "text", NULL, 24)
|
||||
* )
|
||||
* )
|
||||
*
|
||||
* The first three fields (C type, member, entry type) of each entry are required.
|
||||
* A fixed-size array or string requires a special notation with the member
|
||||
* name and the size separated by comma and put into brackets (see last entry
|
||||
* in the example).
|
||||
*
|
||||
* The optional parameters are positional (only allowed to be passed in the
|
||||
* predefined order). A maximum of 16 schema entries is supported.
|
||||
*
|
||||
* @param struct_id The name of the struct.
|
||||
* @param schema_name (Optional 1) name of the payload schema. Default is `NULL`.
|
||||
* @param prefix (Optional 2) prefix before the schema and attributes variables,
|
||||
* e.g. `static const`. Leave this empty, if no prefix is desired.
|
||||
* @param schema_flags (Optional 3) flags to augment the payload schema.
|
||||
* Default is `NVTX_PAYLOAD_SCHEMA_FLAG_NONE`.
|
||||
* @param schema_id (Optional 4) User-defined payload schema ID.
|
||||
* @param entries (Mandatory) The schema entries. This is always the last
|
||||
* parameter to the macro.
|
||||
*/
|
||||
#define NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, ...) \
|
||||
_NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, __VA_ARGS__)
|
||||
|
||||
/**
|
||||
* \brief Initialize and register the NVTX binary payload schema.
|
||||
*
|
||||
* This does essentially the same as `NVTX_DEFINE_STRUCT_WITH_SCHEMA`, but in
|
||||
* addition the schema is registered. The schema ID will be defined as follows:
|
||||
* `const uint64_t struct_id##_schemaId`.
|
||||
*
|
||||
* @param domain The NVTX domain handle (0 for default domain).
|
||||
* All other parameters are similar to `NVTX_DEFINE_STRUCT_WITH_SCHEMA`.
|
||||
*/
|
||||
#define NVTX_DEFINE_STRUCT_WITH_SCHEMA_AND_REGISTER(domain, struct_id, ...) \
|
||||
_NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, __VA_ARGS__) \
|
||||
const uint64_t struct_id##_schemaId = nvtxPayloadSchemaRegister(domain, &struct_id##Attr);
|
||||
|
||||
/**
|
||||
* \brief Define payload schema for an existing `struct` and register the schema.
|
||||
*
|
||||
* This does essentially the same as `NVTX_DEFINE_SCHEMA_FOR_STRUCT`, but in
|
||||
* addition, the schema is registered and `uint64_t struct_id##_schemaId` set.
|
||||
*
|
||||
* @param domain The NVTX domain handle (0 for default domain).
|
||||
* All other parameters are similar to `NVTX_DEFINE_SCHEMA_FOR_STRUCT`.
|
||||
*/
|
||||
#define NVTX_DEFINE_SCHEMA_FOR_STRUCT_AND_REGISTER(domain, struct_id, ...) \
|
||||
_NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, __VA_ARGS__) \
|
||||
const uint64_t struct_id##_schemaId = nvtxPayloadSchemaRegister(domain, &struct_id##Attr);
|
||||
|
||||
/**
|
||||
* \brief Create a type definition for the given struct ID and members.
|
||||
*
|
||||
* This is a convenience macro. A normal `typedef` can be used instead.
|
||||
*
|
||||
* Example usage:
|
||||
* NVTX_DEFINE_STRUCT(your_struct,
|
||||
* (double, fp64),
|
||||
* (uint8_t, u8),
|
||||
* (float, fp32[3])
|
||||
* )
|
||||
*
|
||||
* @param struct_id The name of the struct.
|
||||
* @param members The members of the struct.
|
||||
*/
|
||||
#define NVTX_DEFINE_STRUCT(struct_id, ...) \
|
||||
_NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, __VA_ARGS__)
|
||||
|
||||
/**
|
||||
* \brief Register an NVTX binary payload schema.
|
||||
*
|
||||
* This is a convenience macro, which takes the same `struct_id` that has been
|
||||
* used in other helper macros. Instead, `nvtxPayloadSchemaRegister` can also be
|
||||
* used, but `&struct_id##Attr` has to be passed.
|
||||
*
* Note: the expansion already ends with a semicolon. The macro can therefore
* be used as a full statement or as the initializer of a declaration, but not
* nested inside a larger expression (e.g. as a function-call argument).
*
|
||||
* @param domain The NVTX domain handle (0 for default domain).
|
||||
* @param struct_id The name of the struct.
|
||||
*
|
||||
* @return NVTX schema ID
|
||||
*/
|
||||
#define NVTX_PAYLOAD_SCHEMA_REGISTER(domain, struct_id) \
|
||||
nvtxPayloadSchemaRegister(domain, &struct_id##Attr);
|
||||
|
||||
@@ -0,0 +1,88 @@
|
||||
/*
|
||||
* Copyright 2024 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
/**
|
||||
* NVTX semantic headers require nvToolsExtPayload.h to be included beforehand.
|
||||
*/
|
||||
|
||||
#ifndef NVTX_SEMANTIC_ID_COUNTERS_V1
|
||||
#define NVTX_SEMANTIC_ID_COUNTERS_V1 2
|
||||
|
||||
/**
|
||||
* Flags to extend the semantics of counters.
|
||||
*/
|
||||
#define NVTX_COUNTERS_FLAGS_NONE 0
|
||||
|
||||
/**
|
||||
* Convert the fixed point value to a normalized floating point value.
|
||||
* Unsigned [0f : 1f] or signed [-1f : 1f] is determined by the underlying type
|
||||
* this flag is applied to.
|
||||
*/
|
||||
#define NVTX_COUNTERS_FLAG_NORMALIZE (1 << 1)
|
||||
|
||||
/**
|
||||
* Visual tools should apply scale and limits when graphing.
|
||||
*/
|
||||
#define NVTX_COUNTERS_FLAG_LIMIT_MIN (1 << 2)
|
||||
#define NVTX_COUNTERS_FLAG_LIMIT_MAX (1 << 3)
|
||||
#define NVTX_COUNTERS_FLAG_LIMITS \
|
||||
(NVTX_COUNTERS_FLAG_LIMIT_MIN | NVTX_COUNTERS_FLAG_LIMIT_MAX)
|
||||
|
||||
/**
|
||||
* Counter time scopes.
|
||||
*/
|
||||
#define NVTX_COUNTERS_FLAG_TIMESCOPE_POINT (1 << 5)
|
||||
#define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_LAST (2 << 5)
|
||||
#define NVTX_COUNTERS_FLAG_TIMESCOPE_UNTIL_NEXT (3 << 5)
|
||||
#define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_START (4 << 5)
|
||||
|
||||
/**
|
||||
* Counter value types.
|
||||
*/
|
||||
#define NVTX_COUNTERS_FLAG_VALUETYPE_ABSOLUTE (1 << 10)
|
||||
/** Delta to previous value of same counter type. */
|
||||
#define NVTX_COUNTERS_FLAG_VALUETYPE_DELTA (2 << 10)
|
||||
|
||||
/**
|
||||
* Datatypes for the `limits` union.
|
||||
*/
|
||||
#define NVTX_COUNTERS_LIMIT_I64 0
|
||||
#define NVTX_COUNTERS_LIMIT_U64 1
|
||||
#define NVTX_COUNTERS_LIMIT_F64 2
|
||||
|
||||
/**
|
||||
* \brief Specify counter semantics.
*
* The `flags` field takes the `NVTX_COUNTERS_FLAG_*` values and `limitType`
* takes one of the `NVTX_COUNTERS_LIMIT_*` values defined in this header.
|
||||
*/
|
||||
typedef struct nvtxSemanticsCounter_v1 {
|
||||
/** Header of the semantic extensions (with identifier, version, etc.). */
|
||||
struct nvtxSemanticsHeader_v1 header;
|
||||
|
||||
/** Flags to provide more context about the counter value (`NVTX_COUNTERS_FLAG_*`). */
|
||||
uint64_t flags;
|
||||
|
||||
/** Unit of the counter value (case-insensitive). */
|
||||
const char* unit;
|
||||
|
||||
/** Numerator of the unit scale. Should be 1 if not used. */
|
||||
uint64_t unitScaleNumerator;
|
||||
|
||||
/** Denominator of the unit scale. Should be 1 if not used. */
|
||||
uint64_t unitScaleDenominator;
|
||||
|
||||
/** Determines the used union member. Use defines `NVTX_COUNTERS_LIMIT_*`. */
|
||||
int64_t limitType;
|
||||
|
||||
/** Graph limits {minimum, maximum}; the active member is selected by `limitType`. */
|
||||
union limits_t {
|
||||
int64_t i64[2];
|
||||
uint64_t u64[2];
|
||||
double d[2];
|
||||
} limits;
|
||||
} nvtxSemanticsCounter_t;
|
||||
|
||||
#endif /* NVTX_SEMANTIC_ID_COUNTERS_V1 */
|
||||
@@ -0,0 +1,30 @@
|
||||
/*
|
||||
* Copyright 2024 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
/**
|
||||
* NVTX semantic headers require nvToolsExtPayload.h to be included beforehand.
|
||||
*/
|
||||
|
||||
#ifndef NVTX_SEMANTIC_ID_SCOPE_V1
|
||||
#define NVTX_SEMANTIC_ID_SCOPE_V1 1
|
||||
|
||||
/**
|
||||
* \brief Specify the NVTX scope for a payload entry.
|
||||
*
|
||||
* This allows the scope to be set for a specific value or counter in a payload.
|
||||
* The scope must be known at schema registration time.
|
||||
*/
|
||||
typedef struct nvtxSemanticsScope_v1
|
||||
{
|
||||
struct nvtxSemanticsHeader_v1 header;
|
||||
|
||||
/** Specifies the scope of a payload entry, e.g. a counter or timestamp. */
|
||||
uint64_t scopeId;
|
||||
} nvtxSemanticsScope_t;
|
||||
|
||||
#endif /* NVTX_SEMANTIC_ID_SCOPE_V1 */
|
||||
@@ -15,23 +15,23 @@
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* \cond SHOW_HIDDEN
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
#define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxSyncUserAttributes_v0) ) )
|
||||
/** \endcond */
|
||||
|
||||
|
||||
/**
|
||||
/**
|
||||
* \page PAGE_SYNCHRONIZATION Synchronization
|
||||
*
|
||||
* This section covers a subset of the API that allow users to track additional
|
||||
* synchronization details of their application. Naming OS synchronization primitives
|
||||
* may allow users to better understand the data collected by traced synchronization
|
||||
* synchronization details of their application. Naming OS synchronization primitives
|
||||
* may allow users to better understand the data collected by traced synchronization
|
||||
* APIs. Additionally, a user defined synchronization object can allow the users to
|
||||
* tell the tools when the user is building their own synchronization system
|
||||
* that does not rely on the OS to provide behaviors and instead uses techniques like
|
||||
* atomic operations and spinlocks.
|
||||
* atomic operations and spinlocks.
|
||||
*
|
||||
* See module \ref SYNCHRONIZATION for details.
|
||||
*
|
||||
@@ -59,7 +59,7 @@ extern "C" {
|
||||
*
|
||||
* bool Lock() {
|
||||
* nvtxDomainSyncUserAcquireStart(hSync);
|
||||
* bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic
|
||||
* bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic
|
||||
|
||||
* if (acquired) {
|
||||
* nvtxDomainSyncUserAcquireSuccess(hSync);
|
||||
@@ -76,12 +76,12 @@ extern "C" {
|
||||
* }
|
||||
* };
|
||||
* \endcode
|
||||
*
|
||||
*
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* \cond SHOW_HIDDEN
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief Used to build a non-colliding value for resource types separated class
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
@@ -154,8 +154,8 @@ typedef struct nvtxSyncUser* nvtxSyncUser_t;
|
||||
/** \brief User Defined Synchronization Object Attributes Structure.
|
||||
* \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE
|
||||
*
|
||||
* This structure is used to describe the attributes of a user defined synchronization
|
||||
* object. The layout of the structure is defined by a specific version of the tools
|
||||
* This structure is used to describe the attributes of a user defined synchronization
|
||||
* object. The layout of the structure is defined by a specific version of the tools
|
||||
* extension library and can change between different versions of the Tools Extension
|
||||
* library.
|
||||
*
|
||||
@@ -259,7 +259,7 @@ typedef struct nvtxSyncUserAttributes_v0
|
||||
typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Create a user defined synchronization object
|
||||
/** \brief Create a user defined synchronization object
|
||||
* This is used to track non-OS synchronization working with spinlocks and atomics
|
||||
*
|
||||
* \param domain - Domain to own the resource
|
||||
@@ -317,7 +317,7 @@ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Signal to tools of failure in acquiring a user defined synchronization object
|
||||
* This should be called after \ref nvtxDomainSyncUserAcquireStart
|
||||
*
|
||||
*
|
||||
* \param handle - A handle to the object to operate on.
|
||||
*
|
||||
* \sa
|
||||
@@ -374,7 +374,7 @@ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle);
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#ifndef NVTX_NO_IMPL
|
||||
#define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot included directly */
|
||||
#define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot be included directly */
|
||||
#include "nvtxDetail/nvtxImplSync_v3.h"
|
||||
#undef NVTX_IMPL_GUARD_SYNC
|
||||
#endif /*NVTX_NO_IMPL*/
|
||||
|
||||
@@ -12,6 +12,11 @@
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
/* Temporary helper #defines, #undef'ed at end of header */
|
||||
@@ -1937,9 +1942,9 @@ class event_attributes {
|
||||
0, // color value
|
||||
NVTX_PAYLOAD_UNKNOWN, // payload type
|
||||
0, // reserved 4B
|
||||
0, // payload value (union)
|
||||
{0}, // payload value (union)
|
||||
NVTX_MESSAGE_UNKNOWN, // message type
|
||||
0 // message value (union)
|
||||
{0} // message value (union)
|
||||
}
|
||||
{
|
||||
}
|
||||
@@ -2003,20 +2008,20 @@ class event_attributes {
|
||||
attributes_.messageType = m.get_type();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Variadic constructor where the first argument is a binary payload.
|
||||
/**
|
||||
* @brief Variadic constructor where the first argument is an extended payload.
|
||||
*
|
||||
* Sets the value of the `EventAttribute`s message based on `m` and forwards
|
||||
* Sets the `ullValue` of the `EventAttribute`s payload and forwards
|
||||
* the remaining variadic parameter pack to the next constructor.
|
||||
*
|
||||
*/
|
||||
template <typename... Args>
|
||||
NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(nvtxPayloadData_t const* bpl, Args const&... args) noexcept
|
||||
NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(nvtxPayloadData_t const* p, Args const&... args) noexcept
|
||||
: event_attributes(args...)
|
||||
{
|
||||
attributes_.payloadType = NVTX_PAYLOAD_TYPE_BINARY;
|
||||
attributes_.payloadType = NVTX_PAYLOAD_TYPE_EXT;
|
||||
attributes_.reserved0 = 1; // NCCL uses only a single binary payload per event.
|
||||
attributes_.payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(bpl);
|
||||
attributes_.payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p);
|
||||
}
|
||||
|
||||
~event_attributes() = default;
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
/*
|
||||
* Copyright 2023 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_EXT_HELPER_MACROS_H
|
||||
#define NVTX_EXT_HELPER_MACROS_H
|
||||
|
||||
/* Combine tokens */
|
||||
#define _NVTX_EXT_CONCAT(a, b) a##b
|
||||
#define NVTX_EXT_CONCAT(a, b) _NVTX_EXT_CONCAT(a, b)
|
||||
|
||||
/*
 * Resolves to the number of arguments passed (1 to 15).
 *
 * The count is selected by position: the caller's arguments are placed in
 * front of the descending list 15..1, which shifts the matching count into the
 * 16th parameter of NVTX_EXT_SELECTA16. An empty argument list also resolves
 * to 1, and with 16 or more arguments the result is the 16th argument itself
 * rather than a count.
 */
|
||||
#define NVTX_EXT_NUM_ARGS(...) \
|
||||
NVTX_EXT_SELECTA16(__VA_ARGS__, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, throwaway)
|
||||
#define NVTX_EXT_SELECTA16(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, ...) a16
|
||||
|
||||
/*
 * Cast argument(s) to void to prevent unused variable warnings.
 * Each expansion is a sequence of complete statements (it already ends with
 * ';'). Variants exist for one to four arguments only.
 */
|
||||
#define _NVTX_EXT_VOIDIFY1(a1) (void)a1;
|
||||
#define _NVTX_EXT_VOIDIFY2(a1, a2) (void)a1; (void)a2;
|
||||
#define _NVTX_EXT_VOIDIFY3(a1, a2, a3) (void)a1; (void)a2; (void)a3;
|
||||
#define _NVTX_EXT_VOIDIFY4(a1, a2, a3, a4) (void)a1; (void)a2; (void)a3; (void)a4;
|
||||
|
||||
/*
 * Mark function arguments as unused.
 * Dispatches to _NVTX_EXT_VOIDIFY<N>, where N is the argument count, so at
 * most four arguments are supported with the helpers defined above.
 */
|
||||
#define NVTX_EXT_HELPER_UNUSED_ARGS(...) \
|
||||
NVTX_EXT_CONCAT(_NVTX_EXT_VOIDIFY, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
|
||||
|
||||
#endif /* NVTX_EXT_HELPER_MACROS_H */
|
||||
+28
-22
@@ -14,7 +14,12 @@
|
||||
#define NVTX_EXT_IMPL_H
|
||||
/* ---- Include required platform headers ---- */
|
||||
|
||||
#if defined(_WIN32)
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <wchar.h>
|
||||
|
||||
#if defined(_WIN32)
|
||||
|
||||
#include <Windows.h>
|
||||
|
||||
@@ -22,27 +27,19 @@
|
||||
#include <unistd.h>
|
||||
|
||||
#if defined(__ANDROID__)
|
||||
#include <android/api-level.h>
|
||||
#include <android/api-level.h>
|
||||
#endif
|
||||
|
||||
#if defined(__linux__) || defined(__CYGWIN__)
|
||||
#include <sched.h>
|
||||
#endif
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <limits.h>
|
||||
#include <dlfcn.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <pthread.h>
|
||||
#include <stdlib.h>
|
||||
#include <wchar.h>
|
||||
|
||||
#endif
|
||||
|
||||
@@ -66,26 +63,35 @@
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
// #ifdef __GNUC__
|
||||
// #pragma GCC visibility push(hidden)
|
||||
// #endif
|
||||
|
||||
/*
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC visibility push(hidden)
|
||||
#endif
|
||||
*/
|
||||
#define NVTX_EXTENSION_FRESH 0
|
||||
#define NVTX_EXTENSION_DISABLED 1
|
||||
#define NVTX_EXTENSION_STARTING 2
|
||||
#define NVTX_EXTENSION_LOADED 3
|
||||
|
||||
NVTX_LINKONCE_DEFINE_GLOBAL NvtxExtInitializeInjectionFunc_t NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = (NvtxExtInitializeInjectionFunc_t)0;
|
||||
/* Function slots are local to each extension */
|
||||
typedef struct nvtxExtGlobals1_t
|
||||
{
|
||||
NvtxExtInitializeInjectionFunc_t injectionFnPtr;
|
||||
} nvtxExtGlobals1_t;
|
||||
|
||||
NVTX_LINKONCE_DEFINE_GLOBAL nvtxExtGlobals1_t NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1) =
|
||||
{
|
||||
(NvtxExtInitializeInjectionFunc_t)0
|
||||
};
|
||||
|
||||
#define NVTX_EXT_INIT_GUARD
|
||||
#include "nvtxExtInit.h"
|
||||
#undef NVTX_EXT_INIT_GUARD
|
||||
|
||||
// #ifdef __GNUC__
|
||||
// #pragma GCC visibility pop
|
||||
// #endif
|
||||
|
||||
/*
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC visibility pop
|
||||
#endif
|
||||
*/
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
@@ -0,0 +1,148 @@
|
||||
/*
|
||||
* Copyright 2023-2024 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_EXT_IMPL_COUNTERS_GUARD
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExtCounters.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
#define NVTX_EXT_IMPL_GUARD
|
||||
#include "nvtxExtImpl.h"
|
||||
#undef NVTX_EXT_IMPL_GUARD
|
||||
|
||||
#ifndef NVTX_EXT_IMPL_COUNTERS_V1
|
||||
#define NVTX_EXT_IMPL_COUNTERS_V1
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* Macros to create versioned symbols. */
|
||||
#define NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \
|
||||
NAME##_v##VERSION##_bpl##COMPATID
|
||||
#define NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \
|
||||
NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID)
|
||||
#define NVTX_EXT_COUNTERS_VERSIONED_ID(NAME) \
|
||||
NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COUNTERS_COMPATID)
|
||||
|
||||
#ifdef NVTX_DISABLE
|
||||
|
||||
#include "nvtxExtHelperMacros.h"
|
||||
|
||||
#define NVTX_EXT_COUNTERS_IMPL_FN_V1(ret_val, fn_name, signature, arg_names) \
|
||||
ret_val fn_name signature { \
|
||||
NVTX_EXT_HELPER_UNUSED_ARGS arg_names \
|
||||
return ((ret_val)(intptr_t)-1); \
|
||||
}
|
||||
|
||||
#else /* NVTX_DISABLE */
|
||||
|
||||
/*
|
||||
* Function slots for the counters extension. First entry is the module state,
|
||||
* initialized to `0` (`NVTX_EXTENSION_FRESH`).
|
||||
*/
|
||||
#define NVTX_EXT_COUNTERS_SLOT_COUNT 63
|
||||
NVTX_LINKONCE_DEFINE_GLOBAL intptr_t
|
||||
NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX_EXT_COUNTERS_SLOT_COUNT + 1]
|
||||
= {0};
|
||||
|
||||
/* Avoid warnings about missing prototype. */
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)(void);
|
||||
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)()
|
||||
{
|
||||
intptr_t* fnSlots = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots) + 1;
|
||||
nvtxExtModuleSegment_t segment = {
|
||||
0, /* unused (only one segment) */
|
||||
NVTX_EXT_COUNTERS_SLOT_COUNT,
|
||||
fnSlots
|
||||
};
|
||||
|
||||
nvtxExtModuleInfo_t module = {
|
||||
NVTX_VERSION, sizeof(nvtxExtModuleInfo_t),
|
||||
NVTX_EXT_COUNTERS_MODULEID, NVTX_EXT_COUNTERS_COMPATID,
|
||||
1, &segment, /* number of segments, segments */
|
||||
NULL, /* no export function needed */
|
||||
/* bake type sizes and alignment information into program binary */
|
||||
NULL
|
||||
};
|
||||
|
||||
NVTX_INFO( "%s\n", __FUNCTION__ );
|
||||
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module,
|
||||
NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots));
|
||||
}
|
||||
|
||||
#define NVTX_EXT_COUNTERS_IMPL_FN_V1(ret_type, fn_name, signature, arg_names) \
|
||||
typedef ret_type (*fn_name##_impl_fntype)signature; \
|
||||
NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \
|
||||
intptr_t slot = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
|
||||
if (slot != NVTX_EXTENSION_DISABLED) { \
|
||||
if (slot != NVTX_EXTENSION_FRESH) { \
|
||||
return (*(fn_name##_impl_fntype)slot) arg_names; \
|
||||
} else { \
|
||||
NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)(); \
|
||||
/* Re-read function slot after extension initialization. */ \
|
||||
slot = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
|
||||
if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \
|
||||
return (*(fn_name##_impl_fntype)slot) arg_names; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
NVTX_EXT_FN_RETURN_INVALID(ret_type) \
|
||||
}
|
||||
|
||||
#endif /*NVTX_DISABLE*/
|
||||
|
||||
/* Non-void functions. */
|
||||
#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1);
|
||||
|
||||
NVTX_EXT_COUNTERS_IMPL_FN_V1(nvtxCountersHandle_t, nvtxCountersRegister,
|
||||
(nvtxDomainHandle_t domain, const nvtxCountersAttr_t* attr),
|
||||
(domain, attr))
|
||||
|
||||
#undef NVTX_EXT_FN_RETURN_INVALID
|
||||
/* END: Non-void functions. */
|
||||
|
||||
/* void functions. */
|
||||
#define NVTX_EXT_FN_RETURN_INVALID(rtype)
|
||||
#define return
|
||||
|
||||
NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleInt64,
|
||||
(nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, int64_t value),
|
||||
(domain, hCounter, value))
|
||||
|
||||
NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleFloat64,
|
||||
(nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, double value),
|
||||
(domain, hCounter, value))
|
||||
|
||||
NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSample,
|
||||
(nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, void* values, size_t size),
|
||||
(domain, hCounter, values, size))
|
||||
|
||||
NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleNoValue,
|
||||
(nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, uint8_t reason),
|
||||
(domain, hCounter, reason))
|
||||
|
||||
NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSubmitBatch,
|
||||
(nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounters,
|
||||
const void* counters, size_t size), (domain, hCounters, counters, size))
|
||||
|
||||
NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSubmitBatchEx,
|
||||
(nvtxDomainHandle_t domain, const nvtxCountersBatch_t* countersBatch),
|
||||
(domain, countersBatch))
|
||||
|
||||
#undef return
|
||||
#undef NVTX_EXT_FN_RETURN_INVALID
|
||||
/* END: void functions. */
|
||||
|
||||
/* Keep NVTX_EXT_COUNTERS_IMPL_FN_V1 defined for a future version of this extension. */
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#endif /* NVTX_EXT_IMPL_COUNTERS_V1 */
|
||||
@@ -0,0 +1,74 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_EXT_IMPL_MEM_CUDART_GUARD
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExtMemCudaRt.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#ifdef NVTX_DISABLE
|
||||
|
||||
#include "nvtxExtHelperMacros.h"
|
||||
|
||||
#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \
|
||||
ret_val fn_name signature { \
|
||||
NVTX_EXT_HELPER_UNUSED_ARGS arg_names \
|
||||
return ((ret_val)(intptr_t)-1); \
|
||||
}
|
||||
|
||||
#else /* NVTX_DISABLE */
|
||||
|
||||
#define NVTX_EXT_FN_IMPL(ret_type, fn_name, signature, arg_names) \
|
||||
typedef ret_type ( * fn_name##_impl_fntype )signature; \
|
||||
NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \
|
||||
intptr_t slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
|
||||
if (slot != NVTX_EXTENSION_DISABLED) { \
|
||||
if (slot != NVTX_EXTENSION_FRESH) { \
|
||||
return (*(fn_name##_impl_fntype)slot) arg_names; \
|
||||
} else { \
|
||||
NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(); \
|
||||
/* Re-read function slot after extension initialization. */ \
|
||||
slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
|
||||
if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \
|
||||
return (*(fn_name##_impl_fntype)slot) arg_names; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
NVTX_EXT_FN_RETURN_INVALID(ret_type) \
|
||||
}
|
||||
|
||||
#endif /*NVTX_DISABLE*/
|
||||
|
||||
/* Non-void functions. */
|
||||
#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1);
|
||||
|
||||
NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetProcessWidePermissions, (nvtxDomainHandle_t domain), (domain))
|
||||
|
||||
NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetDeviceWidePermissions, (nvtxDomainHandle_t domain, int device), (domain, device))
|
||||
|
||||
#undef NVTX_EXT_FN_RETURN_INVALID
|
||||
/* END: Non-void functions. */
|
||||
|
||||
/* void functions. */
|
||||
#define NVTX_EXT_FN_RETURN_INVALID(rtype)
|
||||
#define return
|
||||
|
||||
NVTX_EXT_FN_IMPL(void, nvtxMemCudaSetPeerAccess, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, int devicePeer, uint32_t flags), (domain, permissions, devicePeer, flags))
|
||||
|
||||
#undef return
|
||||
#undef NVTX_EXT_FN_RETURN_INVALID
|
||||
/* END: void functions. */
|
||||
|
||||
#undef NVTX_EXT_FN_IMPL
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
@@ -0,0 +1,133 @@
|
||||
/*
|
||||
* Copyright 2009-2020,2023 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_EXT_IMPL_MEM_GUARD
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExtMem.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
#define NVTX_EXT_IMPL_GUARD
|
||||
#include "nvtxExtImpl.h"
|
||||
#undef NVTX_EXT_IMPL_GUARD
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#define NVTXMEM_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) NAME##_v##VERSION##_mem##COMPATID
|
||||
#define NVTXMEM_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) NVTXMEM_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID)
|
||||
#define NVTX_EXT_MEM_VERSIONED_ID(NAME) NVTXMEM_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COMPATID_MEM)
|
||||
|
||||
#ifdef NVTX_DISABLE
|
||||
|
||||
#include "nvtxExtHelperMacros.h"
|
||||
|
||||
#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \
|
||||
ret_val fn_name signature { \
|
||||
NVTX_EXT_HELPER_UNUSED_ARGS arg_names \
|
||||
return ((ret_val)(intptr_t)-1); \
|
||||
}
|
||||
|
||||
#else /* NVTX_DISABLE */
|
||||
|
||||
/*
|
||||
* Function slots for the memory extension. First entry is the module
|
||||
* state, initialized to `0` (`NVTX_EXTENSION_FRESH`).
|
||||
*/
|
||||
NVTX_LINKONCE_DEFINE_GLOBAL intptr_t
|
||||
NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_MEM_FN_NUM + 2]
|
||||
= {0};
|
||||
|
||||
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)()
|
||||
{
|
||||
intptr_t* fnSlots = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots) + 1;
|
||||
nvtxExtModuleSegment_t segment = {
|
||||
0, /* unused (only one segment) */
|
||||
NVTX3EXT_CBID_MEM_FN_NUM,
|
||||
fnSlots
|
||||
};
|
||||
|
||||
nvtxExtModuleInfo_t module = {
|
||||
NVTX_VERSION, sizeof(nvtxExtModuleInfo_t),
|
||||
NVTX_EXT_MODULEID_MEM, NVTX_EXT_COMPATID_MEM,
|
||||
1, &segment,
|
||||
NULL, /* no export function needed */
|
||||
NULL
|
||||
};
|
||||
|
||||
NVTX_INFO( "%s\n", __FUNCTION__ );
|
||||
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module,
|
||||
NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots));
|
||||
}
|
||||
|
||||
/*
 * Generate the public entry point `fn_name` of the memory extension as a
 * dispatcher over the extension's slot array.
 *
 *   ret_type   return type of the generated function
 *   fn_name    function name; also selects the slot via NVTX3EXT_CBID_<fn_name>
 *   signature  parenthesized parameter list of the function
 *   arg_names  parenthesized argument list forwarded to the implementation
 *
 * The slot is read at index `NVTX3EXT_CBID_<fn_name> + 1` because the first
 * array element is the module state. A slot that is neither
 * NVTX_EXTENSION_DISABLED nor NVTX_EXTENSION_FRESH is treated as a function
 * pointer and called. A FRESH slot triggers nvtxExtMemInitOnce(), after which
 * the slot is read again and called only if it now holds a function pointer.
 * In every other case NVTX_EXT_FN_RETURN_INVALID(ret_type) is expanded; that
 * macro must be defined at the point where NVTX_EXT_FN_IMPL is used.
 *
 * NOTE: the void functions are generated while `return` is #defined to
 * nothing, so the `return` statements below become plain calls. The nested
 * if/else structure guarantees that nothing else executes after such a call;
 * do not rewrite it with early returns.
 */
#define NVTX_EXT_FN_IMPL(ret_type, fn_name, signature, arg_names) \
|
||||
typedef ret_type ( * fn_name##_impl_fntype )signature; \
|
||||
NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \
|
||||
intptr_t slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
|
||||
if (slot != NVTX_EXTENSION_DISABLED) { \
|
||||
if (slot != NVTX_EXTENSION_FRESH) { \
|
||||
return (*(fn_name##_impl_fntype)slot) arg_names; \
|
||||
} else { \
|
||||
NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(); \
|
||||
/* Re-read function slot after extension initialization. */ \
|
||||
slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
|
||||
if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \
|
||||
return (*(fn_name##_impl_fntype)slot) arg_names; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
NVTX_EXT_FN_RETURN_INVALID(ret_type) \
|
||||
}
|
||||
|
||||
#endif /*NVTX_DISABLE*/
|
||||
|
||||
/* Non-void functions. */
|
||||
#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1);
|
||||
|
||||
NVTX_EXT_FN_IMPL(nvtxMemHeapHandle_t, nvtxMemHeapRegister, (nvtxDomainHandle_t domain, nvtxMemHeapDesc_t const* desc), (domain, desc))
|
||||
|
||||
NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemPermissionsCreate, (nvtxDomainHandle_t domain, int32_t creationflags), (domain, creationflags))
|
||||
|
||||
#undef NVTX_EXT_FN_RETURN_INVALID
|
||||
/* END: Non-void functions. */
|
||||
|
||||
/* void functions. */
|
||||
#define NVTX_EXT_FN_RETURN_INVALID(rtype)
|
||||
#define return
|
||||
|
||||
NVTX_EXT_FN_IMPL(void, nvtxMemHeapUnregister, (nvtxDomainHandle_t domain, nvtxMemHeapHandle_t heap), (domain, heap))
|
||||
|
||||
NVTX_EXT_FN_IMPL(void, nvtxMemHeapReset, (nvtxDomainHandle_t domain, nvtxMemHeapHandle_t heap), (domain, heap))
|
||||
|
||||
NVTX_EXT_FN_IMPL(void, nvtxMemRegionsRegister, (nvtxDomainHandle_t domain, nvtxMemRegionsRegisterBatch_t const* desc), (domain, desc))
|
||||
|
||||
NVTX_EXT_FN_IMPL(void, nvtxMemRegionsResize, (nvtxDomainHandle_t domain,nvtxMemRegionsResizeBatch_t const* desc), (domain, desc))
|
||||
|
||||
NVTX_EXT_FN_IMPL(void, nvtxMemRegionsUnregister, (nvtxDomainHandle_t domain,nvtxMemRegionsUnregisterBatch_t const* desc), (domain, desc))
|
||||
|
||||
NVTX_EXT_FN_IMPL(void, nvtxMemRegionsName, (nvtxDomainHandle_t domain,nvtxMemRegionsNameBatch_t const* desc), (domain, desc))
|
||||
|
||||
NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsAssign, (nvtxDomainHandle_t domain,nvtxMemPermissionsAssignBatch_t const* desc), (domain, desc))
|
||||
|
||||
NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsDestroy, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions), (domain, permissions))
|
||||
|
||||
NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsReset, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions), (domain, permissions))
|
||||
|
||||
NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsBind, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, uint32_t bindScope, uint32_t bindFlags), (domain, permissions, bindScope, bindFlags))
|
||||
|
||||
NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsUnbind, (nvtxDomainHandle_t domain, uint32_t bindScope), (domain, bindScope))
|
||||
|
||||
#undef return
|
||||
#undef NVTX_EXT_FN_RETURN_INVALID
|
||||
/* END: void functions. */
|
||||
|
||||
#undef NVTX_EXT_FN_IMPL
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
@@ -0,0 +1,155 @@
|
||||
/*
|
||||
* Copyright 2021-2023 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
#define NVTX_EXT_IMPL_GUARD
|
||||
#include "nvtxExtImpl.h"
|
||||
#undef NVTX_EXT_IMPL_GUARD
|
||||
|
||||
#ifndef NVTX_EXT_IMPL_PAYLOAD_V1
|
||||
#define NVTX_EXT_IMPL_PAYLOAD_V1
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* Macros to create versioned symbols. */
|
||||
#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \
|
||||
NAME##_v##VERSION##_bpl##COMPATID
|
||||
#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \
|
||||
NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID)
|
||||
#define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \
|
||||
NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_PAYLOAD_COMPATID)
|
||||
|
||||
#ifdef NVTX_DISABLE
|
||||
|
||||
#include "nvtxExtHelperMacros.h"
|
||||
|
||||
#define NVTX_EXT_PAYLOAD_IMPL_FN_V1(ret_val, fn_name, signature, arg_names) \
|
||||
ret_val fn_name signature { \
|
||||
NVTX_EXT_HELPER_UNUSED_ARGS arg_names \
|
||||
return ((ret_val)(intptr_t)-1); \
|
||||
}
|
||||
|
||||
#else /* NVTX_DISABLE */
|
||||
|
||||
#include "nvtxExtPayloadTypeInfo.h"
|
||||
|
||||
/*
|
||||
* Function slots for the payload extension. First entry is the module state,
|
||||
* initialized to `0` (`NVTX_EXTENSION_FRESH`).
|
||||
*/
|
||||
#define NVTX_EXT_PAYLOAD_SLOT_COUNT 63
|
||||
NVTX_LINKONCE_DEFINE_GLOBAL intptr_t
|
||||
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX_EXT_PAYLOAD_SLOT_COUNT + 1]
|
||||
= {0};
|
||||
|
||||
/* Avoid warnings about missing prototype. */
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(void);
|
||||
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)()
|
||||
{
|
||||
intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1;
|
||||
nvtxExtModuleSegment_t segment = {
|
||||
0, /* unused (only one segment) */
|
||||
NVTX_EXT_PAYLOAD_SLOT_COUNT,
|
||||
fnSlots
|
||||
};
|
||||
|
||||
nvtxExtModuleInfo_t module = {
|
||||
NVTX_VERSION, sizeof(nvtxExtModuleInfo_t),
|
||||
NVTX_EXT_PAYLOAD_MODULEID, NVTX_EXT_PAYLOAD_COMPATID,
|
||||
1, &segment, /* number of segments, segments */
|
||||
NULL, /* no export function needed */
|
||||
/* bake type sizes and alignment information into program binary */
|
||||
&(NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo))
|
||||
};
|
||||
|
||||
NVTX_INFO( "%s\n", __FUNCTION__ );
|
||||
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module,
|
||||
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots));
|
||||
}
|
||||
|
||||
/*
 * Generate the public entry point `fn_name` of the payload extension as a
 * dispatcher over the extension's slot array.
 *
 *   ret_type   return type of the generated function
 *   fn_name    function name; also selects the slot via NVTX3EXT_CBID_<fn_name>
 *   signature  parenthesized parameter list of the function
 *   arg_names  parenthesized argument list forwarded to the implementation
 *
 * The slot is read at index `NVTX3EXT_CBID_<fn_name> + 1` because the first
 * array element is the module state. A slot that is neither
 * NVTX_EXTENSION_DISABLED nor NVTX_EXTENSION_FRESH is treated as a function
 * pointer and called. A FRESH slot triggers nvtxExtPayloadInitOnce(), after
 * which the slot is read again and called only if it now holds a function
 * pointer. In every other case NVTX_EXT_FN_RETURN_INVALID(ret_type) is
 * expanded; that macro must be defined at the point where this macro is used.
 *
 * NOTE: the void functions are generated while `return` is #defined to
 * nothing, so the `return` statements below become plain calls. The nested
 * if/else structure guarantees that nothing else executes after such a call;
 * do not rewrite it with early returns.
 */
#define NVTX_EXT_PAYLOAD_IMPL_FN_V1(ret_type, fn_name, signature, arg_names) \
|
||||
typedef ret_type (*fn_name##_impl_fntype)signature; \
|
||||
NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \
|
||||
intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
|
||||
if (slot != NVTX_EXTENSION_DISABLED) { \
|
||||
if (slot != NVTX_EXTENSION_FRESH) { \
|
||||
return (*(fn_name##_impl_fntype)slot) arg_names; \
|
||||
} else { \
|
||||
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \
|
||||
/* Re-read function slot after extension initialization. */ \
|
||||
slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
|
||||
if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \
|
||||
return (*(fn_name##_impl_fntype)slot) arg_names; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
NVTX_EXT_FN_RETURN_INVALID(ret_type) \
|
||||
}
|
||||
|
||||
#endif /*NVTX_DISABLE*/
|
||||
|
||||
/* Non-void functions. */
|
||||
#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1);
|
||||
|
||||
NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxPayloadSchemaRegister,
|
||||
(nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr),
|
||||
(domain, attr))
|
||||
|
||||
NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxPayloadEnumRegister,
|
||||
(nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr),
|
||||
(domain, attr))
|
||||
|
||||
NVTX_EXT_PAYLOAD_IMPL_FN_V1(int, nvtxRangePushPayload,
|
||||
(nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count),
|
||||
(domain, payloadData, count))
|
||||
|
||||
NVTX_EXT_PAYLOAD_IMPL_FN_V1(int, nvtxRangePopPayload,
|
||||
(nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count),
|
||||
(domain, payloadData, count))
|
||||
|
||||
NVTX_EXT_PAYLOAD_IMPL_FN_V1(nvtxRangeId_t, nvtxRangeStartPayload,
|
||||
(nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count),
|
||||
(domain, payloadData, count))
|
||||
|
||||
NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint8_t, nvtxDomainIsEnabled, (nvtxDomainHandle_t domain), (domain))
|
||||
|
||||
NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxScopeRegister, (nvtxDomainHandle_t domain,
|
||||
const nvtxScopeAttr_t* attr), (domain, attr))
|
||||
|
||||
#undef NVTX_EXT_FN_RETURN_INVALID
|
||||
/* END: Non-void functions. */
|
||||
|
||||
/* void functions. */
|
||||
#define NVTX_EXT_FN_RETURN_INVALID(rtype)
|
||||
#define return
|
||||
|
||||
NVTX_EXT_PAYLOAD_IMPL_FN_V1(void, nvtxMarkPayload, (nvtxDomainHandle_t domain,
|
||||
const nvtxPayloadData_t* payloadData, size_t count), (domain, payloadData, count))
|
||||
|
||||
NVTX_EXT_PAYLOAD_IMPL_FN_V1(void, nvtxRangeEndPayload, (nvtxDomainHandle_t domain,
|
||||
nvtxRangeId_t id, const nvtxPayloadData_t* payloadData, size_t count),
|
||||
(domain, id, payloadData, count))
|
||||
|
||||
#undef return
|
||||
#undef NVTX_EXT_FN_RETURN_INVALID
|
||||
/* END: void functions. */
|
||||
|
||||
/* Keep NVTX_EXT_PAYLOAD_IMPL_FN_V1 defined for a future version of this extension. */
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#endif /* NVTX_EXT_IMPL_PAYLOAD_V1 */
|
||||
|
||||
+88
-73
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright 2009-2023 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
@@ -22,7 +22,7 @@ extern "C" {
|
||||
#define NVTX_PATHCHAR wchar_t
|
||||
#define NVTX_STR(x) L##x
|
||||
#define NVTX_GETENV _wgetenv
|
||||
#define NVTX_BUFSIZE MAX_PATH
|
||||
#define NVTX_BUFSIZE 16384
|
||||
#define NVTX_DLLHANDLE HMODULE
|
||||
#define NVTX_DLLOPEN(x) LoadLibraryW(x)
|
||||
#define NVTX_DLLFUNC GetProcAddress
|
||||
@@ -39,14 +39,14 @@ extern "C" {
|
||||
#define NVTX_PATHCHAR char
|
||||
#define NVTX_STR(x) x
|
||||
#define NVTX_GETENV getenv
|
||||
#define NVTX_BUFSIZE PATH_MAX
|
||||
#define NVTX_BUFSIZE 16384
|
||||
#define NVTX_DLLHANDLE void*
|
||||
#define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
|
||||
#define NVTX_DLLFUNC dlsym
|
||||
#define NVTX_DLLCLOSE dlclose
|
||||
#define NVTX_YIELD() sched_yield()
|
||||
#define NVTX_MEMBAR() __sync_synchronize()
|
||||
/* Ensure full memory barrier for atomics, to match Windows functions */
|
||||
/* Ensure full memory barrier for atomics, to match Windows functions. */
|
||||
#define NVTX_ATOMIC_WRITE_32(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value)
|
||||
/* Compare-and-swap with the Windows InterlockedCompareExchange argument order:
   if `*address` equals `comparand`, store `exchange`; `old` receives the value
   `*address` held before the operation. The GCC builtin takes
   (ptr, oldval, newval), so `comparand` must be passed before `exchange`. */
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, comparand, exchange)
|
||||
#define NVTX_ATOMIC_WRITE_PTR(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value)
|
||||
@@ -63,7 +63,7 @@ extern "C" {
|
||||
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
|
||||
#endif
|
||||
|
||||
/* Define this to 1 for platforms that support environment variables */
|
||||
/* Define this to 1 for platforms that support environment variables. */
|
||||
/* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */
|
||||
/* Try: #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */
|
||||
#define NVTX_SUPPORT_ENV_VARS 1
|
||||
@@ -72,16 +72,16 @@ extern "C" {
|
||||
#define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1
|
||||
|
||||
/* Injection libraries implementing InitializeInjectionNvtxExtension may be statically linked,
|
||||
* and this will override any dynamic injection. Useful for platforms where dynamic
|
||||
* injection is not available. Since weak symbols not explicitly marked extern are
|
||||
* guaranteed to be initialized to zero if no definitions are found by the linker, the
|
||||
* dynamic injection process proceeds normally if pfnInitializeInjectionNvtx2 is 0. */
|
||||
* which will override any dynamic injection. This is useful for platforms, where dynamic
|
||||
* injection is not available. Since weak symbols, not explicitly marked extern, are
|
||||
* guaranteed to be initialized to zero, if no definitions are found by the linker, the
|
||||
* dynamic injection process proceeds normally, if InitializeInjectionNvtxExtension_fnptr is 0. */
|
||||
#if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__)
|
||||
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1
|
||||
/* To statically inject an NVTX library, define InitializeInjectionNvtxExtension_fnptr as a normal
|
||||
* symbol (not weak) pointing to the implementation of InitializeInjectionNvtxExtension (which
|
||||
* does not need to be named "InitializeInjectionNvtxExtension" as is necessary in a dynamic
|
||||
* injection library. */
|
||||
* symbol (not weak) pointing to the implementation of InitializeInjectionNvtxExtension, which
|
||||
* does not need to be named "InitializeInjectionNvtxExtension" as it is necessary in a dynamic
|
||||
* injection library. */
|
||||
__attribute__((weak)) NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxExtension_fnptr;
|
||||
#else
|
||||
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0
|
||||
@@ -89,35 +89,37 @@ __attribute__((weak)) NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxEx
|
||||
|
||||
|
||||
|
||||
/* This function tries to find or load an NVTX injection library and get the
|
||||
* address of its InitializeInjectionExtension function. If such a function pointer
|
||||
* is found, it is called, and passed the address of this NVTX instance's
|
||||
* nvtxGetExportTable function, so the injection can attach to this instance.
|
||||
* If the initialization fails for any reason, any dynamic library loaded will
|
||||
* be freed, and all NVTX implementation functions will be set to no-ops. If
|
||||
* initialization succeeds, NVTX functions not attached to the tool will be set
|
||||
* to no-ops. This is implemented as one function instead of several small
|
||||
* functions to minimize the number of weak symbols the linker must resolve.
|
||||
* Order of search is:
|
||||
* - Pre-injected library exporting InitializeInjectionNvtxExtension
|
||||
* - Loadable library exporting InitializeInjectionNvtxExtension
|
||||
* - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64)
|
||||
* - On Android, libNvtxInjection??.so within the package (?? is 32 or 64)
|
||||
* - Statically-linked injection library defining InitializeInjectionNvtx2_fnptr
|
||||
*/
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr);
|
||||
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr)
|
||||
/* This function tries to find or load an NVTX injection library and get the address of its
|
||||
* `InitializeInjectionExtension` function. If such a function pointer is found, it is called and
|
||||
* passed the address of this NVTX instance's `nvtxGetExportTable` function, so that the injection
|
||||
* can attach to this instance.
|
||||
* If the initialization fails for any reason, any dynamic library loaded will be freed, and all
|
||||
* NVTX implementation functions will be set to no-ops. If the initialization succeeds, NVTX
|
||||
* functions that are not attached to the tool will be set to no-ops. This is implemented as one
|
||||
* function instead of several small functions to minimize the number of weak symbols the linker
|
||||
* must resolve. The order of search is:
|
||||
* 1) Pre-injected library exporting InitializeInjectionNvtxExtension
|
||||
* 2) Loadable library exporting InitializeInjectionNvtxExtension
|
||||
* - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64)
|
||||
* - On Android, libNvtxInjection??.so within the package (?? is 32 or 64)
|
||||
* 3) Statically-linked injection library defining InitializeInjectionNvtxExtension_fnptr
|
||||
*/
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(
|
||||
NvtxExtInitializeInjectionFunc_t* out_init_fnptr);
|
||||
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(
|
||||
NvtxExtInitializeInjectionFunc_t* out_init_fnptr)
|
||||
{
|
||||
const char* const initFuncName = "InitializeInjectionNvtxExtension";
|
||||
NvtxExtInitializeInjectionFunc_t init_fnptr = (NvtxExtInitializeInjectionFunc_t)0;
|
||||
NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0;
|
||||
|
||||
if(out_init_fnptr){
|
||||
if (out_init_fnptr)
|
||||
{
|
||||
*out_init_fnptr = (NvtxExtInitializeInjectionFunc_t)0;
|
||||
}
|
||||
|
||||
#if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY
|
||||
/* Use POSIX global symbol chain to query for init function from any module */
|
||||
/* Use POSIX global symbol chain to query for init function from any module. */
|
||||
init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(0, initFuncName);
|
||||
#endif
|
||||
|
||||
@@ -127,7 +129,7 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
|
||||
{
|
||||
#if NVTX_SUPPORT_ENV_VARS
|
||||
/* If env var NVTX_INJECTION64_PATH is set, it should contain the path
|
||||
* to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */
|
||||
to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */
|
||||
const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4)
|
||||
? NVTX_STR("NVTX_INJECTION32_PATH")
|
||||
: NVTX_STR("NVTX_INJECTION64_PATH");
|
||||
@@ -135,12 +137,12 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
|
||||
NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE];
|
||||
const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0;
|
||||
|
||||
/* Refer to this variable explicitly in case all references to it are #if'ed out */
|
||||
/* Refer to this variable explicitly in case all references to it are #if'ed out. */
|
||||
(void)injectionLibraryPathBuf;
|
||||
|
||||
#if NVTX_SUPPORT_ENV_VARS
|
||||
/* Disable the warning for getenv & _wgetenv -- this usage is safe because
|
||||
* these functions are not called again before using the returned value. */
|
||||
these functions are not called again before using the returned value. */
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning( push )
|
||||
#pragma warning( disable : 4996 )
|
||||
@@ -188,7 +190,8 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
|
||||
|
||||
pkgName[bytesRead] = 0;
|
||||
|
||||
/* String can contain colon as a process separator. In this case the package name is before the colon. */
|
||||
/* String can contain colon as a process separator. In this case the
|
||||
package name is before the colon. */
|
||||
pos = 0;
|
||||
while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0')
|
||||
{
|
||||
@@ -223,8 +226,8 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
|
||||
}
|
||||
#endif
|
||||
|
||||
/* At this point, injectionLibraryPath is specified if a dynamic
|
||||
* injection library was specified by a tool. */
|
||||
/* At this point, `injectionLibraryPath` is specified if a dynamic
|
||||
injection library was specified by a tool. */
|
||||
if (injectionLibraryPath)
|
||||
{
|
||||
/* Load the injection library */
|
||||
@@ -236,7 +239,7 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Attempt to get the injection library's entry-point */
|
||||
/* Attempt to get the injection library's entry-point. */
|
||||
init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName);
|
||||
if (!init_fnptr)
|
||||
{
|
||||
@@ -252,8 +255,8 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
|
||||
#if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY
|
||||
if (!init_fnptr)
|
||||
{
|
||||
/* Check weakly-defined function pointer. A statically-linked injection can define this as
|
||||
* a normal symbol and it will take precedence over a dynamic injection. */
|
||||
/* Check weakly-defined function pointer. A statically-linked injection can define
|
||||
this as a normal symbol and it will take precedence over a dynamic injection. */
|
||||
if (InitializeInjectionNvtxExtension_fnptr)
|
||||
{
|
||||
init_fnptr = InitializeInjectionNvtxExtension_fnptr;
|
||||
@@ -261,13 +264,13 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
|
||||
}
|
||||
#endif
|
||||
|
||||
if(out_init_fnptr){
|
||||
if (out_init_fnptr)
|
||||
{
|
||||
*out_init_fnptr = init_fnptr;
|
||||
}
|
||||
|
||||
/* At this point, if init_fnptr is not set, then no tool has specified
|
||||
* an NVTX injection library -- return non-success result so all NVTX
|
||||
* API functions will be set to no-ops. */
|
||||
/* At this point, if `init_fnptr` is not set, no tool has specified an NVTX injection library.
|
||||
Non-success result is returned, so that all NVTX API functions will be set to no-ops. */
|
||||
if (!init_fnptr)
|
||||
{
|
||||
return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE;
|
||||
@@ -276,16 +279,19 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
|
||||
return NVTX_SUCCESS;
|
||||
}
|
||||
|
||||
/* Avoid warnings about missing prototypes. */
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) (
|
||||
nvtxExtModuleInfo_t* moduleInfo, intptr_t* moduleState);
|
||||
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) (
|
||||
nvtxExtModuleInfo_t* moduleInfo,
|
||||
intptr_t* moduleState
|
||||
)
|
||||
nvtxExtModuleInfo_t* moduleInfo, intptr_t* moduleState)
|
||||
{
|
||||
intptr_t old;
|
||||
|
||||
NVTX_INFO( "%s\n", __FUNCTION__ );
|
||||
|
||||
if( *moduleState == NVTX_EXTENSION_LOADED) {
|
||||
if (*moduleState == NVTX_EXTENSION_LOADED)
|
||||
{
|
||||
NVTX_INFO("Module loaded\n");
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -296,45 +302,55 @@ NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) (
|
||||
NVTX_EXTENSION_FRESH);
|
||||
if (old == NVTX_EXTENSION_FRESH)
|
||||
{
|
||||
NvtxExtInitializeInjectionFunc_t init_fnptr = NVTX_VERSIONED_IDENTIFIER(injectionFnPtr);
|
||||
NvtxExtInitializeInjectionFunc_t init_fnptr =
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1).injectionFnPtr;
|
||||
int entryPointStatus = 0;
|
||||
int forceAllToNoops = 0;
|
||||
size_t s;
|
||||
|
||||
/* Load & initialize injection library -- it will assign the function pointers */
|
||||
if(init_fnptr == 0){
|
||||
/* Load and initialize injection library, which will assign the function pointers. */
|
||||
if (init_fnptr == 0)
|
||||
{
|
||||
int result = 0;
|
||||
|
||||
/* try to load vanilla NVTX first*/
|
||||
/* Try to load vanilla NVTX first. */
|
||||
nvtxInitialize(0);
|
||||
|
||||
result = NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(&init_fnptr);
|
||||
/*at this point init_fnptr will be either 0 or a real function*/
|
||||
/* At this point `init_fnptr` will be either 0 or a real function. */
|
||||
|
||||
if(result == NVTX_SUCCESS) {
|
||||
NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = init_fnptr;
|
||||
if (result == NVTX_SUCCESS)
|
||||
{
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1).injectionFnPtr = init_fnptr;
|
||||
}
|
||||
else {
|
||||
else
|
||||
{
|
||||
NVTX_ERR("Failed to load injection library\n");
|
||||
}
|
||||
}
|
||||
|
||||
if(init_fnptr != 0) {
|
||||
/* Invoke injection library's initialization function. If it returns
|
||||
* 0 (failure) and a dynamic injection was loaded, unload it. */
|
||||
if (init_fnptr != 0)
|
||||
{
|
||||
/* Invoke injection library's initialization function. If it returns
|
||||
0 (failure) and a dynamic injection was loaded, unload it. */
|
||||
entryPointStatus = init_fnptr(moduleInfo);
|
||||
if (entryPointStatus == 0) {
|
||||
if (entryPointStatus == 0)
|
||||
{
|
||||
NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n");
|
||||
}
|
||||
}
|
||||
|
||||
/* Clean up any functions that are still uninitialized so that they are skipped.
|
||||
* Set all to null if injection init function failed as well.
|
||||
*/
|
||||
/* Clean up any functions that are still uninitialized so that they are
|
||||
skipped. Set all to null if injection init function failed as well. */
|
||||
forceAllToNoops = (init_fnptr == 0) || (entryPointStatus == 0);
|
||||
for(size_t s = 0; s < moduleInfo->segmentsCount; ++s){
|
||||
nvtxExtModuleSegment_t* segment = moduleInfo->segments+s;
|
||||
for(size_t i = 0; i < segment->slotCount; ++i){
|
||||
if(forceAllToNoops || (segment->functionSlots[i] == NVTX_EXTENSION_FRESH)){
|
||||
for (s = 0; s < moduleInfo->segmentsCount; ++s)
|
||||
{
|
||||
nvtxExtModuleSegment_t* segment = moduleInfo->segments + s;
|
||||
size_t i;
|
||||
for (i = 0; i < segment->slotCount; ++i)
|
||||
{
|
||||
if (forceAllToNoops || (segment->functionSlots[i] == NVTX_EXTENSION_FRESH))
|
||||
{
|
||||
segment->functionSlots[i] = NVTX_EXTENSION_DISABLED;
|
||||
}
|
||||
}
|
||||
@@ -342,12 +358,11 @@ NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) (
|
||||
|
||||
NVTX_MEMBAR();
|
||||
|
||||
/* Signal that initialization has finished, so now the assigned function pointers will be used */
|
||||
NVTX_ATOMIC_WRITE_PTR(
|
||||
moduleState,
|
||||
NVTX_EXTENSION_LOADED);
|
||||
/* Signal that initialization has finished and the assigned function
|
||||
pointers will be used. */
|
||||
NVTX_ATOMIC_WRITE_PTR(moduleState, NVTX_EXTENSION_LOADED);
|
||||
}
|
||||
else /* Spin-wait until initialization has finished */
|
||||
else /* Spin-wait until initialization has finished. */
|
||||
{
|
||||
NVTX_MEMBAR();
|
||||
while (*moduleState != NVTX_EXTENSION_LOADED)
|
||||
@@ -0,0 +1,272 @@
|
||||
/*
|
||||
* Copyright 2023 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H
|
||||
#define NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H
|
||||
|
||||
/* General helper macros */
|
||||
#include "nvtxExtHelperMacros.h"
|
||||
|
||||
/* Get variable name with line number (almost unique per file). */
|
||||
#define _NVTX_PAYLOAD_DATA_VAR NVTX_EXT_CONCAT(nvtxDFDB,__LINE__)
|
||||
|
||||
/* Create real arguments from just pasting tokens next to each other. */
|
||||
#define _NVTX_PAYLOAD_PASS_THROUGH(...) __VA_ARGS__
|
||||
|
||||
/* Avoid prefixing `NVTX_PAYLOAD_ENTRY_` for nested payloads. */
|
||||
#define NVTX_PAYLOAD_ENTRY_THROWAWAY
|
||||
#define _NVTX_PAYLOAD_NESTED(id) THROWAWAY id
|
||||
|
||||
/*
|
||||
* Create the NVTX binary payloads schema attributes.
|
||||
*
|
||||
* @param struct_id The name of the struct.
|
||||
* @param schema_name The name of the schema.
|
||||
* @param schema_flags Additional schema flags.
* @param schema_id The value stored in the `schemaId` field of the attributes.
|
||||
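* @param mask_add Fields to be added to the mask. Must be empty or end with a trailing `|`, because it is pasted directly in front of the next mask term.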
|
||||
* @param num_entries The number of schema entries.
|
||||
*/
|
||||
#define NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, mask_add, num_entries) \
|
||||
nvtxPayloadSchemaAttr_t struct_id##Attr = { \
|
||||
/*.fieldMask = */NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | mask_add \
|
||||
NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | \
|
||||
NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | \
|
||||
NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE, \
|
||||
/*.name = */schema_name, \
|
||||
/*.type = */NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, \
|
||||
/*.flags = */schema_flags, \
|
||||
/*.entries = */struct_id##Schema, /*.numEntries = */num_entries, \
|
||||
/*.payloadStaticSize = */sizeof(struct_id), \
|
||||
/*.packAlign = */0, /*.schemaId = */schema_id};
|
||||
|
||||
|
||||
/*****************************************************************/
|
||||
/*** Helper for `NVTX_DEFINE_SCHEMA_FOR_STRUCT[_AND_REGISTER]` ***/
|
||||
|
||||
/* First part of schema entry for different number of arguments. */
|
||||
#define _NVTX_PAYLOAD_SCHEMA_EF2(member, etype) \
|
||||
0, NVTX_PAYLOAD_ENTRY_##etype, NULL, NULL, 0,
|
||||
#define _NVTX_PAYLOAD_SCHEMA_EF3(member, etype, name) \
|
||||
0, NVTX_PAYLOAD_ENTRY_##etype, name, NULL, 0,
|
||||
#define _NVTX_PAYLOAD_SCHEMA_EF4(member, etype, name, desc) \
|
||||
0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, 0,
|
||||
#define _NVTX_PAYLOAD_SCHEMA_EF5(member, etype, name, desc, arraylen) \
|
||||
0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen,
|
||||
#define _NVTX_PAYLOAD_SCHEMA_EF6(member, etype, name, desc, arraylen, flags) \
|
||||
NVTX_PAYLOAD_ENTRY_FLAG_##flags, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen,
|
||||
|
||||
#define _NVTX_PAYLOAD_SCHEMA_ENTRY_FRONT(...) \
|
||||
NVTX_EXT_CONCAT(_NVTX_PAYLOAD_SCHEMA_EF, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
|
||||
|
||||
/* Second part of schema entry (append struct member).
|
||||
At least two arguments (`member` and `etype`) are passed. */
|
||||
#define _NVTX_PAYLOAD_SCHEMA_ENTRY_END(member, ...) member
|
||||
|
||||
/* Resolve to schema entry. `entry` is `(member, etype, ...)`. */
|
||||
#define _NVTX_PAYLOAD_SCHEMA_ENTRY(struct_id, entry) \
|
||||
{_NVTX_PAYLOAD_SCHEMA_ENTRY_FRONT entry \
|
||||
offsetof(struct_id, _NVTX_PAYLOAD_SCHEMA_ENTRY_END entry)},
|
||||
|
||||
/* Handle up to 16 schema entries. */
|
||||
#define _NVTX_PAYLOAD_SME1(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1)
|
||||
#define _NVTX_PAYLOAD_SME2(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME1(s,__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_SME3(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME2(s,__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_SME4(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME3(s,__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_SME5(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME4(s,__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_SME6(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME5(s,__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_SME7(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME6(s,__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_SME8(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME7(s,__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_SME9(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME8(s,__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_SME10(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME9(s,__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_SME11(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME10(s,__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_SME12(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME11(s,__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_SME13(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME12(s,__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_SME14(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME13(s,__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_SME15(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME14(s,__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_SME16(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME15(s,__VA_ARGS__)
|
||||
|
||||
#define _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, ...) \
|
||||
nvtxPayloadSchemaEntry_t struct_id##Schema[] = { \
|
||||
NVTX_EXT_CONCAT(_NVTX_PAYLOAD_SME, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(struct_id, __VA_ARGS__) \
|
||||
{0, 0} \
|
||||
};
|
||||
|
||||
/*
|
||||
* Handle optional parameters for `NVTX_DEFINE_SCHEMA_FOR_STRUCT[_AND_REGISTER]`.
|
||||
*/
|
||||
#define _NVTX_DEFINE_S4S_6(struct_id, schema_name, prefix, schema_flags, schema_id, entries) \
|
||||
prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
|
||||
prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, \
|
||||
NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID |,\
|
||||
NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
|
||||
#define _NVTX_DEFINE_S4S_5(struct_id, schema_name, prefix, schema_flags, entries) \
|
||||
prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
|
||||
prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, 0, \
|
||||
NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS |, \
|
||||
NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
|
||||
#define _NVTX_DEFINE_S4S_4(struct_id, schema_name, prefix, entries) \
|
||||
prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
|
||||
prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, \
|
||||
NVTX_PAYLOAD_SCHEMA_ATTR_NAME |, \
|
||||
NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
|
||||
#define _NVTX_DEFINE_S4S_3(struct_id, schema_name, entries) \
|
||||
_NVTX_DEFINE_S4S_4(struct_id, schema_name, /*prefix*/, entries)
|
||||
#define _NVTX_DEFINE_S4S_2(struct_id, entries) \
|
||||
_NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
|
||||
NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, NULL, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, ,\
|
||||
NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
|
||||
|
||||
#define _NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, ...) \
|
||||
NVTX_EXT_CONCAT(_NVTX_DEFINE_S4S_, \
|
||||
NVTX_EXT_NUM_ARGS(struct_id, __VA_ARGS__))(struct_id, __VA_ARGS__)
|
||||
|
||||
/*** END: Helper for `NVTX_PAYLOAD_STATIC_SCHEMA_{DEFINE,SETUP}` ***/
|
||||
|
||||
|
||||
/******************************************************************/
|
||||
/*** Helper for `NVTX_DEFINE_STRUCT_WITH_SCHEMA[_AND_REGISTER]` ***/
|
||||
|
||||
/* Extract struct member for fixed-size arrays. */
|
||||
#define _NVTX_PAYLOAD_STRUCT_ARR_MEM1(name) name
|
||||
#define _NVTX_PAYLOAD_STRUCT_ARR_MEM2(name, count) name[count]
|
||||
|
||||
/* Extract type and member name and handle special case of fixed-size array. */
|
||||
#define _NVTX_PAYLOAD_STRUCT_E2(type, member) type member;
|
||||
#define _NVTX_PAYLOAD_STRUCT_E3(type, member, etype) type member;
|
||||
#define _NVTX_PAYLOAD_STRUCT_E4(type, member, etype, name) type member;
|
||||
#define _NVTX_PAYLOAD_STRUCT_E5(type, member, etype, name, desc) type member;
|
||||
#define _NVTX_PAYLOAD_STRUCT_E6(type, member, etype, name, desc, arraylen) \
|
||||
type NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT_ARR_MEM, NVTX_EXT_NUM_ARGS member) member;
|
||||
#define _NVTX_PAYLOAD_STRUCT_E7(type, member, etype, name, desc, arraylen, flags) \
|
||||
_NVTX_PAYLOAD_STRUCT_E6(type, member, etype, name, desc, arraylen)
|
||||
|
||||
/* Handle different number of arguments per struct entry. */
|
||||
#define _NVTX_PAYLOAD_STRUCT_ENTRY_(...) \
|
||||
NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT_E, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
|
||||
|
||||
/* Handle up to 16 struct members. */
|
||||
#define _NVTX_PAYLOAD_STRUCT_ENTRY(entry) _NVTX_PAYLOAD_STRUCT_ENTRY_ entry
|
||||
#define _NVTX_PAYLOAD_STRUCT1(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1)
|
||||
#define _NVTX_PAYLOAD_STRUCT2(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT1(__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_STRUCT3(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT2(__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_STRUCT4(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT3(__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_STRUCT5(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT4(__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_STRUCT6(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT5(__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_STRUCT7(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT6(__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_STRUCT8(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT7(__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_STRUCT9(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT8(__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_STRUCT10(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT9(__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_STRUCT11(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT10(__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_STRUCT12(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT11(__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_STRUCT13(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT12(__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_STRUCT14(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT13(__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_STRUCT15(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT14(__VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_STRUCT16(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT15(__VA_ARGS__)
|
||||
|
||||
/* Generate the typedef. */
|
||||
#define _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, ...) \
|
||||
typedef struct { \
|
||||
NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) \
|
||||
} struct_id;
|
||||
|
||||
/* Generate first part of the schema entry. */
|
||||
#define _NVTX_PAYLOAD_INIT_SCHEMA_N3(type, memberId, etype) \
|
||||
0, NVTX_PAYLOAD_ENTRY_##etype, NULL, NULL, 0,
|
||||
#define _NVTX_PAYLOAD_INIT_SCHEMA_N4(type, memberId, etype, name) \
|
||||
0, NVTX_PAYLOAD_ENTRY_##etype, name, NULL, 0,
|
||||
#define _NVTX_PAYLOAD_INIT_SCHEMA_N5(type, memberId, etype, name, desc) \
|
||||
0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, 0,
|
||||
#define _NVTX_PAYLOAD_INIT_SCHEMA_N6(type, memberId, etype, name, desc, arraylen) \
|
||||
0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen,
|
||||
#define _NVTX_PAYLOAD_INIT_SCHEMA_N7(type, memberId, etype, name, desc, arraylen, flags) \
|
||||
NVTX_PAYLOAD_ENTRY_FLAG_##flags, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen,
|
||||
|
||||
#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_FRONT(...) \
|
||||
NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SCHEMA_N, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
|
||||
|
||||
#define _NVTX_PAYLOAD_ARRAY_MEMBER1(name) name
|
||||
#define _NVTX_PAYLOAD_ARRAY_MEMBER2(name, count) name
|
||||
|
||||
/* Resolve to last part of schema entry (append struct member). */
|
||||
#define _NVTX_PAYLOAD_INIT_SCHEMA_NX3(type, memberId, ...) memberId
|
||||
#define _NVTX_PAYLOAD_INIT_SCHEMA_NX4(type, memberId, ...) memberId
|
||||
#define _NVTX_PAYLOAD_INIT_SCHEMA_NX5(type, memberId, ...) memberId
|
||||
#define _NVTX_PAYLOAD_INIT_SCHEMA_NX6(type, memberId, ...) \
|
||||
NVTX_EXT_CONCAT(_NVTX_PAYLOAD_ARRAY_MEMBER, NVTX_EXT_NUM_ARGS memberId) memberId
|
||||
#define _NVTX_PAYLOAD_INIT_SCHEMA_NX7(type, memberId, ...) \
|
||||
_NVTX_PAYLOAD_INIT_SCHEMA_NX6(type, memberId, __VA_ARGS__)
|
||||
|
||||
#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_END(...) \
|
||||
NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SCHEMA_NX, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
|
||||
|
||||
/* Resolve to schema entry. `entry` is `(type, memberId, etype, ...)`. */
|
||||
#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(struct_id, entry) \
|
||||
{_NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_FRONT entry \
|
||||
offsetof(struct_id, _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_END entry)},
|
||||
|
||||
/* Handle up to 16 schema entries. */
|
||||
#define _NVTX_PAYLOAD_INIT_SME1(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1)
|
||||
#define _NVTX_PAYLOAD_INIT_SME2(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME1(s, __VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_INIT_SME3(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME2(s, __VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_INIT_SME4(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME3(s, __VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_INIT_SME5(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME4(s, __VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_INIT_SME6(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME5(s, __VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_INIT_SME7(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME6(s, __VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_INIT_SME8(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME7(s, __VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_INIT_SME9(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME8(s, __VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_INIT_SME10(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME9(s, __VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_INIT_SME11(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME10(s, __VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_INIT_SME12(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME11(s, __VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_INIT_SME13(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME12(s, __VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_INIT_SME14(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME13(s, __VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_INIT_SME15(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME14(s, __VA_ARGS__)
|
||||
#define _NVTX_PAYLOAD_INIT_SME16(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME15(s, __VA_ARGS__)
|
||||
|
||||
#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, ...) \
|
||||
nvtxPayloadSchemaEntry_t struct_id##Schema[] = { \
|
||||
NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SME, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(struct_id, __VA_ARGS__) \
|
||||
{0, 0} \
|
||||
};
|
||||
|
||||
/*
|
||||
* Handle optional parameters for `NVTX_DEFINE_STRUCT_WITH_SCHEMA[_AND_REGISTER]`.
|
||||
*/
|
||||
#define _NVTX_DEFINE_SWS_6(struct_id, schema_name, prefix, schema_flags, schema_id, entries) \
|
||||
_NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
|
||||
prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
|
||||
prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, \
|
||||
NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS | \
|
||||
NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID |, \
|
||||
NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
|
||||
#define _NVTX_DEFINE_SWS_5(struct_id, schema_name, prefix, schema_flags, entries) \
|
||||
_NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
|
||||
prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
|
||||
prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, 0, \
|
||||
NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS |, \
|
||||
NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
|
||||
#define _NVTX_DEFINE_SWS_4(struct_id, schema_name, prefix, entries) \
|
||||
_NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
|
||||
prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
|
||||
prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, \
|
||||
NVTX_PAYLOAD_SCHEMA_ATTR_NAME |, \
|
||||
NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
|
||||
#define _NVTX_DEFINE_SWS_3(struct_id, schema_name, entries) \
|
||||
_NVTX_DEFINE_SWS_4(struct_id, schema_name, /* no prefix */, entries)
|
||||
#define _NVTX_DEFINE_SWS_2(struct_id, entries) \
|
||||
_NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
|
||||
_NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
|
||||
NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, NULL, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, , \
|
||||
NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
|
||||
|
||||
#define _NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, ...) \
|
||||
NVTX_EXT_CONCAT(_NVTX_DEFINE_SWS_, \
|
||||
NVTX_EXT_NUM_ARGS(struct_id, __VA_ARGS__))(struct_id, __VA_ARGS__)
|
||||
|
||||
/*** END: Helper for `NVTX_DEFINE_STRUCT_WITH_SCHEMA[_AND_REGISTER]` ***/
|
||||
|
||||
#endif /* NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H */
|
||||
+14
-6
@@ -10,14 +10,14 @@
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
typedef void* pointer_type;
|
||||
typedef void* nvtx_payload_pointer_type;
|
||||
|
||||
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
|
||||
#include <uchar.h>
|
||||
#include <stdalign.h>
|
||||
#endif
|
||||
|
||||
/* `alignof` is available as of C11 or C++11 */
|
||||
/* `alignof` is available as of C11 or C++11. */
|
||||
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || (defined(__cplusplus) && __cplusplus >= 201103L)
|
||||
|
||||
#define nvtx_alignof(type) alignof(type)
|
||||
@@ -54,7 +54,7 @@ MKTYPEDEF(double);
|
||||
MKTYPEDEF2(long double, longdouble);
|
||||
|
||||
MKTYPEDEF(size_t);
|
||||
MKTYPEDEF(pointer_type);
|
||||
MKTYPEDEF(nvtx_payload_pointer_type);
|
||||
|
||||
MKTYPEDEF(wchar_t);
|
||||
|
||||
@@ -85,8 +85,16 @@ MKTYPEDEF(wchar_t);
|
||||
/*
|
||||
* Helper array to get the alignment for each predefined C/C++ language type.
|
||||
* The order of entries must match the values in `enum nvtxPayloadSchemaEntryType`.
|
||||
*
|
||||
* In C++, `const` variables use internal linkage by default, but we need it to
|
||||
* be public (extern) since weak declarations must be public.
|
||||
*/
|
||||
const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] =
|
||||
NVTX_LINKONCE_DEFINE_GLOBAL
|
||||
#ifdef __cplusplus
|
||||
extern
|
||||
#endif
|
||||
const nvtxPayloadEntryTypeInfo_t
|
||||
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo)[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] =
|
||||
{
|
||||
/* The first entry contains this array's length and the size of each entry in this array. */
|
||||
{NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE, sizeof(nvtxPayloadEntryTypeInfo_t)},
|
||||
@@ -119,7 +127,7 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_
|
||||
/* NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE */ {sizeof(long double), nvtx_alignof2(long double, longdouble)},
|
||||
|
||||
/* NVTX_PAYLOAD_ENTRY_TYPE_SIZE */ {sizeof(size_t), nvtx_alignof(size_t)},
|
||||
/* NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS */ {sizeof(pointer_type), nvtx_alignof(pointer_type)},
|
||||
/* NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS */ {sizeof(nvtx_payload_pointer_type), nvtx_alignof(nvtx_payload_pointer_type)},
|
||||
|
||||
/*** Special character types ***/
|
||||
/* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)},
|
||||
@@ -140,4 +148,4 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_
|
||||
};
|
||||
|
||||
#undef nvtx_alignof
|
||||
#undef nvtx_alignof2
|
||||
#undef nvtx_alignof2
|
||||
@@ -10,37 +10,34 @@
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <wchar.h>
|
||||
|
||||
/* ---- Include required platform headers ---- */
|
||||
|
||||
#if defined(_WIN32)
|
||||
#if defined(_WIN32)
|
||||
|
||||
#include <Windows.h>
|
||||
#include <windows.h>
|
||||
|
||||
#else
|
||||
#include <unistd.h>
|
||||
|
||||
#if defined(__ANDROID__)
|
||||
#include <android/api-level.h>
|
||||
#include <android/api-level.h>
|
||||
#endif
|
||||
|
||||
#if defined(__linux__) || defined(__CYGWIN__)
|
||||
#include <sched.h>
|
||||
#endif
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <limits.h>
|
||||
#include <dlfcn.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <pthread.h>
|
||||
#include <stdlib.h>
|
||||
#include <wchar.h>
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -14,11 +14,11 @@
|
||||
|
||||
/* Prefer macros over inline functions to reduce symbol resolution at link time */
|
||||
|
||||
#if defined(_WIN32)
|
||||
#if defined(_WIN32)
|
||||
#define NVTX_PATHCHAR wchar_t
|
||||
#define NVTX_STR(x) L##x
|
||||
#define NVTX_GETENV _wgetenv
|
||||
#define NVTX_BUFSIZE MAX_PATH
|
||||
#define NVTX_BUFSIZE 16384
|
||||
#define NVTX_DLLHANDLE HMODULE
|
||||
#define NVTX_DLLOPEN(x) LoadLibraryW(x)
|
||||
#define NVTX_DLLFUNC GetProcAddress
|
||||
@@ -31,7 +31,7 @@
|
||||
#define NVTX_PATHCHAR char
|
||||
#define NVTX_STR(x) x
|
||||
#define NVTX_GETENV getenv
|
||||
#define NVTX_BUFSIZE PATH_MAX
|
||||
#define NVTX_BUFSIZE 16384
|
||||
#define NVTX_DLLHANDLE void*
|
||||
#define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
|
||||
#define NVTX_DLLFUNC dlsym
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
* In some situations it is desirable to declare a variable without initializing
|
||||
* it, refer to it in code or other variables' initializers, and then initialize
|
||||
* it later. Similarly, functions can be prototyped, have their address taken,
|
||||
* and then have their body defined later. In such cases, use the FWDDECL macros
|
||||
* and then have their body defined later. In such cases, use the FWDDECL macros
|
||||
* when forward-declaring LINKONCE global variables without initializers and
|
||||
* function prototypes, and then use the DEFINE macros when later defining them.
|
||||
* Although in many cases the FWDDECL macro is equivalent to the DEFINE macro,
|
||||
|
||||
@@ -1,86 +0,0 @@
|
||||
/*
|
||||
* Copyright 2021 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
#define NVTX_EXT_IMPL_GUARD
|
||||
#include "nvtxExtImpl.h"
|
||||
#undef NVTX_EXT_IMPL_GUARD
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \
|
||||
NAME##_v##VERSION##_mem##COMPATID
|
||||
#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \
|
||||
NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID)
|
||||
#define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \
|
||||
NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COMPATID_PAYLOAD)
|
||||
|
||||
/*
|
||||
* Function slots for the binary payload extension. First entry is the module
|
||||
* state, initialized to `0` (`NVTX_EXTENSION_FRESH`).
|
||||
*/
|
||||
NVTX_LINKONCE_DEFINE_GLOBAL intptr_t
|
||||
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_PAYLOAD_FN_NUM + 1]
|
||||
= {0};
|
||||
|
||||
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)()
|
||||
{
|
||||
intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1;
|
||||
nvtxExtModuleSegment_t segment = {
|
||||
0, // unused (only one segment)
|
||||
NVTX3EXT_CBID_PAYLOAD_FN_NUM,
|
||||
fnSlots
|
||||
};
|
||||
|
||||
nvtxExtModuleInfo_t module = {
|
||||
NVTX_VERSION, sizeof(nvtxExtModuleInfo_t),
|
||||
NVTX_EXT_MODULEID_PAYLOAD, NVTX_EXT_COMPATID_PAYLOAD,
|
||||
1, &segment, // number of segments, segments
|
||||
NULL, // no export function needed
|
||||
// bake type sizes and alignment information into program binary
|
||||
&nvtxExtPayloadTypeInfo
|
||||
};
|
||||
|
||||
NVTX_INFO( "%s\n", __FUNCTION__ );
|
||||
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module,
|
||||
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots));
|
||||
}
|
||||
|
||||
#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \
|
||||
typedef ret_val ( * fn_name##_impl_fntype )signature; \
|
||||
NVTX_LINKONCE_DEFINE_FUNCTION ret_val fn_name signature { \
|
||||
intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
|
||||
if (slot != NVTX_EXTENSION_DISABLED) { \
|
||||
if (slot) { \
|
||||
return (*(fn_name##_impl_fntype)slot) arg_names; \
|
||||
} else { \
|
||||
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \
|
||||
slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
|
||||
if (slot != NVTX_EXTENSION_DISABLED && slot) { \
|
||||
return (*(fn_name##_impl_fntype)slot) arg_names; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
return ((ret_val)(intptr_t)-1); \
|
||||
}
|
||||
|
||||
NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadSchemaRegister, (nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr), (domain, attr))
|
||||
|
||||
NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadEnumRegister, (nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr), (domain, attr))
|
||||
|
||||
#undef NVTX_EXT_FN_IMPL
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
@@ -10,6 +10,9 @@
|
||||
#define NCCL_P2P_H_
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include "core.h"
|
||||
|
||||
#if CUDART_VERSION < 12030
|
||||
// MNNVL: FABRIC handle support lifted from CUDA 12.3
|
||||
|
||||
+21
-5
@@ -16,13 +16,29 @@
|
||||
#include "shm.h"
|
||||
#include "p2p.h"
|
||||
|
||||
typedef enum : uint8_t {
|
||||
ncclPatternRing,
|
||||
ncclPatternRingTwice,
|
||||
ncclPatternPipelineFrom,
|
||||
ncclPatternPipelineTo,
|
||||
ncclPatternTreeUp,
|
||||
ncclPatternTreeDown,
|
||||
ncclPatternTreeUpDown,
|
||||
ncclPatternCollnetChain,
|
||||
ncclPatternCollnetDirect,
|
||||
ncclPatternNvls,
|
||||
ncclPatternNvlsTree,
|
||||
ncclPatternSend,
|
||||
ncclPatternRecv
|
||||
} ncclPattern_t;
|
||||
|
||||
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
|
||||
|
||||
struct ncclProxyArgs;
|
||||
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*);
|
||||
|
||||
#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
|
||||
static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
|
||||
static_assert(2*NCCL_MAX_DEV_WORK_P2P_PER_BATCH <= MAXCHANNELS, "Not enough sub space for max work elements");
|
||||
|
||||
union ncclProxyOpSpecifics {
|
||||
struct {
|
||||
@@ -124,8 +140,9 @@ struct ncclProxyArgs {
|
||||
|
||||
// ProxyOps are used to communicate between main thread and service thread
|
||||
// Make sure we have enough to store two full rounds of operations on all channels.
|
||||
// Otherwise we'd be unable to post half of them to free new elements.
|
||||
#define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P)
|
||||
// Otherwise we'd be unable to post half of them to free new elements. Each
|
||||
// p2p work contains a send and recv proxy op hence the 2x before it.
|
||||
#define MAX_OPS_PER_PEER (2*MAXCHANNELS*2*NCCL_MAX_DEV_WORK_P2P_PER_BATCH)
|
||||
|
||||
struct ncclProxyOpsPool {
|
||||
struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS];
|
||||
@@ -243,7 +260,7 @@ struct ncclProxyState {
|
||||
bool dmaBufSupport;
|
||||
ncclNet_t* ncclNet;
|
||||
ncclCollNet_t* ncclCollNet;
|
||||
volatile uint32_t* abortFlag;
|
||||
uint32_t* abortFlag;
|
||||
// Service threads
|
||||
pthread_t thread;
|
||||
pthread_t threadUDS;
|
||||
@@ -301,7 +318,6 @@ enum proxyMode {
|
||||
};
|
||||
|
||||
ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire);
|
||||
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp, int reg);
|
||||
ncclResult_t ncclProxyStart(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS);
|
||||
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
|
||||
|
||||
@@ -1,6 +1,11 @@
|
||||
#ifndef NCCL_REGISTER_H_
|
||||
#define NCCL_REGISTER_H_
|
||||
|
||||
#include "device.h"
|
||||
|
||||
#include <cuda.h>
|
||||
#include <stdint.h>
|
||||
|
||||
enum {
|
||||
NET_REG_COMPLETE = 0x01,
|
||||
NVLS_REG_COMPLETE = 0x02,
|
||||
|
||||
@@ -13,12 +13,14 @@
|
||||
#include "core.h"
|
||||
|
||||
#define NTRANSPORTS 4
|
||||
#define TRANSPORT_UNDEFINED -1
|
||||
#define TRANSPORT_P2P 0
|
||||
#define TRANSPORT_SHM 1
|
||||
#define TRANSPORT_NET 2
|
||||
#define TRANSPORT_COLLNET 3
|
||||
|
||||
#include "proxy.h"
|
||||
#include "comm.h"
|
||||
|
||||
extern struct ncclTransport p2pTransport;
|
||||
extern struct ncclTransport shmTransport;
|
||||
@@ -45,6 +47,7 @@ struct ncclPeerInfo {
|
||||
int cudaCompCap;
|
||||
// MNNVL support
|
||||
nvmlGpuFabricInfoV_t fabricInfo;
|
||||
int cuMemSupport;
|
||||
};
|
||||
|
||||
#define CONNECT_SIZE 128
|
||||
@@ -57,17 +60,21 @@ struct ncclConnect {
|
||||
#define NVLS_HANDLE_SIZE 64
|
||||
struct ncclNvlsSharedRes {
|
||||
int refCount;
|
||||
CUmulticastObjectProp properties;
|
||||
bool inited;
|
||||
CUmulticastObjectProp bufProp;
|
||||
CUmulticastObjectProp signalProp;
|
||||
CUmemAccessDesc accessDesc;
|
||||
int dev;
|
||||
size_t size;
|
||||
size_t granularity;
|
||||
CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer
|
||||
size_t buffSize;
|
||||
size_t creditSize;
|
||||
CUmemGenericAllocationHandle mcBuffHandle; // Multicast handle for NVLS buffer
|
||||
CUmemGenericAllocationHandle mcCreditHandle; // Multicast handle for NVLS credit buffer
|
||||
char* mcBuff; // Multicast NVLS buffer address
|
||||
CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
|
||||
char* mcCredit; // Multicast NVLS credit address
|
||||
CUmemGenericAllocationHandle ucBuffHandle; // Unicast Handle for NVLS buffer
|
||||
CUmemGenericAllocationHandle ucCreditHandle; // Unicast Handle for NVLS credit buffer
|
||||
char* ucBuff; // Unicast NVLS buffer address
|
||||
char shareableHandle[NVLS_HANDLE_SIZE];
|
||||
size_t ucGran;
|
||||
char* ucCredit; // Unicast NVLS credit address
|
||||
int nChannels;
|
||||
struct ncclShmemCollBuff nvlsShmem;
|
||||
void *nvlsShmemHandle;
|
||||
@@ -84,6 +91,7 @@ struct ncclCollNetSharedRes {
|
||||
void* resources;
|
||||
int nChannels;
|
||||
size_t buffSize;
|
||||
int intraHighestTransportType;
|
||||
};
|
||||
|
||||
struct ncclTransportComm {
|
||||
@@ -111,7 +119,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
|
||||
ncclResult_t ncclNvlsInit(struct ncclComm* comm);
|
||||
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
|
||||
ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
|
||||
ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm);
|
||||
ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm);
|
||||
ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts);
|
||||
ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
|
||||
ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size);
|
||||
ncclResult_t ncclNvlsFree(struct ncclComm* comm);
|
||||
@@ -121,6 +131,14 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
|
||||
ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm);
|
||||
ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufUsed, void** outHandle);
|
||||
ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, struct ncclKernelPlan *plan, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle);
|
||||
ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts);
|
||||
ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyconn, void* handle);
|
||||
|
||||
ncclResult_t ncclTransportRingConnect(struct ncclComm* comm);
|
||||
ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm);
|
||||
|
||||
ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* graphs[]);
|
||||
ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm);
|
||||
ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -9,14 +9,15 @@
|
||||
#define NCCL_INT_TUNER_H_
|
||||
|
||||
#include "nccl_tuner.h"
|
||||
#include "comm.h"
|
||||
|
||||
// Tuning plugin to override NCCL's default algorithm/protocol tuning.
|
||||
|
||||
// Attempts to load the NCCL tuner from an environment variable.
|
||||
// Returns ncclSuccess if the correct tuner symbol has been found and
|
||||
// successfully loaded. Otherwise returns an error and also logs the error.
|
||||
ncclResult_t ncclTunerPluginLoad(ncclTuner_t** tuner);
|
||||
ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm);
|
||||
|
||||
// Cleans up NCCL tuner plugin.
|
||||
ncclResult_t ncclTunerPluginUnload(ncclTuner_t** tuner);
|
||||
ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm);
|
||||
#endif
|
||||
|
||||
+39
-48
@@ -9,12 +9,14 @@
|
||||
|
||||
#include "nccl.h"
|
||||
#include "alloc.h"
|
||||
#include "bitops.h"
|
||||
#include "checks.h"
|
||||
#include <stdint.h>
|
||||
#include <time.h>
|
||||
#include <sched.h>
|
||||
#include <algorithm>
|
||||
#include <new>
|
||||
#include <type_traits>
|
||||
|
||||
int ncclCudaCompCap();
|
||||
|
||||
@@ -30,11 +32,6 @@ uint64_t getHostHash();
|
||||
uint64_t getPidHash();
|
||||
ncclResult_t getRandomData(void* buffer, size_t bytes);
|
||||
|
||||
const char* ncclOpToString(ncclRedOp_t op);
|
||||
const char* ncclDatatypeToString(ncclDataType_t type);
|
||||
const char* ncclAlgoToString(int algo);
|
||||
const char* ncclProtoToString(int proto);
|
||||
|
||||
struct netIf {
|
||||
char prefix[64];
|
||||
int port;
|
||||
@@ -44,9 +41,7 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList);
|
||||
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
|
||||
|
||||
static long log2i(long n) {
|
||||
long l = 0;
|
||||
while (n>>=1) l++;
|
||||
return l;
|
||||
return log2Down(n);
|
||||
}
|
||||
|
||||
inline uint64_t clockNano() {
|
||||
@@ -96,8 +91,11 @@ void ncclMemoryStackConstruct(struct ncclMemoryStack* me);
|
||||
void ncclMemoryStackDestruct(struct ncclMemoryStack* me);
|
||||
void ncclMemoryStackPush(struct ncclMemoryStack* me);
|
||||
void ncclMemoryStackPop(struct ncclMemoryStack* me);
|
||||
void* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t size, size_t align);
|
||||
template<typename T>
|
||||
T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n=1);
|
||||
template<typename Header, typename Element>
|
||||
inline Header* ncclMemoryStackAllocInlineArray(struct ncclMemoryStack* me, size_t nElt);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/* ncclMemoryPool: A free-list of same-sized allocations. It is invalid for
|
||||
@@ -140,11 +138,14 @@ T* ncclIntruQueueHead(ncclIntruQueue<T,next> *me);
|
||||
template<typename T, T *T::*next>
|
||||
void ncclIntruQueueEnqueue(ncclIntruQueue<T,next> *me, T *x);
|
||||
template<typename T, T *T::*next>
|
||||
void ncclIntruQueueEnqueueFront(ncclIntruQueue<T,next> *me, T *x);
|
||||
template<typename T, T *T::*next>
|
||||
T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me);
|
||||
template<typename T, T *T::*next>
|
||||
T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me);
|
||||
template<typename T, T *T::*next>
|
||||
void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *memPool);
|
||||
void ncclIntruQueueTransfer(ncclIntruQueue<T,next> *dst, ncclIntruQueue<T,next> *src);
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex"
|
||||
@@ -233,6 +234,12 @@ inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size,
|
||||
return obj;
|
||||
}
|
||||
|
||||
inline void* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t size, size_t align) {
|
||||
void *obj = ncclMemoryStack::allocate(me, size, align);
|
||||
memset(obj, 0, size);
|
||||
return obj;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) {
|
||||
void *obj = ncclMemoryStack::allocate(me, n*sizeof(T), alignof(T));
|
||||
@@ -240,6 +247,17 @@ inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) {
|
||||
return (T*)obj;
|
||||
}
|
||||
|
||||
template<typename Header, typename Element>
|
||||
inline Header* ncclMemoryStackAllocInlineArray(struct ncclMemoryStack* me, size_t nElt) {
|
||||
size_t size = sizeof(Header);
|
||||
size = (size + alignof(Element)-1) & -alignof(Element);
|
||||
size += nElt*sizeof(Element);
|
||||
size_t align = alignof(Header) < alignof(Element) ? alignof(Element) : alignof(Header);
|
||||
void *obj = ncclMemoryStack::allocate(me, size, align);
|
||||
memset(obj, 0, size);
|
||||
return (Header*)obj;
|
||||
}
|
||||
|
||||
inline void ncclMemoryStackPush(struct ncclMemoryStack* me) {
|
||||
using Frame = ncclMemoryStack::Frame;
|
||||
Frame tmp = me->topFrame;
|
||||
@@ -343,6 +361,13 @@ inline void ncclIntruQueueEnqueue(ncclIntruQueue<T,next> *me, T *x) {
|
||||
me->tail = x;
|
||||
}
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
inline void ncclIntruQueueEnqueueFront(ncclIntruQueue<T,next> *me, T *x) {
|
||||
if (me->head == nullptr) me->tail = x;
|
||||
x->*next = me->head;
|
||||
me->head = x;
|
||||
}
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
inline T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me) {
|
||||
T *ans = me->head;
|
||||
@@ -388,45 +413,11 @@ inline T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me) {
|
||||
}
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *pool) {
|
||||
T *head = me->head;
|
||||
me->head = nullptr;
|
||||
me->tail = nullptr;
|
||||
while (head != nullptr) {
|
||||
T *tmp = head->*next;
|
||||
ncclMemoryPoolFree(pool, tmp);
|
||||
head = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
/* cmp function determines the sequence of objects in the queue. If cmp returns value > 0, it means a > b,
|
||||
* and we should put a before b; otherwise, b should be put ahead of a. */
|
||||
template<typename T, T *T::*next>
|
||||
inline void ncclIntruQueueSortEnqueue(ncclIntruQueue<T,next> *me, T *x, int (*cmp)(T *a, T *b)) {
|
||||
T *cur = me->head;
|
||||
T *prev = NULL;
|
||||
|
||||
if (cur == NULL) {
|
||||
x->*next = nullptr;
|
||||
me->tail = me->head = x;
|
||||
} else {
|
||||
while (cur) {
|
||||
if (cmp(cur, x) > 0) {
|
||||
prev = cur;
|
||||
cur = cur->next;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
x->*next = cur;
|
||||
if (prev) {
|
||||
prev->*next = x;
|
||||
if (cur == NULL) me->tail = x;
|
||||
} else {
|
||||
me->head = x;
|
||||
}
|
||||
}
|
||||
void ncclIntruQueueTransfer(ncclIntruQueue<T,next> *dst, ncclIntruQueue<T,next> *src) {
|
||||
(dst->tail ? dst->tail->next : dst->head) = src->head;
|
||||
if (src->tail) dst->tail = src->tail;
|
||||
src->head = nullptr;
|
||||
src->tail = nullptr;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
+379
-540
File diff suppressed because it is too large
Load Diff
+7
-6
@@ -2,11 +2,11 @@
|
||||
#include "nvtx.h"
|
||||
|
||||
static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = {
|
||||
{"Sum", ncclSum},
|
||||
{"Product", ncclProd},
|
||||
{"Max", ncclMax},
|
||||
{"Min", ncclMin},
|
||||
{"Avg", ncclAvg}
|
||||
{"Sum", ncclSum, 0},
|
||||
{"Product", ncclProd, 0},
|
||||
{"Max", ncclMax, 0},
|
||||
{"Min", ncclMin, 0},
|
||||
{"Avg", ncclAvg, 0}
|
||||
};
|
||||
|
||||
// Must be called before the first call to any reduction operation.
|
||||
@@ -19,7 +19,8 @@ void initNvtxRegisteredEnums() {
|
||||
.entries = NvtxEnumRedSchema,
|
||||
.numEntries = std::extent<decltype(NvtxEnumRedSchema)>::value,
|
||||
.sizeOfEnum = sizeof(ncclRedOp_t),
|
||||
.schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP
|
||||
.schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP,
|
||||
.extension = nullptr
|
||||
};
|
||||
|
||||
nvtxPayloadEnumRegister(nvtx3::domain::get<nccl_domain>(), &eAttr);
|
||||
|
||||
@@ -52,8 +52,6 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
|
||||
WARN("%s : invalid type %d", info->opName, info->datatype);
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
// Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars.
|
||||
NCCLCHECK(ncclInfoSetDerived(info, info->comm->nRanks));
|
||||
|
||||
if (info->op < 0 || ncclMaxRedOp < info->op) {
|
||||
WARN("%s : invalid reduction operation %d", info->opName, info->op);
|
||||
|
||||
@@ -59,6 +59,10 @@ DECLARE_CUDA_PFN(cuGetErrorString);
|
||||
DECLARE_CUDA_PFN(cuGetErrorName);
|
||||
/* enqueue.cc */
|
||||
DECLARE_CUDA_PFN(cuMemGetAddressRange);
|
||||
DECLARE_CUDA_PFN(cuLaunchKernel);
|
||||
#if CUDA_VERSION >= 11080
|
||||
DECLARE_CUDA_PFN(cuLaunchKernelEx);
|
||||
#endif
|
||||
/* proxy.cc */
|
||||
DECLARE_CUDA_PFN(cuCtxCreate);
|
||||
DECLARE_CUDA_PFN(cuCtxDestroy);
|
||||
@@ -137,6 +141,10 @@ static ncclResult_t cudaPfnFuncLoader(void) {
|
||||
LOAD_SYM(cuCtxGetCurrent, 1);
|
||||
LOAD_SYM(cuCtxSetCurrent, 1);
|
||||
LOAD_SYM(cuCtxGetDevice, 1);
|
||||
LOAD_SYM(cuLaunchKernel, 1);
|
||||
#if CUDA_VERSION >= 11080
|
||||
LOAD_SYM(cuLaunchKernelEx, 1);
|
||||
#endif
|
||||
/* cuMem API support */
|
||||
LOAD_SYM(cuMemAddressReserve, 1);
|
||||
LOAD_SYM(cuMemAddressFree, 1);
|
||||
|
||||
@@ -130,7 +130,7 @@ ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint6
|
||||
int ret;
|
||||
GDRLOCKCALL(gdr_internal_pin_buffer(g, addr, size, p2p_token, va_space, handle), ret);
|
||||
if (ret != 0) {
|
||||
WARN("gdr_pin_buffer(addr %lx, size %zi) failed: %d", addr, size, ret);
|
||||
WARN("gdr_pin_buffer(addr %lx, size %zu) failed: %d", addr, size, ret);
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -172,7 +172,7 @@ ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) {
|
||||
int ret;
|
||||
GDRLOCKCALL(gdr_internal_map(g, handle, va, size), ret);
|
||||
if (ret != 0) {
|
||||
WARN("gdr_map(handle %lx, size %zi) failed: %d", handle.h, size, ret);
|
||||
WARN("gdr_map(handle %lx, size %zu) failed: %d", handle.h, size, ret);
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -186,7 +186,7 @@ ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) {
|
||||
int ret;
|
||||
GDRLOCKCALL(gdr_internal_unmap(g, handle, va, size), ret);
|
||||
if (ret != 0) {
|
||||
WARN("gdr_unmap(handle %lx, va %p, size %zi) failed: %d", handle.h, va, size, ret);
|
||||
WARN("gdr_unmap(handle %lx, va %p, size %zu) failed: %d", handle.h, va, size, ret);
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -218,7 +218,7 @@ ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const vo
|
||||
int ret;
|
||||
GDRLOCKCALL(gdr_internal_copy_to_mapping(handle, map_d_ptr, h_ptr, size), ret);
|
||||
if (ret != 0) {
|
||||
WARN("gdr_copy_to_mapping(handle %lx, map_d_ptr %p, h_ptr %p, size %zi) failed: %d", handle.h, map_d_ptr, h_ptr, size, ret);
|
||||
WARN("gdr_copy_to_mapping(handle %lx, map_d_ptr %p, h_ptr %p, size %zu) failed: %d", handle.h, map_d_ptr, h_ptr, size, ret);
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -232,7 +232,7 @@ ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void
|
||||
int ret;
|
||||
GDRLOCKCALL(gdr_internal_copy_from_mapping(handle, h_ptr, map_d_ptr, size), ret);
|
||||
if (ret != 0) {
|
||||
WARN("gdr_copy_from_mapping(handle %lx, h_ptr %p, map_d_ptr %p, size %zi) failed: %d", handle.h, h_ptr, map_d_ptr, size, ret);
|
||||
WARN("gdr_copy_from_mapping(handle %lx, h_ptr %p, map_d_ptr %p, size %zu) failed: %d", handle.h, h_ptr, map_d_ptr, size, ret);
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -132,7 +132,7 @@ ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
|
||||
WARN("UDS: Receiving data over socket failed : %d", errno);
|
||||
return ncclSystemError;
|
||||
}
|
||||
if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;
|
||||
if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError;
|
||||
}
|
||||
|
||||
if (recvFd != NULL) {
|
||||
@@ -221,7 +221,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
|
||||
WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno);
|
||||
return ncclSystemError;
|
||||
}
|
||||
if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;
|
||||
if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError;
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -41,11 +41,19 @@ namespace {
|
||||
NCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values))
|
||||
// MNNVL support
|
||||
NCCL_NVML_FN(nvmlDeviceGetGpuFabricInfoV, nvmlReturn_t, (nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo))
|
||||
// CC support
|
||||
NCCL_NVML_FN(nvmlSystemGetConfComputeState, nvmlReturn_t, (nvmlConfComputeSystemState_t *state));
|
||||
NCCL_NVML_FN(nvmlSystemGetConfComputeSettings, nvmlReturn_t, (nvmlSystemConfComputeSettings_t *setting));
|
||||
|
||||
std::mutex lock; // NVML has had some thread safety bugs
|
||||
bool initialized = false;
|
||||
thread_local bool threadInitialized = false;
|
||||
ncclResult_t initResult;
|
||||
|
||||
union nvmlCCInfoInternal {
|
||||
nvmlConfComputeSystemState_t settingV12020;
|
||||
nvmlSystemConfComputeSettings_t settingV12040;
|
||||
};
|
||||
}
|
||||
|
||||
ncclResult_t ncclNvmlEnsureInitialized() {
|
||||
@@ -87,6 +95,9 @@ ncclResult_t ncclNvmlEnsureInitialized() {
|
||||
{(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"},
|
||||
// MNNVL support
|
||||
{(void**)&pfn_nvmlDeviceGetGpuFabricInfoV, "nvmlDeviceGetGpuFabricInfoV"},
|
||||
// CC support
|
||||
{(void**)&pfn_nvmlSystemGetConfComputeState, "nvmlSystemGetConfComputeState"},
|
||||
{(void**)&pfn_nvmlSystemGetConfComputeSettings, "nvmlSystemGetConfComputeSettings"}
|
||||
};
|
||||
for(Symbol sym: symbols) {
|
||||
*sym.ppfn = dlsym(libhandle, sym.name);
|
||||
@@ -282,3 +293,33 @@ ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricI
|
||||
NVMLTRY(nvmlDeviceGetGpuFabricInfoV, device, gpuFabricInfo);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status) {
|
||||
NCCLCHECK(ncclNvmlEnsureInitialized());
|
||||
std::lock_guard<std::mutex> locked(lock);
|
||||
nvmlCCInfoInternal ccInfo;
|
||||
if (pfn_nvmlSystemGetConfComputeSettings != NULL) {
|
||||
ccInfo.settingV12040.version = nvmlSystemConfComputeSettings_v1;
|
||||
NVMLTRY(nvmlSystemGetConfComputeSettings, &ccInfo.settingV12040);
|
||||
if (ccInfo.settingV12040.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED)
|
||||
status->CCEnabled = true;
|
||||
else
|
||||
status->CCEnabled = false;
|
||||
|
||||
if (ccInfo.settingV12040.multiGpuMode == NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE)
|
||||
status->multiGpuCCEnabled = true;
|
||||
else
|
||||
status->multiGpuCCEnabled = false;
|
||||
} else if (pfn_nvmlSystemGetConfComputeState != NULL) {
|
||||
NVMLTRY(nvmlSystemGetConfComputeState, &ccInfo.settingV12020);
|
||||
if (ccInfo.settingV12020.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED)
|
||||
status->CCEnabled = true;
|
||||
else
|
||||
status->CCEnabled = false;
|
||||
status->multiGpuCCEnabled = false;
|
||||
} else {
|
||||
status->CCEnabled = false;
|
||||
status->multiGpuCCEnabled = false;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -84,4 +84,4 @@ const char *ncclGetEnv(const char *name) {
|
||||
static pthread_once_t once = PTHREAD_ONCE_INIT;
|
||||
pthread_once(&once, initEnv);
|
||||
return getenv(name);
|
||||
}
|
||||
}
|
||||
|
||||
+25
-9
@@ -63,13 +63,28 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
|
||||
* goes down to 0, unlink should be called in order to delete shared memory file. */
|
||||
if (shmPath[0] == '\0') {
|
||||
sprintf(shmPath, "/dev/shm/nccl-XXXXXX");
|
||||
retry_mkstemp:
|
||||
fd = mkstemp(shmPath);
|
||||
if (fd < 0) {
|
||||
if (errno == EINTR) {
|
||||
INFO(NCCL_ALL, "mkstemp: Failed to create %s, error: %s (%d) - retrying", shmPath, strerror(errno), errno);
|
||||
goto retry_mkstemp;
|
||||
}
|
||||
WARN("Error: failed to create shared memory file %p, error %s (%d)", shmPath, strerror(errno), errno);
|
||||
ret = ncclSystemError;
|
||||
goto fail;
|
||||
}
|
||||
} else {
|
||||
SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
|
||||
}
|
||||
|
||||
retry_fallocate:
|
||||
if (fallocate(fd, 0, 0, realShmSize) != 0) {
|
||||
WARN("Error: failed to extend %s to %ld bytes", shmPath, realShmSize);
|
||||
if (errno == EINTR) {
|
||||
INFO(NCCL_ALL, "fallocate: Failed to extend %s to %ld bytes, error: %s (%d) - retrying", shmPath, realShmSize, strerror(errno), errno);
|
||||
goto retry_fallocate;
|
||||
}
|
||||
WARN("Error: failed to extend %s to %ld bytes, error: %s (%d)", shmPath, realShmSize, strerror(errno), errno);
|
||||
ret = ncclSystemError;
|
||||
goto fail;
|
||||
}
|
||||
@@ -80,7 +95,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
|
||||
|
||||
hptr = (char*)mmap(NULL, realShmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
if (hptr == MAP_FAILED) {
|
||||
WARN("Could not map %s size %zi, error: %s", shmPath, realShmSize, strerror(errno));
|
||||
WARN("Error: Could not map %s size %zu, error: %s (%d)", shmPath, realShmSize, strerror(errno), errno);
|
||||
ret = ncclSystemError;
|
||||
hptr = NULL;
|
||||
goto fail;
|
||||
@@ -93,7 +108,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
|
||||
if (remref == 0) {
|
||||
/* the last peer has completed attachment, it should unlink the shm mem file. */
|
||||
if (unlink(shmPath) != 0) {
|
||||
WARN("unlink shared memory %s failed, error: %s", shmPath, strerror(errno));
|
||||
INFO(NCCL_ALLOC, "unlink shared memory %s failed, error: %s (%d)", shmPath, strerror(errno), errno);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -110,7 +125,8 @@ exit:
|
||||
*handle = (ncclShmHandle_t)tmphandle;
|
||||
return ret;
|
||||
fail:
|
||||
WARN("Error while %s shared memory segment %s (size %ld)", create ? "creating" : "attaching to", shmPath, shmSize);
|
||||
WARN("Error while %s shared memory segment %s (size %ld), error: %s (%d)", create ? "creating" : "attaching to",
|
||||
shmPath, shmSize, strerror(errno), errno);
|
||||
if (tmphandle) {
|
||||
shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle);
|
||||
ncclShmClose((ncclShmHandle_t)tmphandle);
|
||||
@@ -129,7 +145,7 @@ ncclResult_t ncclShmClose(ncclShmHandle_t handle) {
|
||||
close(tmphandle->fd);
|
||||
if (tmphandle->shmPath != NULL && tmphandle->refcount != NULL && *tmphandle->refcount > 0) {
|
||||
if (unlink(tmphandle->shmPath) != 0) {
|
||||
WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno));
|
||||
WARN("unlink shared memory %s failed, error: %s (%d)", tmphandle->shmPath, strerror(errno), errno);
|
||||
ret = ncclSystemError;
|
||||
}
|
||||
}
|
||||
@@ -139,7 +155,7 @@ ncclResult_t ncclShmClose(ncclShmHandle_t handle) {
|
||||
if (tmphandle->shmPtr) {
|
||||
if (tmphandle->devShmPtr) CUDACHECK(cudaHostUnregister(tmphandle->shmPtr));
|
||||
if (munmap(tmphandle->shmPtr, tmphandle->realShmSize) != 0) {
|
||||
WARN("munmap of shared memory %p size %ld failed, error: %s", tmphandle->shmPtr, tmphandle->realShmSize, strerror(errno));
|
||||
WARN("munmap of shared memory %p size %ld failed, error: %s (%d)", tmphandle->shmPtr, tmphandle->realShmSize, strerror(errno), errno);
|
||||
ret = ncclSystemError;
|
||||
}
|
||||
}
|
||||
@@ -152,9 +168,9 @@ ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct shmHandleInternal* tmphandle = (struct shmHandleInternal*)handle;
|
||||
if (tmphandle) {
|
||||
if (tmphandle->shmPath != NULL) {
|
||||
if (tmphandle->shmPath != NULL && tmphandle->refcount != NULL && *tmphandle->refcount > 0) {
|
||||
if (unlink(tmphandle->shmPath) != 0) {
|
||||
WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno));
|
||||
WARN("unlink shared memory %s failed, error: %s (%d)", tmphandle->shmPath, strerror(errno), errno);
|
||||
ret = ncclSystemError;
|
||||
}
|
||||
free(tmphandle->shmPath);
|
||||
@@ -184,7 +200,7 @@ ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff
|
||||
uint64_t t0 = clockNano();
|
||||
while(__atomic_load_n(shmem->cnt[curRound], __ATOMIC_ACQUIRE) != comm->localRanks + 1) {
|
||||
if (clockNano() - t0 >= 5 * 1000) sched_yield();
|
||||
if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 1) {
|
||||
if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE) == 1) {
|
||||
ret = ncclInternalError;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user