Rework core for NVIDIA Trusted Computing
 * Compress work structs so that they are shared between channels
 * Utilize the full amount of kernel argument space permitted (4k)
   before resorting to work fifo.
 * Rework the task preprocessing phase.
 * Use a separate abortDevFlag which is kept in sync with abortFlag
   using cudaMemcpy operations.
 * Rename src/include/align.h to src/include/bitops.h

Add lazy connection establishment for collective operations
 * Move buffer allocation and connection establishment to the first
   collective operation using that algorithm.
 * Accelerate init time and reduce memory usage.
 * Avoid allocating NVLS buffers if all calls are registered.
 * Compute algo/proto in ncclLaunchCollTasksInfo early on.
 * Connect peers in ncclCollPreconnectFunc if not connected already.
 * Also move shared buffer creation to the first send/recv call.

Accelerate intra-node NVLink detection
 * Make each rank only detect NVLinks attached to its GPU.
 * Fuse XMLs to reconstruct the full NVLink topology

Add init profiling to report time spent in different init phases.
 * Report timings of bootstrap, allgather, search, connect, etc.
 * Add new "PROFILE" category for NCCL_DEBUG_SUBSYS.

Add support for PCI p2p on split PCI switches
 * Detect split PCI switches through a kernel module exposing
   switch information.
 * Update the topology XML and graph to add those inter-switch
   connections.

Add cost estimation API
 * Add a new ncclGroupEndSimulate primitive to return the estimated
   time a group would take.

Net/IB: Add separate traffic class for fifo messages
 * Add NCCL_IB_FIFO_TC to control the traffic class of fifo messages
   independently from NCCL_IB_TC.
   Merges PR #1194

Net/IB: Add support for IB router
 * Use flid instead of lid if subnets do not match
 * Warn if flid is 0

Optimizations and fixes for device network offload (unpack)
 * Double the default number of channels
 * Cache netDeviceType
 * Fix save/increment head logic to enable Tree support.

Support ncclGroupStart/End for ncclCommAbort/Destroy
 * Allow Abort/Destroy to be called within a group when managing
   multiple GPUs with a single process.

Improve Tuner API
 * Provide to the plugin the original cost table so that the plugin
   can leave unknown or disabled algo/proto combinations untouched.
 * Remove nvlsSupport and collnetSupport.

Do not print version to stdout when using a debug file
 * Also print version from all processes with INFO debug level.
   Fixes issue #1271

Fix clang warnings in NVTX headers
 * Update NVTX headers to the latest version
   Fixes issue #1270

Disable port fusion in heterogeneous systems
 * Do not fuse ports if a mix of multi-port and single port are detected.

Fix NVLS graphs search for dual NICs.
 * Fix NVLS graph search when we have more than one NIC per GPU.

Fix crash with collnetDirect
 * Add separate graph search for collnetDirect, testing alltoall paths
   and working similarly to the NVLS search.

Fix hang when nodes have different CPU types
 * Add the CPU type to the rank peer info.
 * Align all ranks on the CPU type after the first allgather.
 * Only use the aligned CPU type for all tuning operations.
   Fixes issue #1136
   Fixes issue #1184

Fix performance of registered send/recv operations
 * Allow for single full size operations
 * Add INFO to confirm the registration of send/recv buffers.

Move all sync ops to finalize stage
 * Ensure ncclCommDestroy is non-blocking if ncclCommFinalize has
   been called.

Improve error reporting during SHM segment creation

Improve support of various compilers
   Merges PR #1177
   Merges PR #1228

Allow net and tuner plugins to be statically linked
 * Search for ncclNet or ncclTuner symbols in the main binary.
   Merges PR #979

Plugin examples includes cleanup
 * Harmonize err.h and common.h usage.
 * Add mixed plugin with both net and tuner.
Tento commit je obsažen v:
Sylvain Jeaugey
2024-06-11 01:28:01 -07:00
rodič 529ee691c3
revize 178b6b7590
115 změnil soubory, kde provedl 8672 přidání a 4403 odebrání
+15
Zobrazit soubor
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COMMON_H_
#define COMMON_H_
// Log levels, numbered 0 (NCCL_LOG_NONE) through 5 (NCCL_LOG_TRACE).
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
// Logging subsystem identifiers. Every named subsystem is a distinct power of two,
// so values can be OR-ed into a mask; NCCL_ALL (~0) has every bit set.
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
// Pointer type of the logging callback: level, flags, source file name, source line,
// then a format string followed by its variadic arguments.
// NOTE(review): flags presumably carries ncclDebugLogSubSys bits - confirm against the caller.
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
#endif
+1
Zobrazit soubor
@@ -11,6 +11,7 @@ typedef enum { ncclSuccess = 0,
ncclSystemError = 2,
ncclInternalError = 3,
ncclInvalidArgument = 4,
ncclInvalidUsage = 5,
ncclRemoteError = 6 } ncclResult_t;
#endif
+1 -5
Zobrazit soubor
@@ -8,6 +8,7 @@
#include <stdint.h>
#include <stdlib.h>
#include "common.h"
#include "err.h"
#define NCCL_NET_HANDLE_MAXSIZE 128
@@ -19,11 +20,6 @@
// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 32
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
#include "net_v8.h"
#include "net_v7.h"
#include "net_v6.h"
+2 -2
Zobrazit soubor
@@ -2,8 +2,8 @@
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NCCL_ERR_H_
#define NCCL_ERR_H_
#ifndef NCCL_TYPES_H_
#define NCCL_TYPES_H_
/* Data types */
typedef enum { ncclInt8 = 0, ncclChar = 0,
+15
Zobrazit soubor
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COMMON_H_
#define COMMON_H_
// Log levels, numbered 0 (NCCL_LOG_NONE) through 5 (NCCL_LOG_TRACE).
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
// Logging subsystem identifiers. Every named subsystem is a distinct power of two,
// so values can be OR-ed into a mask; NCCL_ALL (~0) has every bit set.
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
// Pointer type of the logging callback: level, flags, source file name, source line,
// then a format string followed by its variadic arguments.
// NOTE(review): flags presumably carries ncclDebugLogSubSys bits - confirm against the caller.
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
#endif
+17
Zobrazit soubor
@@ -0,0 +1,17 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NCCL_ERR_H_
#define NCCL_ERR_H_
/* Error type for plugins.
 * ncclSuccess is 0; every other enumerator is a distinct non-zero code (1..6),
 * so a result can be tested for failure by comparing against ncclSuccess. */
typedef enum { ncclSuccess = 0,
ncclUnhandledCudaError = 1,
ncclSystemError = 2,
ncclInternalError = 3,
ncclInvalidArgument = 4,
ncclInvalidUsage = 5,
ncclRemoteError = 6 } ncclResult_t;
#endif
+28 -15
Zobrazit soubor
@@ -8,15 +8,24 @@
#ifndef NCCL_TUNER_H_
#define NCCL_TUNER_H_
#include "nccl.h"
#include <stdint.h>
#include <stdlib.h>
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
#include "common.h"
#include "err.h"
#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
// Operation identifiers with explicitly pinned numeric values.
// ncclNumFuncs is not an operation: its value (8) equals the number of
// identifiers listed before it.
typedef enum {
ncclFuncBroadcast = 0,
ncclFuncReduce = 1,
ncclFuncAllGather = 2,
ncclFuncReduceScatter = 3,
ncclFuncAllReduce = 4,
ncclFuncSendRecv = 5,
ncclFuncSend = 6,
ncclFuncRecv = 7,
ncclNumFuncs = 8
} ncclFunc_t;
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define NCCL_ALGO_UNDEF -1
@@ -33,6 +42,8 @@ typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncRed
#define NCCL_PROTO_LL128 1
#define NCCL_PROTO_SIMPLE 2
#define NCCL_ALGO_PROTO_IGNORE -1.0
// API to be implemented by external tuner
typedef struct {
// Name of the tuner
@@ -52,31 +63,33 @@ typedef struct {
// - context: tuner context object
// - collType: collective type , e.g., allreduce, allgather…
// - nBytes: collective size in bytes
// - collNetSupport: whether collnet supports this type
// - nvlsSupport: whether nvlink sharp supports this time
// - numPipeOps: number of operations in the group
// - numAlgo: number of algorithms in collCostTable
// - numProto: number of protocols in collCostTable
//
// Outputs:
// - algorithm: selected algorithm to be used for the given collective
// - protocol: selected protocol to be used for the given collective
// - nChannels: number of channels (hence SMs) to be used.
//
// InOut:
// - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
// NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
//
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
// default tuning for the given collective.
// Also, the plugin is allowed to not set any output, or set only the
// algorithm and protocol, but not only the algorithm or only the protocol.
// Unset fields will be set automatically by NCCL.
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
int collNetSupport, int nvlsSupport, int numPipeOps,
int *algorithm, int *protocol, int* nChannels);
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
int* nChannels);
// Terminates the plugin and cleans up any resources that the plugin allocated.
// context: tuner context object
ncclResult_t (*destroy)(void* context);
} ncclTuner_v2_t;
} ncclTuner_v3_t;
typedef ncclTuner_v2_t ncclTuner_t;
typedef ncclTuner_v3_t ncclTuner_t;
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"
#endif
+10 -3
Zobrazit soubor
@@ -11,14 +11,21 @@
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; }
__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
int collNetSupport, int nvlsSupport, int numPipeOps,
int *algorithm, int *protocol, int* nChannels) { *algorithm = NCCL_ALGO_RING; *protocol = NCCL_PROTO_SIMPLE; return ncclSuccess; }
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
int* nChannels) {
// Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo
if (collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
}
*nChannels = 1;
return ncclSuccess;
}
__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }
#define PLUGIN_NAME "Example"
const ncclTuner_v2_t ncclTunerPlugin_v2 = {
const ncclTuner_v3_t ncclTunerPlugin_v3 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.getCollInfo = pluginGetCollInfo,
+2 -2
Zobrazit soubor
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 21
NCCL_PATCH := 5
NCCL_MINOR := 22
NCCL_PATCH := 3
NCCL_SUFFIX :=
PKG_REVISION := 1
+3 -2
Zobrazit soubor
@@ -201,7 +201,6 @@ ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFrom
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
memset(handle, 0, sizeof(ncclBootstrapHandle));
NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
const char* env = ncclGetEnv("NCCL_COMM_ID");
if (env) {
@@ -210,7 +209,9 @@ ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
handle->magic = NCCL_MAGIC;
} else {
NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
NCCLCHECK(bootstrapCreateRoot(handle, false));
}
@@ -626,7 +627,7 @@ ncclResult_t bootstrapClose(void* commState) {
struct bootstrapState* state = (struct bootstrapState*)commState;
if (state->unexpectedConnections != NULL) {
unexpectedFree(state);
if (__atomic_load_n(state->abortFlag, __ATOMIC_RELAXED) == 0) {
if (__atomic_load_n(state->abortFlag, __ATOMIC_ACQUIRE) == 0) {
WARN("Unexpected connections are not empty");
return ncclInternalError;
}
+5 -3
Zobrazit soubor
@@ -7,16 +7,17 @@
#include "channel.h"
#include "param.h"
#include "gdrwrap.h"
#include "transport.h"
ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
struct ncclChannel* channel = &comm->channels[channelId];
if (channel->id != -1) return ncclSuccess;
int nRanks = comm->nRanks;
int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks;
int nvlsRanks = comm->localRanks;
int nPeers = nRanks + 1 /* Collnet */ + nvlsRanks /* NVLS */;
channel->id = channelId;
channel->workFifoSent = 0;
channel->workFifoProduced = 0;
struct ncclSharedResources* sharedRes = comm->sharedRes;
@@ -74,7 +75,8 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks;
int nvlsRanks = comm->localRanks;
if (share) {
channel->nvlsPeers = parent->channels[channelId].nvlsPeers;
channel->nvlsDevPeers = parent->channels[channelId].nvlsDevPeers;
+63
Zobrazit soubor
@@ -9,6 +9,69 @@
#include "enqueue.h"
#include "nccl.h"
// Maps an ncclFunc_t value to a printable name.
// Any value without an explicit entry below yields "Invalid".
const char* ncclFuncToString(ncclFunc_t fn) {
  const char* label = "Invalid";
  switch (fn) {
  case ncclFuncBroadcast:     label = "Broadcast";     break;
  case ncclFuncReduce:        label = "Reduce";        break;
  case ncclFuncAllGather:     label = "AllGather";     break;
  case ncclFuncReduceScatter: label = "ReduceScatter"; break;
  case ncclFuncAllReduce:     label = "AllReduce";     break;
  case ncclFuncSendRecv:      label = "SendRecv";      break;
  case ncclFuncSend:          label = "Send";          break;
  case ncclFuncRecv:          label = "Recv";          break;
  default:                    break;
  }
  return label;
}
// Maps a device-side reduction operator to a printable name.
// Operators without an entry below yield "Unknown".
const char* ncclDevRedOpToString(ncclDevRedOp_t op) {
  if (op == ncclDevSum)        return "Sum";
  if (op == ncclDevProd)       return "Prod";
  if (op == ncclDevMinMax)     return "MinMax";
  if (op == ncclDevPreMulSum)  return "PreMulSum";
  if (op == ncclDevSumPostDiv) return "SumPostDiv";
  return "Unknown";
}
// Maps an NCCL datatype to its printable enumerator name.
//
// Enumerators that share a numeric value with one listed here (for example
// ncclChar, which equals ncclInt8) print under the listed name. ncclBfloat16
// is only handled when the CUDA headers provide bfloat16 types. Any other
// value yields "Unknown".
const char* ncclDatatypeToString(ncclDataType_t type) {
  switch (type) {
  case ncclInt8: return "ncclInt8";
  // ncclUint8 is a valid public datatype; without this case it was reported as "Unknown".
  case ncclUint8: return "ncclUint8";
  case ncclInt32: return "ncclInt32";
  case ncclUint32: return "ncclUint32";
  case ncclInt64: return "ncclInt64";
  case ncclUint64: return "ncclUint64";
  case ncclFloat16: return "ncclFloat16";
  case ncclFloat32: return "ncclFloat32";
  case ncclFloat64: return "ncclFloat64";
#if defined(__CUDA_BF16_TYPES_EXIST__)
  case ncclBfloat16: return "ncclBfloat16";
#endif
  default: return "Unknown";
  }
}
// Maps an NCCL_ALGO_* identifier to a printable name.
// Identifiers that are not in the table yield "Unknown".
const char* ncclAlgoToString(int algo) {
  static const struct { int id; const char* label; } kAlgoLabels[] = {
    { NCCL_ALGO_TREE,           "TREE" },
    { NCCL_ALGO_RING,           "RING" },
    { NCCL_ALGO_COLLNET_DIRECT, "COLLNET_DIRECT" },
    { NCCL_ALGO_COLLNET_CHAIN,  "COLLNET_CHAIN" },
    { NCCL_ALGO_NVLS,           "NVLS" },
    { NCCL_ALGO_NVLS_TREE,      "NVLS_TREE" },
  };
  for (const auto& entry : kAlgoLabels) {
    if (entry.id == algo) return entry.label;
  }
  return "Unknown";
}
// Maps an NCCL_PROTO_* identifier to a printable name.
// Identifiers other than LL, LL128 and SIMPLE yield "Unknown".
const char* ncclProtoToString(int proto) {
  if (proto == NCCL_PROTO_LL)     return "LL";
  if (proto == NCCL_PROTO_LL128)  return "LL128";
  if (proto == NCCL_PROTO_SIMPLE) return "SIMPLE";
  return "Unknown";
}
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+29 -18
Zobrazit soubor
@@ -8,7 +8,10 @@
#include "nccl_net.h"
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <strings.h>
#include <sys/syscall.h>
#include <chrono>
#include "param.h"
int ncclDebugLevel = -1;
@@ -16,14 +19,15 @@ static int pid = -1;
static char hostname[1024];
thread_local int ncclDebugNoWarn = 0;
char ncclLastError[1024] = ""; // Global string for the last error in human readable form
uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV
static uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV
FILE *ncclDebugFile = stdout;
pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
std::chrono::steady_clock::time_point ncclEpoch;
static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
static std::chrono::steady_clock::time_point ncclEpoch;
static bool ncclWarnSetDebugInfo = false;
static __thread int tid = -1;
void ncclDebugInit() {
static void ncclDebugInit() {
pthread_mutex_lock(&ncclDebugLock);
if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
const char* nccl_debug = ncclGetEnv("NCCL_DEBUG");
@@ -83,6 +87,8 @@ void ncclDebugInit() {
mask = NCCL_BOOTSTRAP;
} else if (strcasecmp(subsys, "REG") == 0) {
mask = NCCL_REG;
} else if (strcasecmp(subsys, "PROFILE") == 0) {
mask = NCCL_PROFILE;
} else if (strcasecmp(subsys, "ALL") == 0) {
mask = NCCL_ALL;
}
@@ -94,6 +100,15 @@ void ncclDebugInit() {
free(ncclDebugSubsys);
}
const char* ncclWarnSetDebugInfoEnv = ncclGetEnv("NCCL_WARN_ENABLE_DEBUG_INFO");
if (ncclWarnSetDebugInfoEnv != NULL && strlen(ncclWarnSetDebugInfoEnv) > 0) {
int64_t value;
errno = 0;
value = strtoll(ncclWarnSetDebugInfoEnv, NULL, 0);
if (!errno)
ncclWarnSetDebugInfo = value;
}
// Cache pid and hostname
getHostName(hostname, 1024, '.');
pid = getpid();
@@ -143,8 +158,6 @@ void ncclDebugInit() {
pthread_mutex_unlock(&ncclDebugLock);
}
NCCL_PARAM(WarnSetDebugInfo, "WARN_ENABLE_DEBUG_INFO", 0);
/* Common logging function used by the INFO, WARN and TRACE macros
* Also exported to the dynamically loadable Net transport modules so
* they can share the debugging mechanisms and output files
@@ -178,7 +191,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
if (level == NCCL_LOG_WARN) {
len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ",
hostname, pid, tid, cudaDev, filefunc, line);
if (ncclParamWarnSetDebugInfo()) ncclDebugLevel = NCCL_LOG_INFO;
if (ncclWarnSetDebugInfo) ncclDebugLevel = NCCL_LOG_INFO;
} else if (level == NCCL_LOG_INFO) {
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
} else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) {
@@ -190,17 +203,15 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
hostname, pid, tid, cudaDev, timestamp, filefunc, line);
}
if (len) {
va_list vargs;
va_start(vargs, fmt);
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
va_end(vargs);
// vsnprintf may return len > sizeof(buffer) in the case of a truncated output.
// Rewind len so that we can replace the final \0 by \n
if (len > sizeof(buffer)) len = sizeof(buffer)-1;
buffer[len++] = '\n';
fwrite(buffer, 1, len, ncclDebugFile);
}
va_list vargs;
va_start(vargs, fmt);
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
va_end(vargs);
// vsnprintf may return len > sizeof(buffer) in the case of a truncated output.
// Rewind len so that we can replace the final \0 by \n
if (len > sizeof(buffer)) len = sizeof(buffer)-1;
buffer[len++] = '\n';
if (len) fwrite(buffer, 1, len, ncclDebugFile);
}
NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
+55 -62
Zobrazit soubor
@@ -10,30 +10,26 @@
namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = (int)args->nWarps * WARP_SIZE;
__device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
ncclRing *ring = &ncclShmem.channel.ring;
const int *ringRanks = ring->userRanks;
const int nranks = ncclShmem.comm.nRanks;
const size_t chunkCount = args->chunkCount;
const size_t channelCount = args->workCount;
const size_t gridOffset = args->workOffset;
const size_t count = args->count;
size_t count, partOffset, partCount, chunkCount;
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount);
size_t offset;
size_t dataOffset;
int nelem;
int rankDest;
T *inputBuf = (T*)args->sendbuff;
T *outputBuf = (T*)args->recvbuff;
T *inputBuf = (T*)work->sendbuff;
T *outputBuf = (T*)work->recvbuff;
Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg);
(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg);
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) {
/////////////// begin AllGather steps ///////////////
nelem = min(chunkCount, channelCount - elemOffset);
dataOffset = gridOffset + elemOffset;
nelem = min(chunkCount, partCount - elemOffset);
dataOffset = partOffset + elemOffset;
// step 0: push data to next GPU
rankDest = ringRanks[0];
@@ -64,52 +60,50 @@ namespace {
}
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
using Proto = ProtoSimple<ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
runRing<T, RedOp, Proto>(tid, nthreads, work);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL>(args);
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL128>(args);
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
const int tid = threadIdx.x;
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const ssize_t count = args->count;
const ssize_t rank = ncclShmem.comm.rank;
const size_t chunkCount = args->chunkCount;
size_t gridOffset = args->workOffset;
size_t channelCount = args->workCount;
size_t count, gridOffset, channelCount;
size_t chunkCount;
ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
size_t offset;
int nelem;
const int nThreadsBcast = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE;
const int nThreadsGather = args->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast;
const int nThreadsBcast = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE;
const int nThreadsGather = work->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast;
const int tidEndGather = nThreadsGather;
const int tidEndBcast = tidEndGather + nThreadsBcast;
if (!args->regUsed) {
if (!work->regUsed) {
if (tid < tidEndGather) {
// Gather
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
prims(tid, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff,
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
@@ -119,8 +113,8 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
// Bcast through NVLS
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL,
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, work->sendbuff, NULL,
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
@@ -133,7 +127,7 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsGather, nvls->up, nvls->up, NULL, NULL,
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
/* used as sync */
prims.scatter(0, 0, 0, 0, -1, 0);
@@ -144,8 +138,8 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
} else if (tid < tidEndBcast) {
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, args->sendbuff, NULL,
args->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, args);
prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, work->sendbuff, NULL,
work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, work);
/* used as sync */
prims.recv(0, 0);
@@ -161,10 +155,10 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
template<bool BcastSendNotRecv>
struct Scatterer {
struct ncclWorkElem* args;
struct ncclDevWorkColl* work;
ssize_t chunkSize;
ssize_t railGridOffset;
@@ -179,13 +173,13 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
int nNodes = ncclShmem.comm.nNodes;
int nRails = direct->nHeads;
int bid = args->bid;
char* inbuf = (char*)args->sendbuff;
char* outbuf = (char*)args->recvbuff;
ssize_t sizePerRank = args->count*sizeof(T);
int part = ncclShmem.channelId - work->channelLo;
char* inbuf = (char*)work->sendbuff;
char* outbuf = (char*)work->recvbuff;
ssize_t sizePerRank = work->collnet.count*sizeof(T);
bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank*sizePerRank);
ssize_t railAllBeg = min(railGridOffset + bid*chunkSize, nNodes*sizePerRank);
ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank);
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
int railAllSize = railAllEnd - railAllBeg;
if (tid < nDsts) dstSizes[tid] = railAllSize;
@@ -232,28 +226,27 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
}
};
__device__ __forceinline__ void run(ncclWorkElem *args) {
int tid = threadIdx.x;
const int nChannels = args->nChannels;
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
const int part = ncclShmem.channelId - work->channelLo;
const int nChannels = work->channelHi - work->channelLo + 1;
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
int const &nNodes = ncclShmem.comm.nNodes;
ssize_t chunkSize = int(args->chunkCount);
ssize_t const &sizePerRank = args->count;
ssize_t sizePerRank = work->collnet.count*sizeof(T);
size_t chunkSize = work->collnet.chunkCount;
bool isMultiRail = (direct->nHeads > 1);
int nWarps1 = 1;
int nWarps2 = (isMultiRail ? 2 : 1);
int nWarps3 = (isMultiRail ? 2 : 0);
float denom = float(args->nWarps)/float(nWarps1+nWarps2+nWarps3);
float denom = float(work->nWarps)/float(nWarps1+nWarps2+nWarps3);
nWarps3 = int(denom*nWarps3);
nWarps2 = int(denom*nWarps2);
nWarps1 = args->nWarps - (nWarps2+nWarps3);
nWarps1 = work->nWarps - (nWarps2+nWarps3);
using Proto = ProtoSimple<1, 1>;
int tn = nWarps1*WARP_SIZE;
if (tid < tn) {
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (tid == 0) {
int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
@@ -262,10 +255,10 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
} else {
// Phase 1: send to network
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid, tn, nullptr, &direct->out, args->sendbuff, nullptr,
prims(tid, tn, nullptr, &direct->out, work->sendbuff, nullptr,
/*redOpArg=*/0, 0 * Proto::MaxGroupWidth, 1, 1);
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
ssize_t railAllBeg = railGridOffset + args->bid * chunkSize;
ssize_t railAllBeg = railGridOffset + part * chunkSize;
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank);
ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank;
ssize_t railOneEnd = railOneBeg + sizePerRank;
@@ -280,7 +273,7 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
tn = nWarps2*WARP_SIZE;
if (tid < tn) {
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (tid == 0) {
int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
@@ -293,10 +286,10 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
/*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0);
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
Scatterer</*BcastSendNotRecv=*/true> scat;
scat.args = args;
scat.work = work;
scat.chunkSize = chunkSize;
scat.railGridOffset = railGridOffset;
prims.process</*Recv=*/1, /*Send=*/1>(scat);
prims.template process</*Recv=*/1, /*Send=*/1>(scat);
}
}
return;
@@ -311,10 +304,10 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
/*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0);
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
Scatterer</*BcastSendNotRecv=*/false> scat;
scat.args = args;
scat.work = work;
scat.chunkSize = chunkSize;
scat.railGridOffset = railGridOffset;
prims.process</*Recv=*/1, /*Send=*/0>(scat);
prims.template process</*Recv=*/1, /*Send=*/0>(scat);
}
return;
}
+143 -147
Zobrazit soubor
@@ -10,28 +10,27 @@
namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = (int)args->nWarps * WARP_SIZE;
__device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
ncclRing *ring = &ncclShmem.channel.ring;
int ringIx = ring->index;
ssize_t chunkCount = args->chunkCount;
const int nranks = ncclShmem.comm.nRanks;
ssize_t gridOffset;
ssize_t channelCount;
ssize_t chunkCount;
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
const ssize_t loopCount = nranks * chunkCount;
ssize_t offset;
ssize_t gridOffset = args->workOffset;
ssize_t channelCount = args->workCount;
int nelem;
int chunk;
Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);
(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
ssize_t remCount = channelCount - elemOffset;
ssize_t chunkOffset;
if (remCount < loopCount) chunkCount = args->lastChunkCount;
if (remCount < loopCount) chunkCount = alignUp(divUp(remCount, nranks), 16/sizeof(T));
auto modRanks = [&]__device__(int r)->int {
return r - (r >= nranks ? nranks : 0);
@@ -75,24 +74,24 @@ namespace {
chunkOffset = chunk * chunkCount;
offset = gridOffset + elemOffset + chunkOffset;
nelem = (int)min(chunkCount, remCount - chunkOffset);
prims.directRecv(offset, nelem);
}
}
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runTreeUpDown(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = (int)args->nWarps * WARP_SIZE;
__device__ __forceinline__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
ncclTree *tree = &ncclShmem.channel.tree;
const size_t channelCount = args->workCount;
const size_t gridOffset = args->workOffset;
const size_t chunkCount = args->chunkCount;
size_t gridOffset;
size_t channelCount;
size_t chunkCount;
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
size_t offset;
int nelem;
{ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_TREE_ARITY, 1>, /*Direct=*/0, Proto, 0> prims
(tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg);
(tid, nthreads, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg);
if (tree->up == -1) {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
@@ -118,7 +117,7 @@ namespace {
{ // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_TREE_ARITY>, /*Direct=*/1, Proto, 0> prims
(tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
(tid, nthreads, &tree->up, tree->down, work->sendbuff, work->recvbuff, work->redOpArg);
if (tree->up == -1) {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
@@ -144,16 +143,14 @@ namespace {
}
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runTreeSplit(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = (int)args->nWarps * WARP_SIZE;
__device__ __forceinline__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
ncclTree *tree = &ncclShmem.channel.tree;
const size_t chunkCount = args->chunkCount;
const size_t gridOffset = args->workOffset;
const size_t channelCount = args->workCount;
size_t gridOffset;
size_t channelCount;
size_t chunkCount;
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
size_t offset;
int nelem;
int nthreadsSplit;
if (Proto::Id == NCCL_PROTO_SIMPLE) {
nthreadsSplit = nthreads/2;
@@ -167,7 +164,7 @@ namespace {
if (tree->up == -1) {
// Reduce and broadcast. Max number of recv is 2, max number of send is 2
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_TREE_ARITY_TOP>, /*Direct=*/1, Proto, 0>
prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
prims(tid, nthreads, tree->down, tree->down, work->sendbuff, work->recvbuff, work->redOpArg);
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
@@ -184,7 +181,7 @@ namespace {
* but the ctor above for tree roots would be DirectRecv=0 DirectSend=1.
*/
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_TREE_ARITY, 1>, /*Direct=*/1, Proto, 0>
prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth);
prims(tid, nthreadsSplit, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth);
if (tree->down[0] == -1) {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
@@ -203,8 +200,8 @@ namespace {
else {
// Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local)
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_TREE_ARITY>, /*Direct=*/1, Proto, 0>
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth);
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, work->sendbuff, work->recvbuff,
work->redOpArg, 1*Proto::MaxGroupWidth);
if (tree->down[0] == -1) {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
@@ -224,34 +221,33 @@ namespace {
}
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
runRing<T, RedOp, Proto>(tid, nthreads, work);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
#if CUDART_VERSION >= 11020 && CUDART_VERSION < 11040 && __CUDA_ARCH__ >= 800
runTreeUpDown<T, RedOp, ProtoSimple<1, 1>>(args);
runTreeUpDown<T, RedOp, ProtoSimple<1, 1>>(tid, nthreads, work);
#else
runTreeSplit<T, RedOp, ProtoSimple<1, 1>>(args);
runTreeSplit<T, RedOp, ProtoSimple<1, 1>>(tid, nthreads, work);
#endif
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
static constexpr int COLLNET_COPY_THREADS = 96;
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
const int bid = ncclShmem.channelId - work->channelLo;
const int nChannels = work->channelHi - work->channelLo + 1;
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
const ssize_t chunkSize = args->chunkCount;
const ssize_t size = args->count;
const ssize_t chunkSize = work->collnet.chunkCount;
const ssize_t size = work->collnet.count;
const ssize_t loopSize = nChannels*direct->nHeads*chunkSize;
const int hasUp = (direct->up[0] >= 0) ? 1 : 0;
@@ -259,7 +255,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
const int nThreadsScatter = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 3*COLLNET_COPY_THREADS : 0);
const int nThreadsGather = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0);
const int nThreadsBcast = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 0 : 2*COLLNET_COPY_THREADS);
const int nThreadsReduce = args->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
const int nThreadsReduce = work->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
const int tidStartBcast = nThreadsGather;
const int tidStartScatter = tidStartBcast + nThreadsBcast;
const int tidStartReduce = tidStartScatter + nThreadsScatter;
@@ -269,12 +265,12 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) {
// Scatter
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff,
args->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, args);
prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, work->sendbuff, work->recvbuff,
work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, work);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
int nelem = min(direct->nHeads*chunkSize, size-offset);
if (args->regUsed) {
if (work->regUsed) {
prims.directScatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
} else {
prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
@@ -284,12 +280,12 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
if (hasDn) {
// Reduce, send to network
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0>
prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, args);
prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, work->sendbuff, work->recvbuff,
work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, work);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
if (args->regUsed) {
if (work->regUsed) {
prims.directRecvReduceSend(offset, nelem);
} else {
prims.recvReduceSend(offset, nelem);
@@ -297,7 +293,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
}
} else {
// Directly send to network
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (tid == tidStartReduce) {
int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
@@ -305,8 +301,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
__syncwarp();
} else {
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, work->sendbuff, work->recvbuff,
work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
@@ -317,8 +313,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
} else if (tid < tidStartBcast && hasUp) {
// Gather
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto, 0>
prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff,
args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, args);
prims(tid, nThreadsGather, direct->up, NULL, work->sendbuff, work->recvbuff,
work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, work);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
int nelem = min(direct->nHeads*chunkSize, size-offset);
@@ -328,15 +324,15 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
if (hasDn) {
// Recv from network, broadcast
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, args);
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, work->sendbuff, work->recvbuff,
work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true);
}
} else {
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (tid == tidStartBcast) {
int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
@@ -345,8 +341,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
} else {
// Recv from network (no post thread needed)
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid - tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff,
args->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0);
prims(tid - tidStartBcast, nThreadsBcast, &direct->out, nullptr, work->sendbuff, work->recvbuff,
work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize;
int nelem = min(chunkSize, size - offset);
@@ -359,18 +355,16 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
const int tid = threadIdx.x;
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
ssize_t chunkSize = args->chunkCount;
const bool hasOut = nvls->out != -1;
const int nranks = ncclShmem.comm.nRanks;
const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
const int bcastWarps = hasOut ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0;
const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5);
const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
const int bcastWarps = hasOut ? (work->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0;
const int reduceWarps = work->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5);
const int scatterWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
const int gatherWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
const int nThreadsScatter = scatterWarps*WARP_SIZE;
const int nThreadsGather = gatherWarps*WARP_SIZE;
@@ -381,35 +375,37 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
const int tidEndReduce = tidEndGather + nThreadsReduce;
const int tidEndBcast = tidEndReduce + nThreadsBcast;
if (args->oneNode) {
if (work->oneNode) {
ssize_t gridOffset, channelCount, chunkSize;
ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkSize);
const ssize_t loopCount = nvls->nHeads * chunkSize;
const ssize_t channelCount = args->workCount;
const ssize_t gridOffset = args->workOffset;
ssize_t offset;
int nelem;
int remCount = channelCount%(nvls->nHeads*chunkSize);
int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16/sizeof(T));
if (tid < tidEndScatter) {
// Scatter
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL,
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize;
offset = gridOffset + elemOffset;
nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndGather) {
// Gather
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff,
work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize;
offset = gridOffset + elemOffset;
nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndReduce) {
@@ -417,10 +413,10 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
ssize_t chunkOffset;
if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize;
chunkOffset = elemOffset + nvls->headRank * chunkSize;
offset = gridOffset + chunkOffset;
nelem = min(chunkSize, channelCount - chunkOffset);
@@ -428,30 +424,32 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
}
}
} else {
const int bid = args->bid;
const ssize_t loopSize = args->nChannels * nvls->nHeads * chunkSize;
const ssize_t size = args->count;
const int bid = ncclShmem.channelId - work->channelLo;
const int nChannels = work->channelHi - work->channelLo + 1;
const ssize_t chunkSize = work->collnet.chunkCount;
const ssize_t loopSize = nChannels * nvls->nHeads * chunkSize;
const ssize_t size = work->collnet.count;
if (tid < tidEndScatter) {
// Scatter
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL,
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
int nelem = work->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndGather) {
// Gather
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff,
work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
int nelem = args->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset);
int nelem = work->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset);
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndReduce && nvls->headRank != -1) {
@@ -460,7 +458,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
int nelem = min(chunkSize, size - offset);
@@ -471,7 +469,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, args);
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
int nelem = min(chunkSize, size - offset);
@@ -483,7 +481,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
int nelem = min(chunkSize, size - offset);
@@ -495,25 +493,25 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
const int tid = threadIdx.x;
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const int treeUp = nvls->treeUp;
const int* treeDown = nvls->treeDown;
ssize_t chunkCount = args->chunkCount;
ssize_t gridOffset, channelCount, chunkCount;
ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
const ssize_t loopCount = nvls->nHeads * chunkCount;
const ssize_t channelCount = args->workCount;
const ssize_t gridOffset = args->workOffset;
const int nranks = ncclShmem.comm.nRanks;
const bool hasUp = treeUp != -1;
const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
const int bcastWarps = hasUp ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 4) : 0;
const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5);
const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
const int bcastWarps = hasUp ? (work->regUsed ? ((totalWarps - 2) >> 1) - 1 : 4) : 0;
const int reduceWarps = work->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5);
const int scatterWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
const int gatherWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
ssize_t offset;
int nelem;
int remCount = channelCount%(nvls->nHeads*chunkCount);
int lastChunkCount = alignUp(divUp(remCount, nvls->nHeads), 16/sizeof(T));
const int nThreadsScatter = scatterWarps*WARP_SIZE;
const int nThreadsGather = gatherWarps*WARP_SIZE;
@@ -528,24 +526,24 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
// Scatter
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL,
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount;
offset = gridOffset + elemOffset;
nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
prims.scatter(offset, nelem, chunkCount, chunkCount, -1, 0);
}
} else if (tid < tidEndGather) {
// Gather
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff,
work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount;
offset = gridOffset + elemOffset;
nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
prims.gather(offset, nelem, chunkCount, chunkCount, -1, 0);
}
} else if (tid < tidEndReduce && nvls->headRank != -1) {
@@ -554,10 +552,10 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
Primitives<T, RedOp, FanSymmetric<3>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL,
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
ssize_t chunkOffset;
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount;
chunkOffset = elemOffset + nvls->headRank * chunkCount;
offset = gridOffset + chunkOffset;
nelem = min(chunkCount, channelCount - chunkOffset);
@@ -568,10 +566,10 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
ssize_t chunkOffset;
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount;
chunkOffset = elemOffset + nvls->headRank * chunkCount;
offset = gridOffset + chunkOffset;
nelem = min(chunkCount, channelCount - chunkOffset);
@@ -583,10 +581,10 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
ssize_t chunkOffset;
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount;
chunkOffset = elemOffset + nvls->headRank * chunkCount;
offset = gridOffset + chunkOffset;
nelem = min(chunkCount, channelCount - chunkOffset);
@@ -597,17 +595,15 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
const int bid = ncclShmem.channelId - work->channelLo;
const int nChannels = work->channelHi - work->channelLo + 1;
ncclTree *tree = &ncclShmem.channel.collnetChain;
ssize_t chunkSize = args->chunkCount;
ssize_t chunkSize = work->collnet.chunkCount;
const ssize_t loopSize = int(nChannels*chunkSize);
const int nranks = ncclShmem.comm.nRanks;
const ssize_t size = args->count;
const ssize_t size = work->collnet.count;
int nthreadsSplit = nthreads/2;
if (nthreadsSplit >= 256) nthreadsSplit += 64;
@@ -634,7 +630,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
if (tid < nthreadsSplit) {
if (recv == -1) {
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (groupTid == 0) {
int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::sendPeerNotify(send, connIndex, steps);
@@ -642,8 +638,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
__syncwarp();
} else {
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * int(chunkSize);
int nelem = min(chunkSize, size - offset);
@@ -652,8 +648,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
}
} else {
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * int(chunkSize);
int nelem = min(chunkSize, size - offset);
@@ -665,7 +661,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
if (recv == nranks) {
// I'm the first in the broadcast chain, I need to perform the division (postOp)
if (send == -1) {
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (groupTid == 0) {
int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::recvPeerNotify(recv, connIndex, steps);
@@ -673,8 +669,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
__syncwarp();
} else {
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * int(chunkSize);
int nelem = min(chunkSize, size - offset);
@@ -683,8 +679,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
}
} else {
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * int(chunkSize);
int nelem = min(chunkSize, size - offset);
@@ -693,8 +689,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
}
} else {
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
if (send == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
@@ -714,29 +710,29 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL>(args);
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runTreeSplit<T, RedOp, ProtoLL>(args);
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
runTreeSplit<T, RedOp, ProtoLL>(tid, nthreads, work);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL128>(args);
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runTreeSplit<T, RedOp, ProtoLL128>(args);
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
runTreeSplit<T, RedOp, ProtoLL128>(tid, nthreads, work);
}
};
+18 -19
Zobrazit soubor
@@ -10,23 +10,22 @@
namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = (int)args->nWarps * WARP_SIZE;
__device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
ncclRing *ring = &ncclShmem.channel.ring;
const int rank = ring->userRanks[0];
const int nextRank = ring->userRanks[1];
const int root = args->root;
const size_t chunkCount = args->chunkCount;
const size_t channelCount = args->workCount;
const size_t gridOffset = args->workOffset;
const int root = work->root;
size_t chunkCount;
size_t channelCount;
size_t gridOffset;
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
size_t offset;
int nelem;
T *inputBuf = (T*)args->sendbuff;
T *outputBuf = (T*)args->recvbuff;
T *inputBuf = (T*)work->sendbuff;
T *outputBuf = (T*)work->recvbuff;
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg);
prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg);
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
@@ -48,23 +47,23 @@ namespace {
}
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
struct RunWorkColl<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
using Proto = ProtoSimple<BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
runRing<T, RedOp, Proto>(tid, nthreads, work);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL>(args);
struct RunWorkColl<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL128>(args);
struct RunWorkColl<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
}
};
+3 -3
Zobrazit soubor
@@ -14,11 +14,11 @@ __shared__ ncclShmemData ncclShmem;
#endif
struct RunWorkNop {
__device__ void run(ncclWork *w) {}
__device__ void run() {}
};
__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) {
ncclKernelMain<-1, RunWorkNop>(comm, channelMask, workHead);
__global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
ncclKernelMain<-1, RunWorkNop>(&args4K.args);
}
__device__ void ncclDevFunc_Nop() {}
+288 -118
Zobrazit soubor
@@ -10,10 +10,19 @@
#include "collectives.h"
#include "device.h"
#include "op128.h"
#include "reduce_kernel.h"
#include "network/unpack/unpack_defs.h"
#define COLL_UNROLL (ncclCollUnroll())
#if __CUDA_ARCH__ >= 700
// __grid_constant__ appears to break cuda-gdb
//#define NCCL_GRID_CONSTANT __grid_constant__
#define NCCL_GRID_CONSTANT
#else
#define NCCL_GRID_CONSTANT
#endif
typedef void(*ncclDevFuncPtr_t)();
extern __device__ ncclDevFuncPtr_t const ncclDevFuncTable[];
@@ -31,18 +40,28 @@ struct ncclShmemGroup {
};
struct ncclShmemData {
struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
uint64_t redOpArgs[NCCL_MAX_ARITY+1];
struct ncclDevKernelArgs args;
int channelId;
int aborted;
alignas(16) struct ncclDevComm comm;
alignas(16) struct ncclDevChannel channel;
alignas(16) struct ncclWork work;
int batchIx, nextBatchIx;
enum ncclDevWorkType workType;
uint8_t directMode;
uint16_t funcId;
int nWorks;
int workSize;
uint32_t workConsumed;
struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1];
alignas(16) char workStorage[1024];
alignas(16) union {
unpackShmem unpack;
} devicePlugin;
};
static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");
extern __shared__ ncclShmemData ncclShmem;
#if __CUDA_ARCH__ >= 700
@@ -55,14 +74,62 @@ __device__ inline void* ncclScratchForWarp(int warp) {
return (char*)ncclShmemPerWarp + warp*ncclShmemScratchWarpSize();
}
__device__ inline bool barrierReduceAny(int bit) {
uint32_t popc;
asm ("{"
".reg .pred barr_pred;"
"setp.eq.u32 barr_pred, %1, 1;"
"bar.red.popc.u32 %0, 2, barr_pred;"
"}" : "=r"(popc) : "r"(bit));
return popc != 0;
// Synchronize on the named barrier `name` (no thread count operand; per the
// PTX ISA this means every thread of the CTA participates).
// The non-.aligned form is kept under `#if 0`; as compiled, this issues the
// same barrier.sync.aligned instruction as barrier_sync_aligned(int).
// The "memory" clobber keeps the compiler from moving memory accesses across
// the barrier.
__device__ inline void barrier_sync(int name) {
#if 0
asm volatile("barrier.sync %0;" :: "r"(name) : "memory");
#else
asm volatile("barrier.sync.aligned %0;" :: "r"(name) : "memory");
#endif
}
// Synchronize `nThreads` threads on the named barrier `name`.
// The non-.aligned form is kept under `#if 0`; as compiled, this issues the
// same barrier.sync.aligned instruction as barrier_sync_aligned(int, int).
// NOTE(review): the PTX ISA requires the thread count to be a multiple of the
// warp size — confirm all callers satisfy this.
__device__ inline void barrier_sync(int name, int nThreads) {
#if 0
asm volatile("barrier.sync %0, %1;" :: "r"(name), "r"(nThreads) : "memory");
#else
asm volatile("barrier.sync.aligned %0, %1;" :: "r"(name), "r"(nThreads) : "memory");
#endif
}
// Synchronize on the named barrier `name` using the .aligned form of the
// instruction (no thread count operand).
// NOTE(review): per the PTX ISA, .aligned requires all threads of a warp to
// execute the same barrier instruction — callers must reach this convergently.
__device__ inline void barrier_sync_aligned(int name) {
asm volatile("barrier.sync.aligned %0;" :: "r"(name) : "memory");
}
// Synchronize `nThreads` threads on the named barrier `name` using the
// .aligned form of the instruction.
// NOTE(review): per the PTX ISA, .aligned requires all threads of a warp to
// execute the same barrier instruction — callers must reach this convergently.
__device__ inline void barrier_sync_aligned(int name, int nThreads) {
asm volatile("barrier.sync.aligned %0, %1;" :: "r"(name), "r"(nThreads) : "memory");
}
// Barrier on `name` combined with an OR-reduction of `vote`.
// Returns true if any thread arriving at the barrier passed a true `vote`.
// No thread count operand is given to the instruction.
//
// The asm is marked volatile because it is a synchronization point, not just a
// value computation: without it the compiler is allowed to drop the statement
// when the result is unused or to merge identical statements, which would
// remove a barrier other threads are waiting on.
__device__ inline bool barrier_red_or(bool vote, int name) {
  int ans;
  asm volatile("{ .reg .pred p;"
      "  setp.ne.s32 p, %1, 0;"
      "  barrier.red.or.pred p, %2, p; "
      "  selp.s32 %0, 1, 0, p; }"
      : "=r"(ans) : "r"((int)vote), "r"(name) : "memory");
  return bool(ans);
}
// Barrier on `name` for `nThreads` threads combined with an OR-reduction of
// `vote`. Returns true if any participating thread passed a true `vote`.
//
// The asm is marked volatile because it is a synchronization point, not just a
// value computation: without it the compiler is allowed to drop the statement
// when the result is unused or to merge identical statements, which would
// remove a barrier other threads are waiting on.
__device__ inline bool barrier_red_or(bool vote, int name, int nThreads) {
  int ans;
  asm volatile("{ .reg .pred p;"
      "  setp.ne.s32 p, %1, 0;"
      "  barrier.red.or.pred p, %2, %3, p; "
      "  selp.s32 %0, 1, 0, p; }"
      : "=r"(ans) : "r"((int)vote), "r"(name), "r"(nThreads) : "memory");
  return bool(ans);
}
// Barrier on `name` combined with an OR-reduction of `vote`, using the
// .aligned form of the instruction. Returns true if any thread arriving at the
// barrier passed a true `vote`.
// NOTE(review): per the PTX ISA, .aligned requires all threads of a warp to
// execute the same barrier instruction — callers must reach this convergently.
//
// The asm is marked volatile because it is a synchronization point, not just a
// value computation: without it the compiler is allowed to drop the statement
// when the result is unused or to merge identical statements, which would
// remove a barrier other threads are waiting on.
__device__ inline bool barrier_red_or_aligned(bool vote, int name) {
  int ans;
  asm volatile("{ .reg .pred p;"
      "  setp.ne.s32 p, %1, 0;"
      "  barrier.red.or.pred.aligned p, %2, p; "
      "  selp.s32 %0, 1, 0, p; }"
      : "=r"(ans) : "r"((int)vote), "r"(name) : "memory");
  return bool(ans);
}
// Barrier on `name` for `nThreads` threads combined with an OR-reduction of
// `vote`, using the .aligned form of the instruction. Returns true if any
// participating thread passed a true `vote`.
// NOTE(review): per the PTX ISA, .aligned requires all threads of a warp to
// execute the same barrier instruction — callers must reach this convergently.
//
// The asm is marked volatile because it is a synchronization point, not just a
// value computation: without it the compiler is allowed to drop the statement
// when the result is unused or to merge identical statements, which would
// remove a barrier other threads are waiting on.
__device__ inline bool barrier_red_or_aligned(bool vote, int name, int nThreads) {
  int ans;
  asm volatile("{ .reg .pred p;"
      "  setp.ne.s32 p, %1, 0;"
      "  barrier.red.or.pred.aligned p, %2, %3, p; "
      "  selp.s32 %0, 1, 0, p; }"
      : "=r"(ans) : "r"((int)vote), "r"(name), "r"(nThreads) : "memory");
  return bool(ans);
}
// Copy 16-byte aligned data. You must call with at least `(bytes+15)/16` threads.
@@ -71,158 +138,261 @@ inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int by
if (offset < bytes) {
uint64_t a=0, b=0;
asm("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset));
asm volatile("st.v2.u64 [%0],{%1,%2};" :: "l"((char*)dst + offset), "l"(a), "l"(b));
uint32_t udst = (uint32_t)__cvta_generic_to_shared(dst);
asm volatile("st.shared.v2.u64 [%0],{%1,%2};" :: "r"(udst + offset), "l"(a), "l"(b));
}
}
// Copy the work structs of one batch chain into ncclShmem.workStorage.
//
// Starting at batch index `batchIx`, each ncclDevWorkBatch is read from the
// array located directly after `*args`. Its offsetBitset selects which work
// structs to load, and those are packed contiguously into shared memory. While
// a batch has `nextExtends` set, the batch at `batchIx + nextJump` is appended
// to the same shared-memory region.
//
//   tid     - index of the calling thread among the `tn` loader threads
//   tn      - number of threads participating in the load
//   args    - kernel arguments (also the source of the work structs when
//             workStorageType is ncclDevWorkStorageTypeArgs)
//   batchIx - index of the first batch to load
//
// Results are published through ncclShmem: workStorage, workSize, workConsumed,
// batchIx, nextBatchIx, workType, nWorks and funcId. Only warp-level syncs are
// issued here, so the caller must synchronize the block before reading them.
//
// Must run with at least 64 threads
__device__ __forceinline__ void loadWorkBatchToShmem(
int tid, int tn, struct ncclDevKernelArgs const* args, int batchIx
) {
int lane = tid%WARP_SIZE;
int workCursor = 0; // num works written in previous loop iterations.
while (true) {
// The batch descriptors are laid out immediately after the kernel args struct.
struct ncclDevWorkBatch batch = ((struct ncclDevWorkBatch*)(args+1))[batchIx];
// fnsOfBitset[n] = index of n'th set bit in batch.offsetBitset.
// PTX has instruction "fns" (find n-th set) but it expands to a lot of SASS,
// since we know all lanes will be querying the same bitmask we can compute
// much faster using shared memory.
uint8_t* fnsOfBitset = (uint8_t*)ncclScratchForWarp(threadIdx.x/WARP_SIZE);
// NOTE(review): presumably protects the warp scratch from being overwritten
// while lanes still read the previous iteration's table — confirm.
__syncwarp();
// Each lane handles bit `lane` of the low word and bit `32+lane` of the high
// word; its slot is the count of set bits below it.
if (uint32_t(batch.offsetBitset) & (1u<<lane)) {
int nWorksBelow = __popc(uint32_t(batch.offsetBitset) & ((1u<<lane)-1));
fnsOfBitset[nWorksBelow] = lane;
}
int nWorksLow32 = __popc(uint32_t(batch.offsetBitset)); // just of low 32 bits
if (uint32_t(batch.offsetBitset>>32) & (1u<<lane)) {
int nWorksBelow = nWorksLow32;
nWorksBelow += __popc(uint32_t(batch.offsetBitset>>32) & ((1u<<lane)-1));
fnsOfBitset[nWorksBelow] = 32 + lane;
}
int nWorks = nWorksLow32 + __popc(uint32_t(batch.offsetBitset>>32)); // add high 32 bits
// Make the fnsOfBitset table written above visible to the warp before it is read.
__syncwarp();
int workSize;
int nPacks; // total number of packs loaded, each pack is 16 bytes
int packInWork; // my pack index within work struct
int dstWork; // my work index in contiguous destination shmem
switch (batch.workType) {
case (int)ncclDevWorkTypeP2p:
workSize = sizeof(struct ncclDevWorkP2p);
nPacks = nWorks*(workSize/16);
packInWork = tid%(workSize/16);
dstWork = tid/(workSize/16);
break;
case (int)ncclDevWorkTypeColl:
workSize = sizeof(struct ncclDevWorkColl);
nPacks = nWorks*(workSize/16);
packInWork = tid%(workSize/16);
dstWork = tid/(workSize/16);
break;
// Any unrecognized work type is treated as CollReg.
case (int)ncclDevWorkTypeCollReg:
default:
workSize = sizeof(struct ncclDevWorkCollReg);
nPacks = nWorks*(workSize/16);
packInWork = tid%(workSize/16);
dstWork = tid/(workSize/16);
break;
}
if (tid == 0) {
ncclShmem.workSize = workSize;
// Byte offset just past the highest-indexed work struct named by this batch:
// (64 - clz) is one more than the index of the top set bit.
ncclShmem.workConsumed = batch.offsetBase + (64-__clzll(batch.offsetBitset))*workSize;
}
// We deliberately replicate these div and mod calculations into the case
// blocks above so that they get constant divisor optimizations by the compiler.
// packInWork = tid%(workSize/16);
// dstWork = tid/(workSize/16);
// We can only assume we have 64 threads, which means we can read at most 1024 bytes
// here which is the per batch maximum.
// One thread moves one 16-byte pack.
if (tid < nPacks) {
int srcWork = fnsOfBitset[dstWork]; // find n'th set bit in batch.offsetBitset
ulong2 tmp;
// The loads done in these two cases must be kept separate since we are
// relying on the compiler to use "ld.param" in the first one. The parameter
// space is not generically addressable, so any attempt to load through
// a pointer that *might* be parameter space backed will cause the
// compiler to spill the parameter struct (4K!) to each thread's local space
// before creating a pointer (to the spill) and decimate perf.
//
// An example of what not to do would be the following:
//
// if (condition) {
// // The compiler could spill parameter_variable to local space and take
// // the address of that, since when src is loaded below it could also
// // be global space.
// src = &parameter_variable;
// } else {
// src = &global_variable;
// }
// memcpy(dst, src, n);
if (ncclShmem.args.workStorageType == ncclDevWorkStorageTypeArgs) {
char* src = (char*)args + (batch.offsetBase + srcWork*workSize + packInWork*16);
tmp = *(ulong2*)src; // becomes ld.param.v2.u64
} else {
// The byte offset is wrapped with workMask before indexing workBuf.
char* src = (char*)ncclShmem.args.workBuf + ((batch.offsetBase + srcWork*workSize + packInWork*16) & ncclShmem.args.workMask);
tmp = *(ulong2*)src; // becomes ld.v2.u64
}
// Destination is packed: works from earlier batches in the chain come first.
char* dst = ncclShmem.workStorage;
dst += (workCursor + dstWork)*workSize + packInWork*16;
*(ulong2*)dst = tmp;
}
workCursor += nWorks;
if (batch.nextExtends) {
batchIx += batch.nextJump;
tid -= 64; // Rotate threads so we use the next two warps for next batch struct.
if (tid < 0) tid += tn;
} else {
// End of the chain: the thread whose (rotated) tid is 0 records the batch metadata.
if (tid == 0) {
ncclShmem.batchIx = batchIx;
// A nextJump of 0 marks the final batch; -1 tells the caller to stop.
ncclShmem.nextBatchIx = (batch.nextJump == 0) ? -1 : batchIx + batch.nextJump;
ncclShmem.workType = (enum ncclDevWorkType)batch.workType;
ncclShmem.nWorks = workCursor;
ncclShmem.funcId = batch.funcId;
}
break;
}
}
}
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
struct RunWorkElement {
__device__ void run(ncclWorkElem*) {
struct RunWorkColl {
__device__ void run(int tid, int tn, struct ncclDevWorkColl* work) {
// Put NOT IMPLEMENTED behavior here.
}
};
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
struct RunWork {
struct RunWorkBatch;
// Specialized for P2p in sendrecv.h
template<typename T, typename RedOp>
struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE>;
// Specialized here for non-P2p (Coll and CollReg)
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
struct RunWorkBatch {
// This __forceinline__ is necessary. The compiler was inserting a function call
// here from the LL ncclKernel.
__device__ __forceinline__ void run(ncclWork *w) {
int wid = threadIdx.x / WARP_SIZE;
ncclWorkElem* we = w->header.type == ncclWorkTypeRegColl ? &w->regElems[0].elem : &w->elems[0];
int stride = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) : sizeof(ncclWorkElem);
#pragma unroll 1
while ((char*)we + stride <= (char*)(w+1) && we->isUsed) {
if (wid < we->nWarps) {
RunWorkElement<Fn, T, RedOp, Algo, Proto>().run(we);
__device__ __forceinline__ void run() {
int tid = threadIdx.x;
int tn = blockDim.x;
if (RedOpArg<RedOp>::ArgUsed) {
int nWorks = ncclShmem.nWorks;
for (int w=tid; w < nWorks; w += tn) {
struct ncclDevWorkColl* work = (ncclDevWorkColl*)(ncclShmem.workStorage + w*ncclShmem.workSize);
if (work->redOpArgIsPtr) {
work->redOpArg = RedOpArg<RedOp>::loadArg(reinterpret_cast<void*>(work->redOpArg));
}
}
we = (ncclWorkElem*)((char*)we + stride);
__syncthreads();
}
#pragma unroll 1
for (int w=0; w < ncclShmem.nWorks; w++) {
struct ncclDevWorkColl* work = (struct ncclDevWorkColl*)(ncclShmem.workStorage + w*ncclShmem.workSize);
if (w != 0) {
struct ncclDevWorkColl* workPrev = (struct ncclDevWorkColl*)(ncclShmem.workStorage + (w-1)*ncclShmem.workSize);
if (work->nWarps != workPrev->nWarps) __syncthreads();
}
int subtn = work->nWarps*WARP_SIZE;
if (tid < subtn) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid, subtn, work);
}
}
};
// Replace a by-pointer reduction-op argument with the scalar it points to, so
// that `we->redOpArg` holds the value itself from here on.
static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
  // Unused elements and by-value arguments are left untouched.
  if (!we->isUsed || !we->redOpArgIsPtr) return;

  // The element type T is encoded in the funcIndex and its size is not easily
  // recovered here, so read the widest integer the pointer's alignment allows.
  // This may fetch more bytes than T occupies, which is harmless.
  auto const addr = we->redOpArg;
  uint64_t scalar;
  if (addr%2 != 0) {
    scalar = *reinterpret_cast<uint8_t*>(addr);
  } else if (addr%4 != 0) {
    scalar = *reinterpret_cast<uint16_t*>(addr);
  } else if (addr%8 != 0) {
    scalar = *reinterpret_cast<uint32_t*>(addr);
  } else {
    scalar = *reinterpret_cast<uint64_t*>(addr);
  }
  we->redOpArg = scalar;
}
template<int SpecializedFnId, typename SpecializedRunWork>
__device__ void ncclKernelMain(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) {
template<int SpecializedFnId, typename SpecializedRunWorkBatch>
__device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* args) {
int tid = threadIdx.x;
int tn = blockDim.x;
// Copy kernel args to shmem and then only read those. Otherwise the compiler
// will end up putting the args into thread local stack which is very wasteful.
if (tid < sizeof(ncclDevKernelArgs)/sizeof(uint32_t)) {
((uint32_t*)&ncclShmem.args)[tid] = ((uint32_t*)args)[tid];
}
// To map blockId to channelId, we need the n'th set bit of channelMask which
// is the inverse of counting the number of set bits among the first n.
if (tid < WARP_SIZE) {
int x = tid;
if (channelMask & (1ull<<x)) {
int y = __popcll(channelMask & ((1ull<<x)-1));
if (blockIdx.x == y) ncclShmem.channelId = x;
}
if (32 < MAXCHANNELS) {
x = 32 + tid;
if (channelMask & (1ull<<x)) {
int y = __popcll(channelMask & ((1ull<<x)-1));
if (blockIdx.x == y) ncclShmem.channelId = x;
}
}
// PTX has the fns instruction which does this but is extremely slow. We can
// do better when we know all threads are querying the same bitmask.
if (tid < MAXCHANNELS && (args->channelMask & (1ull<<tid))) {
int n = __popcll(args->channelMask & ((1ull<<tid)-1));
if (blockIdx.x == n) ncclShmem.channelId = tid;
}
__syncthreads(); // publish ncclShmem.channelId
int channelId = ncclShmem.channelId;
__syncthreads(); // publish ncclShmem.{args, channelId}
/* set abort flag to 0 */
if (tid == 0) ncclShmem.aborted = 0;
if (true) {
void *dst, *src;
int bytes;
// Use first 3 warps to load comm, channel, and work into ncclShmem
switch (tid/WARP_SIZE) {
case 0:
dst = &ncclShmem.comm;
src = comm;
bytes = sizeof(ncclDevComm);
// Use the first 2 warps to load comm and channel, and the remaining warps to load the work batch.
switch (tid/WARP_SIZE) {
case 0:
{ void* dst = &ncclShmem.comm;
void* src = ncclShmem.args.comm;
int bytes = sizeof(ncclDevComm);
static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn.");
break;
case 1:
// Get address of channel without incurring indirect load from ncclDevComm::channels
dst = &ncclShmem.channel;
src = &((ncclDevCommAndChannels*)comm)->channels[channelId];
bytes = sizeof(ncclDevChannel);
copyToShmem16(tid, dst, src, bytes);
} break;
case 1:
{ // Get address of channel without incurring indirect load from ncclDevComm::channels
void* dst = &ncclShmem.channel;
void* src = &((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId];
int bytes = sizeof(ncclDevChannel);
static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
break;
case 2:
dst = &ncclShmem.work;
src = workHead + blockIdx.x;
bytes = sizeof(ncclWork);
static_assert(sizeof(ncclWork) <= 16*WARP_SIZE, "ncclWork cannot be loaded by a single warp in one insn.");
break;
default:
bytes = 0;
break;
}
if (bytes) copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
copyToShmem16(tid-WARP_SIZE, dst, src, bytes);
} break;
default:
{ int subtid = tid - 2*WARP_SIZE;
int subtn = tn - 2*WARP_SIZE;
loadWorkBatchToShmem(subtid, subtn, args, /*batchIx=*/blockIdx.x);
} break;
}
__syncthreads(); // publish ncclShmem
if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) {
// ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads()
ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed;
}
while (true) {
// Notify host that all fifo reads are complete.
if (tid == 0 && ncclShmem.work.header.isLast && ncclShmem.work.header.inFifo) {
*ncclShmem.channel.workFifoDone = ncclShmem.work.header.doneAcks;
}
__syncwarp();
if (ncclShmem.work.header.type == ncclWorkTypeColl) {
if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem.work.elems[tid]);
} else if (ncclShmem.work.header.type == ncclWorkTypeRegColl) {
if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&ncclShmem.work.regElems[tid].elem);
}
__syncthreads();
if (0 <= SpecializedFnId && ncclShmem.work.header.funcIndex == (unsigned)SpecializedFnId) {
SpecializedRunWork().run(&ncclShmem.work);
if (0 <= SpecializedFnId && ncclShmem.funcId == (unsigned)SpecializedFnId) {
SpecializedRunWorkBatch().run();
} else {
ncclDevFuncTable[ncclShmem.work.header.funcIndex]();
ncclDevFuncTable[ncclShmem.funcId]();
}
int workIxNext = ncclShmem.work.header.workNext;
if (ncclShmem.nextBatchIx == -1) break;
int batchIx = ncclShmem.nextBatchIx;
__syncthreads();
if (ncclShmem.work.header.isLast) break;
loadWorkBatchToShmem(tid, tn, args, batchIx);
copyToShmem16(tid, &ncclShmem.work, workHead + workIxNext, sizeof(ncclWork));
{ // Check whether the last operation was aborted and make sure all threads exit
int aborted = tid == 0 ? *comm->abortFlag : 0;
if (barrierReduceAny(aborted)) // publish ncclShmem.work
break;
// Check whether the last operation was aborted and make sure all threads exit
bool aborted = false;
if (tid == 0) aborted = *ncclShmem.comm.abortFlag;
aborted = barrier_red_or_aligned(aborted, 0); // publish ncclShmem.work
if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) {
// ncclShmem.workConsumed written by loadWorkBatchToShmem before barrier_red_or()
ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed;
}
if (aborted) break;
}
}
__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
__global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
__device__ void ncclDevFunc_Nop();
#define DEFINE_ncclDevKernel(suffix, coll, redop, ty, algo, proto, specializedFnId) \
__global__ void ncclDevKernel_##suffix(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
ncclKernelMain<specializedFnId, RunWork<coll, ty, redop<ty>, algo, proto>>(comm, channelMask, workHead); \
__global__ void ncclDevKernel_##suffix(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { \
ncclKernelMain<specializedFnId, RunWorkBatch<coll, ty, redop<ty>, algo, proto>>(&args4K.args); \
}
#define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto) \
__device__ void ncclDevFunc_##suffix() { \
RunWork<coll, ty, redop<ty>, algo, proto>().run(&ncclShmem.work); \
RunWorkBatch<coll, ty, redop<ty>, algo, proto>().run(); \
}
#endif
+3 -1
Zobrazit soubor
@@ -233,6 +233,8 @@ with open(os.path.join(gensrc, "host_table.cc"), "w") as f:
out('#include "device.h"\n')
out("\n")
out("extern int const ncclDevFuncIdCount = %d;\n" % len(primary_funcs))
# The mapping from function rows to valid primary function ids.
out("extern int const ncclDevFuncRowToId[] = {\n")
index = 0
@@ -251,7 +253,7 @@ with open(os.path.join(gensrc, "host_table.cc"), "w") as f:
cudart, _ = required_cuda(*kfn)
sym = paste("_", "ncclDevKernel", *kfn)
if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
out("__global__ void %s(struct ncclDevComm*, uint64_t, struct ncclWork*);\n" % sym)
out("__global__ void %s(ncclDevKernelArgs4K const);\n" % sym)
if cudart != 0: out("#endif\n")
out("\n")
+7 -7
Zobrazit soubor
@@ -10,7 +10,7 @@
#include "unpack_defs.h"
#include "op128.h"
#include "align.h"
#include "bitops.h"
#include "device.h"
#include "common.h"
@@ -35,16 +35,16 @@ inline __device__ void ncclNetDeviceUnpackSetup(void* ohandle, const int group,
struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle;
ncclShmem.groups[group].devicePlugin.unpack.g_meta[index] = handle->meta;
ncclShmem.devicePlugin.unpack.bounce_buf = handle->bounce_buf;
ncclShmem.groups[group].devicePlugin.unpack.head = handle->head;
ncclShmem.groups[group].devicePlugin.unpack.head[index] = handle->head;
}
inline __device__ void ncclNetDeviceIncrementHead(const int group) {
ncclShmem.groups[group].devicePlugin.unpack.head++;
inline __device__ void ncclNetDeviceIncrementHead(const int group, const int index) {
ncclShmem.groups[group].devicePlugin.unpack.head[index]++;
}
inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group) {
inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group, const int index) {
struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle;
handle->head = ncclShmem.groups[group].devicePlugin.unpack.head;
handle->head = ncclShmem.groups[group].devicePlugin.unpack.head[index];
}
template <uint8_t sz>
@@ -183,7 +183,7 @@ inline __device__ void ncclNetDeviceUnpack</*Recv=*/1>(
// Pack data from the internal iovec to the supplied flat srcs buffer using all the threads
// + Src is necessary in the case of accessing the user buffer directly
ncclNetDeviceUnpackInner(tid, tidInBlock, nworkers, group /* in case they need to use split warps shared memory partitioning*/,
ix, ncclShmem.groups[group].srcs[ix + Src], workSize, ncclShmem.groups[group].devicePlugin.unpack.head);
ix, ncclShmem.groups[group].srcs[ix + Src], workSize, ncclShmem.groups[group].devicePlugin.unpack.head[ix]);
}
}
+1 -1
Zobrazit soubor
@@ -54,7 +54,7 @@ struct unpackShmem {
struct unpackGroupShmem {
int unpackNetDeviceIndexMask; // We store a single unpackNetDeviceIndex because only one peer can be network recv
uint64_t head;
uint64_t head[NET_UNPACK_MAX_NPEERS];
struct netUnpackMeta* g_meta[NET_UNPACK_MAX_NPEERS]; // head of handle to index into meta for meta copy
};
+6 -4
Zobrazit soubor
@@ -44,10 +44,11 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
inline __device__ void barrier() {
if (nthreads == WARP_SIZE)
if (nthreads == WARP_SIZE) {
__syncwarp();
else
asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group));
} else {
barrier_sync(15-group, nthreads);
}
}
uint32_t abort = 0;
@@ -323,7 +324,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
__device__ Primitives(
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, struct ncclWorkElemP2p* p2p = nullptr, int stepSize_=0
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr,
bool userBufReg=false, int stepSize_=0
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),
+3 -2
Zobrazit soubor
@@ -50,7 +50,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; }
inline __device__ void barrier() {
asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group));
barrier_sync(15-group, nthreads);
}
uint32_t abort = 0;
@@ -364,7 +364,8 @@ public:
__device__ Primitives(
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, int stepSize_=0
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclDevWorkColl* e = nullptr,
bool userBufReg=false, int stepSize_=0
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
+21 -37
Zobrazit soubor
@@ -23,7 +23,7 @@ class Primitives<
ConnFifoEnabled = 0x100,
DirectWrite = 0x200,
DirectRead = 0x400,
ThreadsSynced = 0x800,
// 0x800 is free to use
NvlsMinPolling = 0x1000,
NetDeviceUnpack = 0x2000,
AnyNetDeviceUnpack = 0x4000,
@@ -44,53 +44,38 @@ class Primitives<
uint64_t *connStepPtr;
uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
int connStepSize; // Connection step size
void* mhandle;
void* netDeviceHandle;
// Don't use barrier 0 as it's used by the final sync
__device__ void barrier() {
flags |= ThreadsSynced;
if (nthreads == WARP_SIZE) __syncwarp();
else {
int bar = 15-group;
asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nthreads) : "memory");
barrier_sync(bar, nthreads);
}
}
__device__ void subBarrier() {
if (nworkers == WARP_SIZE) __syncwarp();
else {
int bar = (nworkers==nthreads ? 15 : 8) - group;
asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nworkers) : "memory");
int bar = 15-group - (nworkers!=nthreads ? 1 : 0);
barrier_sync(bar, nworkers);
}
}
__device__ bool barrierAny(int vote) {
flags |= ThreadsSynced;
if (nthreads == WARP_SIZE) {
return __any_sync(~0u, vote);
} else {
int ans, bar = 15-group;
asm volatile(
"{ .reg .pred p;"
" setp.ne.s32 p, %1, 0;"
" bar.red.or.pred p, %2, %3, p; "
" selp.s32 %0, 1, 0, p; }"
: "=r"(ans) : "r"(vote), "r"(bar), "r"(nthreads) : "memory");
return ans != 0;
int name = 15-group;
return barrier_red_or(vote, name, nthreads);
}
}
__device__ bool subBarrierAny(int vote) {
if (nworkers == WARP_SIZE) {
return __any_sync(~0u, vote);
} else {
int ans, bar = (nworkers==nthreads ? 15 : 8) - group;
asm volatile(
"{ .reg .pred p;"
" setp.ne.s32 p, %1, 0;"
" bar.red.or.pred p, %2, %3, p; "
" selp.s32 %0, 1, 0, p; }"
: "=r"(ans) : "r"(vote), "r"(bar), "r"(nworkers) : "memory");
return ans != 0;
int name = 15-group - (nworkers!=nthreads ? 1 : 0);
return barrier_red_or(vote, name, nworkers);
}
}
@@ -164,8 +149,8 @@ class Primitives<
else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
}
if ((flags & (AnyNetDeviceUnpack)) && (flags & (Recv*RoleWaitRecv))) {
ncclNetDeviceIncrementHead(group);
if (flags & NetDeviceUnpack) {
ncclNetDeviceIncrementHead(group, index);
}
step += StepPerSlice;
}
@@ -436,7 +421,7 @@ private:
}
}
__device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
__device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) {
if (flags & (RoleWaitRecv|RolePostRecv)) {
auto *conn = &peer->recv[connIndex];
if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) {
@@ -488,7 +473,7 @@ private:
}
}
__device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
__device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) {
if (flags & (RoleWaitSend|RolePostSend)) {
auto *conn = &peer->send[connIndex];
step = conn->step;
@@ -538,13 +523,13 @@ private:
__device__ Primitives(
int tid, int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr, struct ncclWorkElemP2p* p2p = nullptr, int stepSize_=0
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr,bool userBufReg=false, int stepSize_=0
):
tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {
// For send operations, we need an extra warp to overlap the threadfence and the copy
this->nworkers = nthreads - (MaxSend > 0 && nthreads-WARP_SIZE >= 64 ? WARP_SIZE : 0);
this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0);
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
@@ -572,7 +557,7 @@ private:
loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);
if (p2p && p2p->reg) flags |= UserBufferMode;
if (userBufReg) flags |= UserBufferMode;
if (barrierAny(flags & NetDeviceUnpack)) {
flags |= AnyNetDeviceUnpack;
@@ -584,13 +569,12 @@ private:
}
}
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e);
}
__device__ ~Primitives() {
// Ensure ncclShmem.groups[].send/recvConns are available
if (!(flags & ThreadsSynced))
barrier();
barrier();
// Save steps for the next operation
if (flags & (RolePostSend|RolePostRecv)) {
auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
@@ -606,8 +590,8 @@ private:
while (*ptr != -1) if (checkAbort(spins)) break;
}
if ((flags & (AnyNetDeviceUnpack)) && (flags & (RoleWaitRecv))) {
ncclNetDeviceSaveHead(netDeviceHandle, group);
if (flags & NetDeviceUnpack) {
ncclNetDeviceSaveHead(netDeviceHandle, group, index);
}
// Make sure all threads are done writing back conn->step and done using
@@ -615,7 +599,7 @@ private:
barrier();
}
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) {
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* e) {
if (tid==0) {
ncclShmem.groups[group].userInput = (void*)inputBuf;
ncclShmem.groups[group].userOutput = (void*)outputBuf;
@@ -625,7 +609,7 @@ private:
bool sendAcceptor = (flags == (flags|RoleWaitSend|DirectWrite)) || (flags == (flags|RoleWaitSend|NvlsDirectWrite));
bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched)
bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead) || (flags == (flags|RoleWaitRecv|NvlsDirectRead)); // receiver accepts direct buffer
int regUsed = e != nullptr ? e->elem.regUsed : 0;
int regUsed = e != nullptr ? e->coll.regUsed : 0;
if (Direct && recvProvider) {
int spins = 0;
+16 -17
Zobrazit soubor
@@ -10,22 +10,21 @@
namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = (int)args->nWarps * WARP_SIZE;
__device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
ncclRing *ring = &ncclShmem.channel.ring;
const int nranks = ncclShmem.comm.nRanks;
const int rank = ncclShmem.comm.rank;
const int prevRank = ring->userRanks[nranks-1];
const int root = args->root;
const size_t chunkCount = args->chunkCount;
const size_t channelCount = args->workCount;
const size_t gridOffset = args->workOffset;
const int root = work->root;
size_t chunkCount;
size_t channelCount;
size_t gridOffset;
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
size_t offset;
int nelem;
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);
prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);
if (prevRank == root) {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
@@ -52,23 +51,23 @@ namespace {
}
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
struct RunWorkColl<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
using Proto = ProtoSimple<REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
runRing<T, RedOp, Proto>(tid, nthreads, work);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL>(args);
struct RunWorkColl<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL128>(args);
struct RunWorkColl<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
}
};
+42 -1
Zobrazit soubor
@@ -37,6 +37,7 @@ template<typename T>
struct FuncSum { using EltType = T; __device__ FuncSum(uint64_t opArg=0) {}; };
template<typename T>
struct FuncProd { using EltType = T; __device__ FuncProd(uint64_t opArg=0) {}; };
template<typename T>
struct FuncMinMax {
using EltType = T;
@@ -47,9 +48,30 @@ struct FuncMinMax {
isMinNotMax = (opArg&1)==0;
}
};
template<typename T> struct FuncPreMulSum;
template<typename T> struct FuncSumPostDiv;
////////////////////////////////////////////////////////////////////////////////
// Trait class for handling the reduction argument.
// Primary template of the reduction-argument trait, selected for any functor
// Fn that has no dedicated specialization.
//   ArgUsed - compile-time flag; false here, meaning no argument is consumed.
//   loadArg - ignores `ptr` (never dereferenced) and always yields 0.
template<typename Fn>
struct RedOpArg { // default case: no argument
static constexpr bool ArgUsed = false;
__device__ static uint64_t loadArg(void *ptr) { return 0; }
};
// Specialization for FuncMinMax<T>: the functor consumes an argument
// (ArgUsed == true). loadArg reads one T from `ptr` and returns it packed
// into a 64-bit word.
template<typename T>
struct RedOpArg<FuncMinMax<T>> {
static constexpr bool ArgUsed = true;
__device__ static uint64_t loadArg(void *ptr) {
// Anonymous union: u64 and val share the same storage.
union { uint64_t u64; T val; };
// Zero the whole word first so any bytes not overlapped by val stay 0.
// This store must precede the write to val below.
u64 = 0;
val = *(T*)ptr;
// Reads the union through the member that was not written last
// (type punning via union).
return u64;
}
};
////////////////////////////////////////////////////////////////////////////////
// Trait classes for reduction functions. Given a function (FuncSum, etc.)
// and a number of elements in a pack, will reduce, preOp, or postOp a pack
@@ -356,6 +378,17 @@ struct Apply_PostOp<Fn, /*EltPerPack=*/0> {
////////////////////////////////////////////////////////////////////////////////
// FuncPreMulSum
// Specialization for FuncPreMulSum<T>: the functor consumes an argument
// (ArgUsed == true). loadArg reads one T from `ptr` and returns it packed
// into a 64-bit word.
template<typename T>
struct RedOpArg<FuncPreMulSum<T>> {
static constexpr bool ArgUsed = true;
__device__ static uint64_t loadArg(void *ptr) {
// Anonymous union: u64 and val share the same storage.
union { uint64_t u64; T val; };
// Clear the full 64 bits before storing val so the bytes val does not
// cover are returned as 0; the order of these two stores matters.
u64 = 0;
val = *(T*)ptr;
// Reads the union through the member that was not written last
// (type punning via union).
return u64;
}
};
// General definition for all integral types, float, and double.
template<typename T>
struct FuncPreMulSum {
@@ -486,6 +519,14 @@ struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
////////////////////////////////////////////////////////////////////////////////
// FuncSumPostDiv
// Specialization for FuncSumPostDiv<T>: the functor consumes an argument
// (ArgUsed == true). Unlike the FuncMinMax/FuncPreMulSum specializations,
// the argument is loaded as a raw 64-bit value rather than as a T.
template<typename T>
struct RedOpArg<FuncSumPostDiv<T>> {
static constexpr bool ArgUsed = true;
__device__ static uint64_t loadArg(void *ptr) {
// NOTE(review): dereferences ptr as uint64_t, so this assumes ptr refers
// to at least 8 readable, suitably aligned bytes - confirm against caller.
return *(uint64_t*)ptr;
}
};
template<typename T, bool IsFloating=IsFloatingPoint<T>::value>
struct FuncSumPostDiv_IntOnly;
@@ -658,7 +699,7 @@ struct Apply_LoadMultimem {
static constexpr bool IsFloat = IsFloatingPoint<T>::value;
static constexpr int BigPackSize =
IsFloat && IsSum && sizeof(T) < 8 ? 16 :
IsFloat && IsSum ? 8 :
IsFloat && IsSum ? sizeof(T) :
IsFloat && IsMinMax && sizeof(T)==2 ? 16 :
!IsFloat && (IsSum||IsMinMax) && sizeof(T)>=4 ? sizeof(T) :
/*multimem.ld_reduce not supported:*/ 0;
+58 -59
Zobrazit soubor
@@ -10,23 +10,22 @@
namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
const uint32_t nthreads = (uint32_t)args->nWarps * WARP_SIZE;
__device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
ncclRing *ring = &ncclShmem.channel.ring;
int const *ringRanks = ring->userRanks;
const size_t chunkCount = args->chunkCount;
const int nranks = ncclShmem.comm.nRanks;
size_t channelCount = args->workCount;
size_t gridOffset = args->workOffset;
size_t count;
size_t gridOffset;
size_t channelCount;
size_t chunkCount;
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
size_t offset;
size_t dataOffset;
size_t count = args->count;
uint32_t nelem;
int rankDest;
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);
prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
nelem = min(chunkCount, channelCount - elemOffset);
@@ -54,56 +53,56 @@ namespace {
}
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
using Proto = ProtoSimple<REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
runRing<T, RedOp, Proto>(tid, nthreads, work);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL>(args);
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL128>(args);
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
const int tid = threadIdx.x;
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const size_t chunkCount = args->chunkCount;
const size_t count = args->count;
size_t count;
size_t gridOffset;
size_t channelCount;
size_t chunkCount;
ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
const int rank = ncclShmem.comm.rank;
const int nranks = ncclShmem.comm.nRanks;
size_t gridOffset = args->workOffset;
size_t channelCount = args->workCount;
size_t offset;
int nelem;
/* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync;
* if not, based on #ranks, we allocate 7 or 5 warps to reduce to saturate bandwidth
* and the rest are allocated to scatter. */
const int nThreadsReduce = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE);
const int nThreadsScatter = args->regUsed ? WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce);
const int nThreadsReduce = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE);
const int nThreadsScatter = work->regUsed ? WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce);
const int tidEndScatter = nThreadsScatter;
const int tidEndReduce = tidEndScatter + nThreadsReduce;
if (!args->regUsed) {
if (!work->regUsed) {
if (tid < tidEndScatter) {
// Scatter
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL,
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
@@ -113,8 +112,8 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
// Reduce through NVLS
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff,
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, work->recvbuff,
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
@@ -127,7 +126,7 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL,
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
prims.scatter(0, 0, 0, 0, -1, 0);
}
@@ -138,8 +137,8 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
// Reduce through NVLS
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, args->recvbuff,
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, work->recvbuff,
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
size_t outOffset = gridOffset + elemOffset;
size_t inpOffset = outOffset + rank * count;
@@ -155,10 +154,10 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
template<bool ReduceSendNotRecv>
struct Scatterer {
struct ncclWorkElem* args;
struct ncclDevWorkColl* work;
int chunkSize;
ssize_t railGridOffset;
@@ -173,11 +172,11 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
int nNodes = ncclShmem.comm.nNodes;
int nRails = direct->nHeads;
int bid = args->bid;
void* inbuf = (void*)args->sendbuff;
ssize_t sizePerRank = args->count;
int part = ncclShmem.channelId - work->channelLo;
void* inbuf = (void*)work->sendbuff;
ssize_t sizePerRank = work->collnet.count;
ssize_t railAllBeg = min(railGridOffset + bid*chunkSize, nNodes*sizePerRank);
ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank);
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
int railAllSize = railAllEnd - railAllBeg;
if (tid < nDsts) dstSizes[tid] = railAllSize;
@@ -204,7 +203,7 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
/*MultimemSrcs=*/0, 1+MinSrcs, 1+MaxSrcs,
/*MultimemDsts,MinDsts,MaxDsts=*/0,1,1,
/*PreOpSrcs=*/1>
(tid, tn, args->redOpArg, &args->redOpArg, false,
(tid, tn, work->redOpArg, &work->redOpArg, false,
/*nSrcs=*/1+nSrcs, [=]__device__(int s) {
return s==0 ? (T*)inbuf + userOneBeg
: (T*)srcPtrs[s-1] + railAllOffset;
@@ -223,23 +222,23 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
}
};
__device__ __forceinline__ void run(ncclWorkElem *args) {
int tid = threadIdx.x;
const int nChannels = args->nChannels;
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
const int part = ncclShmem.channelId - work->channelLo;
const int nChannels = work->channelHi - work->channelLo + 1;
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
int const &nNodes = ncclShmem.comm.nNodes;
ssize_t chunkSize = int(args->chunkCount);
ssize_t sizePerRank = args->count;
ssize_t chunkSize = int(work->collnet.chunkCount);
ssize_t sizePerRank = work->collnet.count;
if (direct->out == -1) __trap();
bool isMultiRail = (direct->nHeads > 1);
int nWarps1 = (isMultiRail ? 2 : 0);
int nWarps2 = (isMultiRail ? 2 : 1);
int nWarps3 = 1;
float denom = float(args->nWarps)/float(nWarps1+nWarps2+nWarps3);
float denom = float(work->nWarps)/float(nWarps1+nWarps2+nWarps3);
nWarps3 = int(denom*nWarps3);
nWarps2 = int(denom*nWarps2);
nWarps1 = args->nWarps - (nWarps2+nWarps3);
nWarps1 = work->nWarps - (nWarps2+nWarps3);
using Proto = ProtoSimple<1, 1>;
@@ -248,13 +247,13 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
// Phase 1: Scatter inputs to peers
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, tn, nullptr, direct->heads+1, nullptr, nullptr,
args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
Scatterer</*ReduceSendNotRecv=*/true> scat;
scat.args = args;
scat.work = work;
scat.chunkSize = chunkSize;
scat.railGridOffset = railGridOffset;
prims.process</*Recv=*/0, /*Send=*/1>(scat);
prims.template process</*Recv=*/0, /*Send=*/1>(scat);
}
return;
}
@@ -262,7 +261,7 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
tn = nWarps2*WARP_SIZE;
if (tid < tn) {
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (tid == 0) {
int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
@@ -272,13 +271,13 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
// Phase 2: Reduce from peers + local input -> send to network
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
prims(tid, tn, direct->heads + 1, &direct->out, nullptr, nullptr,
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
Scatterer</*ReduceSendNotRecv=*/false> scat;
scat.args = args;
scat.work = work;
scat.chunkSize = chunkSize;
scat.railGridOffset = railGridOffset;
prims.process</*Recv=*/1, /*Send=*/1>(scat);
prims.template process</*Recv=*/1, /*Send=*/1>(scat);
}
}
return;
@@ -287,7 +286,7 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
tn = nWarps3*WARP_SIZE;
if (tid < tn) {
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
if (tid == 0) {
int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
@@ -296,10 +295,10 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
} else {
// Phase 3: recv from network
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid, tn, &direct->out, nullptr, nullptr, args->recvbuff,
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0);
prims(tid, tn, &direct->out, nullptr, nullptr, work->recvbuff,
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0);
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
ssize_t railAllBeg = railGridOffset + args->bid * chunkSize;
ssize_t railAllBeg = railGridOffset + part * chunkSize;
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank);
ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank;
ssize_t railOneEnd = railOneBeg + sizePerRank;
+141 -65
Zobrazit soubor
@@ -9,83 +9,159 @@
#include "primitives.h"
template<typename T, typename RedOp>
struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
static_assert(sizeof(T)==1, "SendRecv only works on single byte types T.");
template<typename Proto>
__device__ void runSend(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) {
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
if (args->peer == ncclShmem.comm.rank) {
struct ncclWorkElemP2p* recvArgs = args-1;
void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
if (buff != recvBuff) {
reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>
(tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count);
}
} else {
int chunkSize = args->chunkSize/sizeof(T);
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2;
int const peer = args->peer;
Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1> prims
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, args, ncclShmem.comm.p2pChunkSize/sizeof(T));
size_t offset = 0;
do {
int nelem = min(size_t(chunkSize), count-offset);
prims.directSend(offset, offset, nelem);
offset += nelem;
} while(offset < count && args->reg == 0);
}
__device__ void runSend(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
size_t bytes = work->sendBytes;
int chunkSize = u32fp8Decode(work->sendChunkSize_u32fp8);
Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1>
prims(tid, tn, nullptr, &work->sendRank, work->sendAddr, nullptr,
/*redOpArg(ignored)=*/0, group, 1, 1, nullptr,
/*userBufferMode=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize);
size_t cursor = 0;
do {
int n = min(size_t(chunkSize), bytes-cursor);
prims.directSend(cursor, cursor, n);
cursor += n;
} while (cursor < bytes && work->sendRegistered == 0);
}
template<typename Proto>
__device__ void runRecv(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) {
if (args->peer != ncclShmem.comm.rank) {
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
int chunkSize = args->chunkSize/sizeof(T);
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize
int const peer = args->peer;
Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1> prims
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, args, ncclShmem.comm.p2pChunkSize/sizeof(T));
size_t offset = 0;
do {
int nelem = min(size_t(chunkSize), count-offset);
prims.directRecv(offset, nelem);
offset += nelem;
} while(offset < count && args->reg == 0);
}
__device__ void runRecv(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
size_t bytes = work->recvBytes;
int chunkSize = u32fp8Decode(work->recvChunkSize_u32fp8);
Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1>
prims(tid, tn, &work->recvRank, nullptr, nullptr, work->recvAddr,
/*redOpArg(ignored)=*/0, group, 1, 1, nullptr,
/*userBufferMode=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize);
size_t cursor = 0;
do {
int n = min(size_t(chunkSize), bytes-cursor);
prims.directRecv(cursor, n);
cursor += n;
} while (cursor < bytes && work->recvRegistered == 0);
}
__device__ __forceinline__ void run(ncclWork *work) {
struct ncclWorkElemP2p* args = work->p2pElems;
int ngroups = args->ngroups;
int tid = threadIdx.x;
int wid = tid / WARP_SIZE;
// This has to work even for groups of 2.5 warps (which is 8 groups, and means 3
// warps for send, 2 warps for recv).
// warpStarts were rounded thanks to int division, but for group number we need to round the other way around
// So we mirror wid then mirror again the group.
#define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
uint8_t group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
args += group;
tid -= args->warpStart * WARP_SIZE;
int nthreads = args->nWarps * WARP_SIZE;
__device__ __forceinline__ void run() {
const int tid = threadIdx.x;
const int tn = blockDim.x;
const int wid = tid/WARP_SIZE;
const int nWarps = tn/WARP_SIZE;
const int lane = tid%WARP_SIZE;
if (args->p2pType == ncclWorkP2pTypeUnused) return;
if (tid >= nthreads || args->peer == -1) return;
struct Shared {
uint32_t workSendMask; // bitmasks of which work indices have send/recv
uint32_t workRecvMask;
};
Shared* shared = (Shared*)ncclScratchForWarp(0);
// Select Proto here
// This is to allow the same kernel to run multiple primitives on different warps (thread groups)
if ((group%2) == 0) {
if (args->proto == NCCL_PROTO_LL) {
runRecv<ProtoLL>(tid, nthreads, group, args);
struct ncclDevWorkP2p* works = (ncclDevWorkP2p*)ncclShmem.workStorage;
int nWorks = ncclShmem.nWorks;
if (wid == 0) {
// Modify the memory range of each work[] to reflect this channel's
// partition of the work. Since integer divides are very heavy it's
// best to do them all in one warp.
int workIx = lane%16;
int isSend = lane < 16 ? 0 : 1;
bool hasWork = false;
if (workIx < nWorks) {
struct ncclDevWorkP2p* work = &works[workIx];
size_t bytes = isSend ? work->sendBytes : work->recvBytes;
int nParts = isSend ? work->nSendChannels : work->nRecvChannels;
int part = ncclP2pChannelToPart(work->nP2pChannels, work->channelBase, ncclShmem.channelId);
hasWork = (part < nParts);
if (nParts != 0) {
size_t partBeg, partEnd;
ncclP2pPartBounds(nParts, part, bytes, &partBeg, &partEnd);
(isSend ? work->sendAddr : work->recvAddr) = (char*)(isSend ? work->sendAddr : work->recvAddr) + partBeg;
(isSend ? work->sendBytes : work->recvBytes) = partEnd - partBeg;
}
}
uint32_t mask = __ballot_sync(~0u, hasWork);
if (lane == 0) {
shared->workSendMask = mask>>16;
shared->workRecvMask = mask & 0xffff;
}
}
// The fastest way to compute a warp uniform division x/y in [0,32) is to
// use each lane to guess a solution and count the ones that don't exceed
// the numerator:
// __popc(__ballot_sync(~0u, y*(lane+1) <= x))
// That takes 1/3 the time of standard division and about 3/4 the time of
// approximate floating point division:
// __float2int_rd(__fdividef(float(x),float(y))).
// nWarpPerWork = nWarps/nWorks
int nWarpPerWork = __popc(__ballot_sync(~0u, nWorks*(lane+1) <= nWarps));
int nRecvWarpPerWork = nWarpPerWork<=4 ? nWarpPerWork/2 : (nWarpPerWork-1)/2;
int nSendWarpPerWork = nWarpPerWork<=4 ? nRecvWarpPerWork : nRecvWarpPerWork+1;
// This might reduce nWarpPerWork which is probably desirable. It is better
// to have a balanced number of reading and writing threads even if that
// leaves warps unused.
nWarpPerWork = nSendWarpPerWork + nRecvWarpPerWork;
// The work index this warp belongs to: workIx = wid/nWarpPerWork
int workIx = __popc(__ballot_sync(~0u, (lane+1)*nWarpPerWork <= wid));
__syncthreads(); // Wait for works[] and shared->* to be updated by warp=0
uint32_t workSendMask = shared->workSendMask;
uint32_t workRecvMask = shared->workRecvMask;
__syncthreads(); // release scratch space used by shared->*
if (nWorks <= workIx) return;
// Thread range for whole work (send & recv combined)
int subtid = tid - workIx*nWarpPerWork*WARP_SIZE;
int subtn = nWarpPerWork*WARP_SIZE;
// A send primitive of sufficient size requires 2 cuda barrier ids.
constexpr int nSendWarpsForExtraGroup = NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE/WARP_SIZE;
// Count up all group ids used below this workIx:
int group, extra;
// Each recv gets one group id:
group = __popc(workRecvMask & ((1<<workIx)-1));
// Sends accompanying recvs get one and maybe an extra:
extra = (nSendWarpPerWork >= nSendWarpsForExtraGroup) ? 1 : 0;
group += __popc((workSendMask & workRecvMask) & ((1<<workIx)-1))*(1+extra);
// Sends without recvs use more warps so compute extra accordingly:
extra = (nWarpPerWork >= nSendWarpsForExtraGroup) ? 1 : 0;
group += __popc((workSendMask & ~workRecvMask) & ((1<<workIx)-1))*(1+extra);
struct ncclDevWorkP2p* work = &works[workIx];
bool hasSend = 1 & (workSendMask>>workIx);
bool hasRecv = 1 & (workRecvMask>>workIx);
bool isCopy = work->sendRank == ncclShmem.comm.rank;
bool isSend = !hasRecv || (hasSend && subtid < nSendWarpPerWork*WARP_SIZE);
if (!isCopy && hasSend && hasRecv) {
// Translate thread ids to reflect just this send or recv as opposed to whole work.
if (isSend) {
subtn = nSendWarpPerWork*WARP_SIZE;
} else {
runRecv<ProtoSimple<1,1>>(tid, nthreads, group, args);
subtid -= nSendWarpPerWork*WARP_SIZE;
subtn = nRecvWarpPerWork*WARP_SIZE;
group += 1 + (nSendWarpPerWork >= nSendWarpsForExtraGroup ? 1 : 0);
}
}
if (isCopy) {
reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>
(subtid, subtn, 0, nullptr, false, 1, &work->sendAddr, 1, &work->recvAddr, (ssize_t)work->sendBytes);
} else if (isSend) {
if (work->sendProtoLL) {
runSend<ProtoLL>(subtid, subtn, group, work);
} else {
runSend<ProtoSimple<1,1>>(subtid, subtn, group, work);
}
} else {
if (args->proto == NCCL_PROTO_LL) {
runSend<ProtoLL>(tid, nthreads, group, args);
if (work->recvProtoLL) {
runRecv<ProtoLL>(subtid, subtn, group, work);
} else {
runSend<ProtoSimple<1,1>>(tid, nthreads, group, args);
runRecv<ProtoSimple<1,1>>(subtid, subtn, group, work);
}
}
}
+1338 -1295
Zobrazit soubor
Rozdílový obsah nebyl zobrazen, protože je příliš veliký Načíst rozdílové porovnání
+19 -4
Zobrazit soubor
@@ -5,7 +5,9 @@
************************************************************************/
#include "comm.h"
#include "device.h"
#include "graph.h"
#include "transport.h"
#include "trees.h"
#include "rings.h"
#include "topo.h"
@@ -84,6 +86,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
topoRanks->nvlsHeads[topoRanks->nvlsHeadNum++] = nvlsIntra[0];
}
}
memcpy(comm->nvlsHeads, topoRanks->nvlsHeads, sizeof(int) * topoRanks->nvlsHeadNum);
return ncclSuccess;
}
@@ -188,7 +191,7 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
char line[1024];
sprintf(line, "CollNet channel %d rank %d ", c, rank);
sprintf(line, "CollNetDirect channel %d rank %d ", c, rank);
int nDown = 0;
for (int i=0; i<nHeads; i++) {
if (rank == heads[i]) { // is head
@@ -334,10 +337,14 @@ int ncclMinNchannels() {
if (minNchannels < 0) minNchannels = 0;
return minNchannels;
}
extern int64_t ncclParamWorkArgsBytes();
int ncclMaxNchannels() {
int maxNchannels = MAXCHANNELS;
if (ncclParamMaxNrings() != -2) maxNchannels = ncclParamMaxNrings();
if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels();
maxNchannels = std::min(maxNchannels, ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes()));
if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
if (maxNchannels < 1) {
WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels);
@@ -363,6 +370,8 @@ void exchangeValues(int* v0, int* v1) {
*v0 = tmp;
}
NCCL_PARAM(UnpackDoubleNChannels, "UNPACK_DOUBLE_NCHANNELS", 1);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) {
// Gather data from all ranks
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
@@ -444,13 +453,13 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
// Setup CollNet
if (comm->collNetSupport == 1) {
struct ncclTopoGraph* collNetGraph = graphs[NCCL_ALGO_COLLNET_DIRECT];
struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN];
// Add more channels to saturate intra-node bandwidth, except the 1 PPN case
if (collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) {
if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) {
int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
}
NCCLCHECK(connectCollNet(comm, collNetGraph));
NCCLCHECK(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]));
}
// Use 4 compute channels per search channel to reach peak BW on <8 PPN
@@ -458,6 +467,12 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
}
// Double the number of channels when using unpack networking (greater than 1 node)
// We won't automatically double past 16 channels, users can specify 32 if they want
if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && nChannels < 16 && ncclParamUnpackDoubleNChannels()) {
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
}
// Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
// We permit combining max, then min, to only use the first channels, then duplicate them.
if (comm->sharedRes->owner != comm) {
+10 -15
Zobrazit soubor
@@ -10,6 +10,8 @@
#include "comm.h"
#include "net.h"
#include "channel.h"
#include "transport.h"
#include "device.h"
// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths
@@ -732,12 +734,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp
NCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 1);
NCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS);
// Return the smallest power of two that is greater than or equal to v.
// Any v <= 1 (including zero and negative values) yields 1.
static int nextPow2(int v) {
  int result;
  for (result = 1; result < v; result *= 2) {
    // Keep doubling until the candidate reaches v.
  }
  return result;
}
extern int64_t ncclParamWorkArgsBytes();
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
/* here we already honor comm->max/minCTAs for p2pnChannels. */
@@ -759,19 +756,17 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
}
}
// Round to next pow2 nChannelsPerPeer and nChannels
comm->p2pnChannelsPerPeer = nextPow2(minChannels);
comm->p2pnChannels = nextPow2(comm->p2pnChannels);
// Make nChannelsPerPeer and nChannels powers of 2. This is relied on when
// mapping p2p peers to channels.
comm->p2pnChannelsPerPeer = pow2Up(minChannels);
comm->p2pnChannels = pow2Up(comm->p2pnChannels);
comm->p2pnChannels = std::min(comm->p2pnChannels, pow2Down(ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes())));
comm->p2pnChannelsPerPeer = std::min(comm->p2pnChannelsPerPeer, comm->p2pnChannels);
// Init channels that weren't used so far
for (int c=comm->nChannels; c<comm->p2pnChannels; c++) NCCLCHECK(initChannel(comm, c));
// We want to spread channels used when there aren't many and progressively
// fill the whole space of nChannels. To do so we mirror the bits in the
// nChannels space.
for (int c=0; c<comm->p2pnChannels; c++) {
comm->p2pChannels[c] = mirrorBits(c, comm->p2pnChannels);
}
return ncclSuccess;
}
+66 -6
Zobrazit soubor
@@ -8,6 +8,7 @@
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "transport.h"
#include "xml.h"
#include <math.h>
@@ -51,6 +52,15 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
return ncclSuccess;
}
// Cache the CPU architecture and vendor on the communicator, reading them
// from the first CPU node of the communicator's topology.
// The topology is assumed to contain at least one CPU node, and all CPUs are
// assumed to share the same architecture and vendor; neither is verified here.
// Always returns ncclSuccess.
ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm) {
  const struct ncclTopoNode* firstCpu = comm->topo->nodes[CPU].nodes;
  comm->cpuArch = firstCpu->cpu.arch;
  comm->cpuVendor = firstCpu->cpu.vendor;
  return ncclSuccess;
}
static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, int type, struct ncclTopoLink** revLink) {
for (int l=0; l<node2->nlinks; l++) {
struct ncclTopoLink* link = node2->links+l;
@@ -104,7 +114,7 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod
}
// Try to go from node type1/index1 to node type2/index2. mult indicates whether we are counting the bandwidth (1) or undoing (-1).
static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int type1, int index1, int type2, int index2, int mult, struct ncclTopoNode** node) {
static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int type1, int index1, int type2, int index2, float mult, struct ncclTopoNode** node) {
// First handle easy cases
*node = system->nodes[type2].nodes+index2;
if (type1 == -1) return ncclSuccess;
@@ -334,6 +344,42 @@ ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopo
return ncclSuccess;
}
// Try GPU index g as the head of a COLLNET_DIRECT channel.
//
// The function follows the path from g to every GPU index (forward) and from
// every GPU index to g (backward), each with a multiplier of 1/(nGpus-1). If
// every path could be followed, it fills in the remaining intra-node ranks of
// the current channel and continues the search through ncclTopoSearchRecGpu.
// Afterwards every path that was followed is followed again with the negated
// multiplier so the graph accounting is restored.
//
// system    : topology being searched.
// graph     : graph under construction; graph->intra is written for the current channel.
// saveGraph : forwarded unchanged to ncclTopoSearchRecGpu.
// g         : index of the candidate head GPU in system->nodes[GPU].
// ngpus     : number of intra slots per channel in graph->intra.
// time      : forwarded unchanged to ncclTopoSearchRecGpu.
//
// Returns ncclSuccess, or the first error propagated by NCCLCHECK.
// NOTE(review): an early NCCLCHECK return skips the undo loops below -- confirm this is acceptable.
ncclResult_t ncclTopoSearchTryCollnetDirect(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int g, int ngpus, int *time) {
// fwdg / bwdg count how many forward / backward paths were followed successfully,
// which is exactly how many must be undone later.
int fwdg = 0;
int bwdg = 0;
struct ncclTopoNode* gpu = NULL;
// Each path is accounted with an equal share of one unit.
// NOTE(review): this divides by zero when there is a single GPU -- presumably callers never reach here in that case; confirm.
float mul = 1.0 / (float)(system->nodes[GPU].count - 1);
// Forward direction: g -> every GPU. The loop stops at the first path for which
// ncclTopoFollowPath leaves gpu NULL (presumably meaning the path could not be followed);
// fwdg is not incremented for that failed attempt.
do {
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, fwdg, mul, &gpu));
} while (gpu && ++fwdg < system->nodes[GPU].count);
if (gpu != NULL) {
// Backward direction: every GPU -> g, with the same stop condition.
do {
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, bwdg, GPU, g, mul, &gpu));
} while (gpu && ++bwdg < system->nodes[GPU].count);
if (gpu != NULL) {
// Both directions worked. The head is already in place, so populate all the other intra ranks
// starting at slot 1 of the current channel.
int step = 1;
for (int index = 0; index < ngpus; ++index) {
if (index != g) {
graph->intra[graph->nChannels * ngpus + step] = system->nodes[GPU].nodes[index].gpu.rank;
step++;
}
}
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, NULL, ngpus, -1, -1, 0, time));
}
// Undo the backward paths that were followed, in reverse order.
while (bwdg) {
bwdg--;
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, bwdg, GPU, g, -mul, &gpu));
}
}
// Undo the forward paths that were followed, in reverse order.
while (fwdg) {
fwdg--;
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, fwdg, -mul, &gpu));
}
return ncclSuccess;
}
ncclResult_t ncclTopoSearchTryNvls(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int g, int ngpus, int *time) {
struct ncclTopoNode* nvs;
struct ncclTopoNode* gpu;
@@ -514,6 +560,8 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
}
} else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
NCCLCHECK(ncclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time));
} else if (graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) {
NCCLCHECK(ncclTopoSearchTryCollnetDirect(system, graph, saveGraph, g, ngpus, time));
} else if (step < system->nodes[GPU].count-1) {
// Go to next GPU
int next[NCCL_TOPO_MAX_NODES];
@@ -552,9 +600,10 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
int* nets;
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
int netCount;
int graphFound = 0;
NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount));
for (int i=0; i<netCount; i++) {
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && i>0) continue;
if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) continue;
int n = nets[(graph->nChannels+i)%netCount];
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
if (graph->collNet && net->net.collSupport == 0) continue;
@@ -571,12 +620,22 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
}
}
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) {
// NVLS search only tries to find NIC:GPU combinations to compute the heads.
if (graph->nChannels < netCount) {
int gpu;
int duplicate = 0;
NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu));
// check whether there is duplicate head when one GPU connects with multiple NICs
for (int gc = 0; gc < graph->nChannels; gc++) {
if (graph->intra[gc * system->nodes[GPU].count] == system->nodes[GPU].nodes[gpu].gpu.rank) {
duplicate = 1;
break;
}
}
if (duplicate) continue;
if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu));
graphFound = 1;
}
} else {
if (graph->nChannels > 0) {
@@ -891,8 +950,9 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
int ccMin;
NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess;
// NVLS search must have ngpus heads at most.
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) graph->maxChannels = system->nodes[GPU].count;
// NVLS and COLLNET_DIRECT search must have ngpus heads at most.
if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT)
graph->maxChannels = system->nodes[GPU].count;
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
@@ -1104,7 +1164,7 @@ ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, i
exit:
return ret;
fail:
WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank);
WARN("Could not find NIC for rank %d in NVLS graph", comm->rank);
goto exit;
}
+107 -35
Zobrazit soubor
@@ -11,6 +11,7 @@
#include "nvmlwrap.h"
#include "net.h"
#include "coll_net.h"
#include "transport.h"
#include <sys/stat.h>
#include <fcntl.h>
#include "xml.h"
@@ -51,7 +52,12 @@ static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode*
return ncclSuccess;
}
for (int l=0; l<node->nlinks; l++) {
if (node->links[l].type == LINK_PCI) NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu));
// Go up the PCI tree to find the CPU. Follow only PCI switches.
if (node->links[l].type == LINK_PCI
&& (node->links[l].remNode->type == PCI
|| node->links[l].remNode->type == CPU)) {
NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu));
}
if (*cpu != NULL) return ncclSuccess;
}
return ncclSuccess;
@@ -109,11 +115,6 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo
n->type = type;
n->id = id;
if (type == GPU) {
// Create link to itself (used in some corner cases)
n->nlinks=1;
n->links[0].type = LINK_LOC;
n->links[0].remNode = n;
n->links[0].bw = LOC_BW;
n->gpu.dev = NCCL_TOPO_UNDEF;
n->gpu.rank = NCCL_TOPO_UNDEF;
n->gpu.cudaCompCap = NCCL_TOPO_UNDEF;
@@ -279,8 +280,10 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
for (int l=0; l<node->nlinks; l++) {
struct ncclTopoLink* link = node->links+l;
if (link->type == LINK_LOC) continue;
if (link->type != LINK_PCI || link->remNode != prevNode) {
if (link->type == LINK_LOC) {
sprintf(line+offset, "+ %s[%2.1f] - %s/%lX", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[link->remNode->type], link->remNode->id);
INFO(NCCL_GRAPH, "%s", line);
} else if (link->type != LINK_PCI || link->remNode != prevNode) {
sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw);
int nextOffset = strlen(line);
if (link->type == LINK_PCI) {
@@ -443,7 +446,9 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
for (int s=0; s<xmlPci->nSubs; s++) {
struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId));
if (strcmp(xmlSubPci->name, "pcilink") != 0) { // PCI links will be added later
NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId));
}
}
}
@@ -579,6 +584,38 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
return ncclSuccess;
}
// Recursively walk the XML tree rooted at node and, for every "pcilink"
// element, connect the PCI node identified by parentBusId to the PCI node
// named by the element's "target" attribute with a LINK_LOC link of LOC_BW.
//
// node        : XML element to process.
// system      : topology whose PCI nodes get connected.
// parentBusId : bus id of the closest enclosing element that carries a
//               "busid" attribute (may be NULL above any such element).
// systemId    : system id used to build topology ids; replaced by the value
//               from ncclGetSystemId when a "cpu" element is entered.
//
// Returns ncclInternalError when the owning PCI node cannot be found; a
// missing target node is silently ignored.
ncclResult_t ncclTopoAddPciLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) {
  if (strcmp(node->name, "pcilink") != 0) {
    // Not a link element: descend into the children, handing down this
    // element's own bus id when it has one.
    if (strcmp(node->name, "cpu") == 0) {
      NCCLCHECK(ncclGetSystemId(system, node, &systemId));
    }
    const char* ownBusId;
    NCCLCHECK(xmlGetAttr(node, "busid", &ownBusId));
    const char* busIdForSubs = ownBusId ? ownBusId : parentBusId;
    for (int s=0; s<node->nSubs; s++) {
      NCCLCHECK(ncclTopoAddPciLinks(node->subs[s], system, busIdForSubs, systemId));
    }
    return ncclSuccess;
  }

  // Resolve the PCI node that owns this link.
  int64_t localId;
  NCCLCHECK(busIdToInt64(parentBusId, &localId));
  localId = NCCL_TOPO_ID(systemId, localId);
  struct ncclTopoNode* localSwitch = NULL;
  NCCLCHECK(ncclTopoGetNode(system, &localSwitch, PCI, localId));
  if (localSwitch == NULL) {
    WARN("Add PCI Link error : could not find PCI SW %lx", localId);
    return ncclInternalError;
  }

  // Resolve the link target and connect the two nodes if the target exists.
  const char* target;
  NCCLCHECK(xmlGetAttrStr(node, "target", &target));
  int64_t targetId;
  NCCLCHECK(busIdToInt64(target, &targetId));
  struct ncclTopoNode* remoteSwitch = NULL;
  NCCLCHECK(ncclTopoGetNode(system, &remoteSwitch, PCI, NCCL_TOPO_ID(systemId, targetId)));
  if (remoteSwitch != NULL) NCCLCHECK(ncclTopoConnectNodes(localSwitch, remoteSwitch, LINK_LOC, LOC_BW));
  return ncclSuccess;
}
ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) {
if (strcmp(node->name, "c2c") == 0) {
struct ncclTopoNode* gpu = NULL;
@@ -626,6 +663,7 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL, 0));
NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL, 0));
NCCLCHECK(ncclTopoAddPciLinks(topNode, *topoSystem, NULL, 0));
NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem));
NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
@@ -668,6 +706,18 @@ static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrN
return ncclSuccess;
}
// Ask the PCI switch link kernel module to refresh its switch topology.
// The refresh is requested by reading from the sysfs entry below; the value
// read is discarded. A missing entry is not an error, and a failed read is
// only logged. Always returns ncclSuccess.
ncclResult_t ncclTopoRefreshBcmP2pLinks(void) {
  FILE* refreshFile = fopen("/sys/kernel/pci_switch_link/refresh_switch_toplogy", "r");
  if (refreshFile == NULL) return ncclSuccess;

  int value;
  if (fread(&value, sizeof(value), 1, refreshFile) != 1) {
    INFO(NCCL_GRAPH, "Failed to read refresh_switch_toplogy");
  }
  fclose(refreshFile);
  return ncclSuccess;
}
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
struct ncclXml* xml;
@@ -687,18 +737,17 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
}
// Auto-detect GPUs if needed
for (int r=0; r<comm->nRanks; r++) {
if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
struct ncclXmlNode* node;
NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
if (node == NULL) continue;
NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
NCCLCHECK(xmlSetAttrInt(node, "rank", r));
NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
}
NCCLCHECK(ncclTopoRefreshBcmP2pLinks());
// Detect only the GPU managed by this process. We'll get any others through XML fusion.
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
NCCLCHECK(int64ToBusId(comm->peerInfo[comm->rank].busId, busId));
struct ncclXmlNode* node;
NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
if (node) {
NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
NCCLCHECK(xmlSetAttrInt(node, "rank", comm->rank));
NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport));
}
// Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
// so we start with collnet so that it has precedence.
@@ -728,6 +777,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
NCCLCHECK(comm->ncclNet->getProperties(n, &props));
comm->netDeviceType = props.netDeviceType;
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
@@ -745,24 +795,46 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
NCCLCHECK(ncclTopoTrimXml(xml));
// XML topo fusion.
int* localRanks;
int localRank = -1, nLocalRanks = 0;
if (comm->MNNVL) {
// MNNVL clique support
char* mem;
NCCLCHECK(ncclCalloc(&mem, comm->clique.size * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*comm->cliqueRank);
memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES));
NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1));
NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->clique.ranks, comm->cliqueRank, comm->clique.size, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
struct ncclXml* cliqueXml;
NCCLCHECK(xmlAlloc(&cliqueXml, comm->clique.size*NCCL_TOPO_XML_MAX_NODES));
for (int i = 0; i < comm->clique.size; i++) {
struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i);
NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0));
NCCLCHECK(ncclTopoFuseXml(cliqueXml, peerXml));
nLocalRanks = comm->clique.size;
localRank = comm->cliqueRank;
localRanks = comm->clique.ranks;
} else {
// Intra-node fusion. Much of the comm is not initialized yet at this point so we need to do our own calculations.
NCCLCHECK(ncclCalloc(&localRanks, comm->nRanks));
for (int i = 0; i < comm->nRanks; i++) {
if (comm->peerInfo[i].hostHash == comm->peerInfo[comm->rank].hostHash) {
if (i == comm->rank)
localRank = nLocalRanks;
localRanks[nLocalRanks++] = i;
}
}
free(xml);
xml = cliqueXml;
}
char* mem;
NCCLCHECK(ncclCalloc(&mem, nLocalRanks * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*localRank);
memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES));
NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1));
NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, localRanks, localRank, nLocalRanks, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
if (comm->MNNVL) {
// Ensure that we have enough room when fusing topos from multiple nodes.
free(xml);
NCCLCHECK(xmlAlloc(&xml, nLocalRanks*NCCL_TOPO_XML_MAX_NODES));
} else {
// In the intra-node case there's no need to enlarge the topo xml.
xml->maxIndex = 0;
free(localRanks);
}
for (int i = 0; i < nLocalRanks; i++) {
struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i);
NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0));
NCCLCHECK(ncclTopoFuseXml(xml, peerXml));
}
free(mem);
xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
+1 -1
Zobrazit soubor
@@ -218,7 +218,7 @@ static ncclResult_t ncclTopoIdToNetDev(struct ncclTopoSystem* system, int64_t id
return ncclSuccess;
}
}
WARN("Could not find NET with id %lx\n", id);
WARN("Could not find NET with id %lx", id);
return ncclInternalError;
}
+15 -17
Zobrazit soubor
@@ -110,11 +110,9 @@ NCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2);
// Return the network overhead used by the tuning model.
//
// When the NetOverhead parameter is set (anything other than -2), its value
// scaled by .001 is returned. Otherwise the overhead is chosen from the CPU
// architecture and vendor cached on the communicator: 2.0 for x86 AMD, 1.0
// for x86 Intel and for everything else.
//
// The cached comm->cpuArch / comm->cpuVendor are used rather than querying
// the topology here: this function returns a float, so it has no way to
// report a failing query (an NCCLCHECK would return the error code as the
// overhead value).
static float getNetOverhead(struct ncclComm* comm) {
  if (ncclParamNetOverhead() != -2) return ncclParamNetOverhead() * .001;
  if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_X86 && comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_INTEL) return 1.0;
  if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_X86 && comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD) return 2.0;
  return 1.0;
}
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) {
@@ -317,6 +315,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
}
if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
if (a == NCCL_ALGO_RING && pEnable == 0) comm->ringbdw[c][p] = 0;
}
for (int c = 0; c < NCCL_NUM_FUNCTIONS; c++) {
@@ -415,15 +414,15 @@ static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = {
{ .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .6, .7, .8, .7, .7, .8, .9, .9 }
};
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup) {
float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
float lat = info->comm->latencies[info->coll][algorithm][protocol];
ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup) {
float bw = comm->bandwidths[coll][algorithm][protocol];
float lat = comm->latencies[coll][algorithm][protocol];
if (backup) {
*backup = false;
if (algorithm == NCCL_ALGO_RING && bw == 0.0f) {
/* try back up RING algorithm */
bw = info->comm->ringbdw[info->coll][protocol];
bw = comm->ringbdw[coll][protocol];
*backup = true;
}
}
@@ -431,15 +430,14 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
if (bw == 0) {
*time = -1.0; return ncclSuccess;
}
int logSize = log2i(info->nBytes>>6);
if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
&& info->coll == ncclFuncAllReduce && info->nBytes/(info->comm->nChannels*info->comm->nRanks) >= 64) {
lat *= info->comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
int logSize = log2i(nBytes>>6);
if (algorithm == NCCL_ALGO_TREE && logSize >= 0 && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && comm->nNodes > 1
&& coll == ncclFuncAllReduce && nBytes/(comm->nChannels*comm->nRanks) >= 64) {
lat *= comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
}
// Tree pipelining saves latency in aggregation cases
int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS);
*time = lat * latCount + (info->nBytes) / (1000 * bw);
int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_DEV_WORK_BATCH_COLLS);
*time = lat * latCount + nBytes / (1000 * bw);
return ncclSuccess;
}
+85 -53
Zobrazit soubor
@@ -272,56 +272,34 @@ ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml)
return ncclSuccess;
}
ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src) {
struct ncclXmlNode* topNode;
NCCLCHECK(xmlFindTag(dst, "system", &topNode));
// Merge the children of srcParent into dstParent.
// For each child of srcParent, xmlFindNode looks for a matching child directly
// under dstParent. When one exists the two are fused recursively; otherwise
// the whole source subtree is added under dstParent with xmlAddTree.
// dst is the XML document that owns dstParent and receives the added nodes.
static ncclResult_t xmlTopoFuseXmlRecursive(struct ncclXml* dst, struct ncclXmlNode* dstParent, struct ncclXmlNode* srcParent) {
  for (int s = 0; s < srcParent->nSubs; s++) {
    struct ncclXmlNode* srcChild = srcParent->subs[s];
    struct ncclXmlNode* match;
    NCCLCHECK(xmlFindNode(dstParent, srcChild, &match));
    if (match != NULL) {
      // Already present in the destination: only merge what lies below it.
      NCCLCHECK(xmlTopoFuseXmlRecursive(dst, match, srcChild));
      continue;
    }
    NCCLCHECK(xmlAddTree(dst, dstParent, srcChild));
  }
  return ncclSuccess;
}
if (topNode == NULL) {
ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src) {
struct ncclXmlNode* topNodeDst;
NCCLCHECK(xmlFindTag(dst, "system", &topNodeDst));
if (topNodeDst == NULL) {
xmlAddTree(dst, NULL, src->nodes);
return ncclSuccess;
}
// Fuse the CPUs with the first XML
struct ncclXmlNode* srcCpu;
NCCLCHECK(xmlFindTag(src, "cpu", &srcCpu));
while (srcCpu) {
const char* srcNumaId;
const char* srcHostHash;
NCCLCHECK(xmlGetAttr(srcCpu, "numaid", &srcNumaId));
if (srcNumaId == NULL) {
WARN("TopoFuseXmls : could not find CPU numa ID.");
return ncclInternalError;
}
xmlGetAttr(srcCpu, "host_hash", &srcHostHash);
if (srcHostHash == NULL)
srcHostHash = "0";
struct ncclXmlNode* topNodeSrc;
NCCLCHECK(xmlFindTag(src, "system", &topNodeSrc));
// Search through the destination for a duplicate. Note that
// this makes the complexity of this whole function O(n^2), but n
// is expected to be small.
struct ncclXmlNode* dstCpu;
NCCLCHECK(xmlFindTag(dst, "cpu", &dstCpu));
while (dstCpu) {
const char* dstNumaId;
const char* dstHostHash;
NCCLCHECK(xmlGetAttr(dstCpu, "numaid", &dstNumaId));
if (dstNumaId == NULL) {
WARN("TopoFuseXmls : could not find CPU numa ID.");
return ncclInternalError;
}
xmlGetAttr(dstCpu, "host_hash", &dstHostHash);
if (dstHostHash == NULL)
dstHostHash = "0";
if (strcmp(srcNumaId, dstNumaId) == 0 && strcmp(srcHostHash, dstHostHash) == 0)
break;
NCCLCHECK(xmlTopoFuseXmlRecursive(dst, topNodeDst, topNodeSrc));
NCCLCHECK(xmlFindNextTag(dst, "cpu", dstCpu, &dstCpu));
}
// Only add the CPU if no duplicate was found
if (dstCpu == NULL)
NCCLCHECK(xmlAddTree(dst, topNode, srcCpu));
NCCLCHECK(xmlFindNextTag(src, "cpu", srcCpu, &srcCpu));
}
return ncclSuccess;
}
@@ -335,6 +313,11 @@ ncclResult_t ncclTopoXmlLoadNvlink(FILE* file, struct ncclXml* xml, struct ncclX
return ncclSuccess;
}
// Load handler for a "pcilink" XML element.
// Delegates to xmlLoadSub with no sub-element handlers (NULL table, count 0).
// NOTE(review): how xmlLoadSub reacts to child elements when no handler is
// registered is not visible here -- confirm against its definition.
// Returns ncclSuccess, or the error propagated by NCCLCHECK.
ncclResult_t ncclTopoXmlLoadPciLink(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
return ncclSuccess;
}
ncclResult_t ncclTopoXmlLoadC2c(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
return ncclSuccess;
@@ -357,8 +340,8 @@ ncclResult_t ncclTopoXmlLoadNic(FILE* file, struct ncclXml* xml, struct ncclXmlN
}
ncclResult_t ncclTopoXmlLoadPci(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "gpu", ncclTopoXmlLoadGpu }, { "nic", ncclTopoXmlLoadNic} };
NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 3));
struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "gpu", ncclTopoXmlLoadGpu }, { "nic", ncclTopoXmlLoadNic}, { "pcilink", ncclTopoXmlLoadPciLink} };
NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 4));
return ncclSuccess;
}
@@ -423,6 +406,28 @@ static ncclResult_t getPciPath(const char* busId, char** path) {
return ncclSuccess;
}
#include <dirent.h>
// List the peers the PCI switch at busId is linked to, as exposed by the
// pci_switch_link kernel module under
// /sys/kernel/pci_switch_link/virtual_switch_links/<busid>.
//
// busId  : bus id of the switch; its first BUSID_SIZE-1 characters are
//          lower-cased into the directory name.
// nlinks : set to the number of peers found (0 when the directory is absent).
// peers  : set to NULL, or to a buffer of *nlinks consecutive entries of
//          BUSID_SIZE bytes each, every entry a NUL-terminated bus id. The
//          buffer is grown with ncclRealloc; releasing it is left to the caller.
//
// Directory entries whose name is not BUSID_SIZE-1 characters long, or for
// which getPciPath reports ncclSystemError, are skipped.
// Returns ncclSuccess, or the ncclRealloc error if growing the buffer fails.
static ncclResult_t getBcmLinks(const char* busId, int* nlinks, char** peers) {
  *nlinks = 0;
  *peers = NULL;
  char dirPath[] = "/sys/kernel/pci_switch_link/virtual_switch_links/0000:00:00.0";
  memcpylower(dirPath+sizeof("/sys/kernel/pci_switch_link/virtual_switch_links/")-1, busId, BUSID_SIZE-1);
  DIR *dir = opendir(dirPath);
  if (dir == NULL) return ncclSuccess; // Module not loaded or no links for this switch.

  struct dirent* file;
  while ((file = readdir(dir)) != NULL) {
    if (strlen(file->d_name) != BUSID_SIZE-1) continue;
    // Only keep entries that resolve to a PCI path; the path itself is not needed.
    char* path;
    if (getPciPath(file->d_name, &path) == ncclSystemError) continue;
    free(path);
    ncclResult_t res = ncclRealloc(peers, (*nlinks)*BUSID_SIZE, ((*nlinks)+1)*BUSID_SIZE);
    if (res != ncclSuccess) {
      // Release the directory handle before bailing out; NCCLCHECK then logs
      // and returns the error, so execution never continues past it here.
      closedir(dir);
      NCCLCHECK(res);
    }
    // The name is exactly BUSID_SIZE-1 characters, so this copy includes its NUL.
    memcpy((*peers)+BUSID_SIZE*(*nlinks)++, file->d_name, BUSID_SIZE);
  }
  closedir(dir);
  return ncclSuccess;
}
ncclResult_t ncclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) {
char filePath[PATH_MAX];
sprintf(filePath, "%s/%s", path, fileName);
@@ -541,10 +546,11 @@ ncclResult_t ncclTopoGetPciNode(struct ncclXml* xml, const char* busId, struct n
// Trailing characters are not accepted: the string must be exactly 12 characters long.

// Return 1 when c is an ASCII hexadecimal digit (0-9, a-f, A-F), 0 otherwise.
int isHex(char c) { return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')); }

// Return 1 when bdf is a bus id of the exact form "hhhh:hh:hh.h" (h being a
// hexadecimal digit), 0 otherwise.
// bdf must be a NUL-terminated string; it is not modified.
int checkBDFFormat(char* bdf) {
  // Check the length before indexing so short strings are never read past their end.
  if (strlen(bdf) != 12) return 0;
  if ((bdf[4] != ':') || (bdf[7] != ':') || (bdf[10] != '.')) return 0;
  if ((isHex(bdf[0]) == 0) || (isHex(bdf[1]) == 0) || (isHex(bdf[2]) == 0) || (isHex(bdf[3]) == 0) ||
      (isHex(bdf[5]) == 0) || (isHex(bdf[6]) == 0) || (isHex(bdf[8]) == 0) || (isHex(bdf[9]) == 0) ||
      (isHex(bdf[11]) == 0)) return 0;
  return 1;
}
@@ -608,6 +614,24 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
NCCLCHECK(xmlSetAttr(pciNode, "link_width", ""));
}
}
const char* vendor;
NCCLCHECK(xmlGetAttr(pciNode, "vendor", &vendor));
if (vendor != NULL && strcmp(vendor, "0x1000") == 0) { // BCM switch, look for P2P connections
int nlinks;
char* peers;
NCCLCHECK(getBcmLinks(busId, &nlinks, &peers));
for (int l=0; l<nlinks; l++) {
char* target = peers+l*BUSID_SIZE;
struct ncclXmlNode* linkNode;
NCCLCHECK(xmlGetSubKv(pciNode, "pcilink", &linkNode, "target", target));
if (linkNode == NULL) {
NCCLCHECK(xmlAddNode(xml, pciNode, "pcilink", &linkNode));
NCCLCHECK(xmlSetAttr(linkNode, "target", target));
}
}
}
struct ncclXmlNode* parent = pciNode->parent;
if (parent == NULL) {
if (path) {
@@ -911,25 +935,33 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
return ncclSuccess;
}
ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node) {
ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node, int* keep) {
const char* str;
NCCLCHECK(xmlGetAttr(node, "keep", &str));
if (str && strcmp(str, "1") == 0) {
NCCLCHECK(xmlUnsetAttr(node, "keep"));
*keep = 1;
} else {
// Copy nSubs and subs as they could change as we trim recursively.
struct ncclXmlNode* subs[MAX_SUBS];
int nSubs = node->nSubs;
memcpy(subs, node->subs, node->nSubs*sizeof(struct ncclXmlNode*));
*keep = 0;
for (int s=0; s<nSubs; s++) {
NCCLCHECK(ncclTopoTrimXmlRec(subs[s]));
int k = 0;
NCCLCHECK(ncclTopoTrimXmlRec(subs[s], &k));
*keep += k;
}
if (*keep == 0 && // Trim PCI switches or CPU with no used GPU/NIC under them.
(strcmp(node->name, "pci") == 0 || strcmp(node->name, "cpu") == 0)) {
NCCLCHECK(xmlRemoveNode(node));
}
if (node->nSubs == 0) NCCLCHECK(xmlRemoveNode(node));
}
return ncclSuccess;
}
ncclResult_t ncclTopoTrimXml(struct ncclXml* xml) {
NCCLCHECK(ncclTopoTrimXmlRec(xml->nodes));
int keep = 0;
NCCLCHECK(ncclTopoTrimXmlRec(xml->nodes, &keep));
return ncclSuccess;
}
+24 -1
Zobrazit soubor
@@ -55,7 +55,7 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
/* Remove unneeded parts */
ncclResult_t ncclTopoTrimXml(struct ncclXml* xml);
/* Fuse multiple system XMLs into one, skipping duplicate CPUs */
/* Fuse multiple system XMLs into one, skipping duplicate entries */
ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src);
/* Relocate pointers in XML to (de-)serialize the structure */
ncclResult_t ncclTopoConvertXml(struct ncclXml* xml, uintptr_t base, int exp);
@@ -172,6 +172,29 @@ static ncclResult_t xmlFindTagKv(struct ncclXml* xml, const char* tagName, struc
return ncclSuccess;
}
// Look among the direct children of parentNode for one equivalent to
// searchNode: same name, same type, same number of attributes, and every
// attribute key of searchNode carrying the same value in the child.
// Descendants further down the tree are not visited.
//
// On return *node is the first matching child, or NULL if there is none.
// The result is ncclSuccess unless an attribute lookup fails, in which case
// that error is propagated by NCCLCHECK.
static ncclResult_t xmlFindNode(struct ncclXmlNode* parentNode, struct ncclXmlNode* searchNode, struct ncclXmlNode** node) {
  *node = NULL;
  for (int s = 0; s < parentNode->nSubs; s++) {
    struct ncclXmlNode* candidate = parentNode->subs[s];

    // Cheap structural filters first.
    if (strcmp(candidate->name, searchNode->name) != 0) continue;
    if (candidate->type != searchNode->type) continue;
    if (candidate->nAttrs != searchNode->nAttrs) continue;

    // Compare attribute by attribute, stopping at the first mismatch.
    int allMatch = 1;
    for (int a = 0; allMatch && a < searchNode->nAttrs; a++) {
      const char* candidateValue;
      NCCLCHECK(xmlGetAttr(candidate, searchNode->attrs[a].key, &candidateValue));
      allMatch = (candidateValue != NULL) && (strcmp(candidateValue, searchNode->attrs[a].value) == 0);
    }

    if (allMatch) {
      *node = candidate;
      return ncclSuccess;
    }
  }
  return ncclSuccess;
}
static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, const char* value) {
int index;
NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+195 -66
Zobrazit soubor
@@ -10,6 +10,7 @@
#include "transport.h"
#include "channel.h"
#include <assert.h>
#include "bootstrap.h"
__thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting
__thread ncclResult_t ncclGroupError = ncclSuccess;
@@ -31,6 +32,7 @@ ncclResult_t ncclAsyncLaunch(
) {
ncclResult_t ret = ncclSuccess;
job->destroyFlag = comm->destroyFlag;
if (ncclGroupDepth == 0) {
ret = func(job);
if (ret != ncclSuccess && undo) undo(job);
@@ -40,11 +42,15 @@ ncclResult_t ncclAsyncLaunch(
job->undo = undo;
job->destructor = destructor;
job->abortFlag = comm->abortFlag;
job->abortFlagDev = comm->abortFlagDev;
job->childAbortFlag = comm->childAbortFlag;
job->childAbortFlagDev = comm->childAbortFlagDev;
job->state = ncclGroupJobRunning;
job->comm = comm;
/* check if there are blocking and nonblocking comms at the same time in group. */
if (ncclGroupBlocking == -1) {
if (comm->destroyFlag) {
ncclGroupBlocking = 1;
} else if (ncclGroupBlocking == -1) {
/* first met communicator */
ncclGroupBlocking = comm->config.blocking;
} else if (ncclGroupBlocking != comm->config.blocking) {
@@ -98,11 +104,23 @@ exit:
return ret;
}
NCCL_API(ncclResult_t, ncclGroupSimulateEnd, ncclSimInfo_t* simInfo);
// Public API entry point that ends the current group while passing a
// caller-provided ncclSimInfo_t down to ncclGroupEndInternal().
//
// simInfo: forwarded unchanged to ncclGroupEndInternal().
// Returns the result of ncclGroupEndInternal().
ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo) {
ncclResult_t ret = ncclSuccess;
NVTX3_FUNC_RANGE_IN(nccl_domain);
// On failure control jumps to exit, so the TRACE_CALL below is only reached
// on success.
NCCLCHECKGOTO(ncclGroupEndInternal(simInfo), ret, exit);
TRACE_CALL("ncclGroupSimulateEnd()");
exit:
return ret;
}
// Async job used to run a pre-connection step for one communicator.
struct ncclPreconnectJob {
// Generic job header. The job functions receive a ncclAsyncJob* and cast it
// back to ncclPreconnectJob*, so this must remain the first member.
struct ncclAsyncJob base;
// Communicator whose connections are being established.
struct ncclComm* comm;
// Indexed by algorithm id (NCCL_NUM_ALGORITHMS entries): true when that
// algorithm still needs connecting. Read and freed by ncclCollPreconnectFunc.
bool* algoNeedConnect;
};
ncclResult_t ncclPreconnectFunc(struct ncclAsyncJob* job_) {
ncclResult_t ncclP2PPreconnectFunc(struct ncclAsyncJob* job_) {
struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_;
struct ncclComm* comm = job->comm;
CUDACHECK(cudaSetDevice(comm->cudaDev));
@@ -111,6 +129,57 @@ ncclResult_t ncclPreconnectFunc(struct ncclAsyncJob* job_) {
return ncclSuccess;
}
// Async job body: connect peers / set up buffers for every collective
// algorithm flagged in job->algoNeedConnect.
//
// job_ must point to a ncclPreconnectJob. The function takes ownership of
// job->algoNeedConnect and frees it on every exit path, success or failure.
//
// Returns ncclSuccess, the first error reported by a setup routine, or
// ncclInternalError if a flag is set for an algorithm with no handler here.
ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) {
  struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_;
  struct ncclComm* comm = job->comm;
  ncclResult_t ret = ncclSuccess;

  // Use the GOTO form: a plain CUDACHECK would return straight away and skip
  // the free() of algoNeedConnect below, leaking it.
  CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
  if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);

  for (int i = 0; i < NCCL_NUM_ALGORITHMS; ++i) {
    if (!job->algoNeedConnect[i]) continue;
    switch (i) {
      case NCCL_ALGO_RING: {
        NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail);
        break;
      }
      case NCCL_ALGO_TREE: {
        NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail);
        break;
      }
      case NCCL_ALGO_NVLS: {
        /* If we are using NVLS_TREE algo, we must mark NVLS algo to set up
         * NVLS intra-node buffer */
        NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail);
        break;
      }
      case NCCL_ALGO_NVLS_TREE: {
        NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail);
        break;
      }
      case NCCL_ALGO_COLLNET_CHAIN: {
        NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail);
        break;
      }
      case NCCL_ALGO_COLLNET_DIRECT: {
        NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
        break;
      }
      default: {
        // A flag was raised for an algorithm this function cannot connect.
        ret = ncclInternalError;
        goto fail;
      }
    }
  }

exit:
  free(job->algoNeedConnect);
  // Do not leave a dangling pointer behind in the job.
  job->algoNeedConnect = nullptr;
  return ret;
fail:
  goto exit;
}
static ncclResult_t doLaunches(struct ncclComm* head) {
ncclResult_t result = ncclSuccess;
struct ncclComm* cliqueComm0 = head->intraComm0;
@@ -124,7 +193,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
struct ncclComm* comm = cliqueHead;
bool capturingYes = false, capturingNo = false;
do {
(ncclCudaGraphValid(comm->tasks.capturingGraph) ? capturingYes : capturingNo) = true;
(ncclCudaGraphValid(comm->planner.capturingGraph) ? capturingYes : capturingNo) = true;
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure);
NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure);
if (useBarrier) ncclCommIntraBarrierIn(comm, 1);
@@ -150,19 +219,19 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
// Barrier reduction result tells us if this was the final round.
moreRounds = 0 != ncclCommIntraBarrierOut(comm);
} else {
moreRounds |= comm->unlaunchedPlansHead != nullptr;
moreRounds |= comm->planner.unlaunchedPlansHead != nullptr;
}
if (moreRounds) {
// Pop next unlaunched kernel
struct ncclKernelPlan* plan = comm->unlaunchedPlansHead;
struct ncclKernelPlan* plan = comm->planner.unlaunchedPlansHead;
if (plan != nullptr) {
comm->unlaunchedPlansHead = plan->next;
comm->planner.unlaunchedPlansHead = plan->next;
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure);
NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure);
NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure);
}
// Barrier reduction input indicates if we require further rounds.
if (useBarrier) ncclCommIntraBarrierIn(comm, comm->unlaunchedPlansHead != nullptr ? 1 : 0);
if (useBarrier) ncclCommIntraBarrierIn(comm, comm->planner.unlaunchedPlansHead != nullptr ? 1 : 0);
if (plan != nullptr) {
NCCLCHECKGOTO(ncclLaunchKernelAfter_NoCuda(comm, plan), result, failure);
}
@@ -210,37 +279,29 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
// is needed.
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
for (int i = 0; i < comm->nRanks; i++) {
comm->tasks.peers[i].sendSeen = false;
comm->tasks.peers[i].recvSeen = false;
comm->connectSend[i] = 0UL;
comm->connectRecv[i] = 0UL;
}
comm->unlaunchedPlansHead = nullptr;
// Reclaim abandoned kernel plan memory. Note ncclWork structs were already
// reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`.
while (!ncclIntruQueueEmpty(&comm->planQueue)) {
struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue);
while (!ncclIntruQueueEmpty(&comm->planner.planQueue)) {
struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planner.planQueue);
// Persistent plans will be reclaimed via the callbackQueue when the
// graph drops its UserObject reference.
if (!plan->persistent) {
for (int c = 0; c < MAXCHANNELS; c++) {
while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) {
struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue);
ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop);
}
while (!ncclIntruQueueEmpty(&plan->proxyOpQueue)) {
struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->proxyOpQueue);
ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop);
}
ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
}
}
// Reset comm->tasks to empty.
comm->tasks.nTasksColl = 0;
comm->tasks.nTasksP2p = 0;
comm->tasks.workBytesTotal = 0;
comm->tasks.streams = nullptr;
ncclIntruQueueConstruct(&comm->tasks.collQueue);
for (int i = 0; i < comm->nRanks; i++) {
ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue);
ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue);
{ // Reset comm->planner to empty.
ncclKernelPlanner::Peer* tmp = comm->planner.peers;
memset(&comm->planner, 0, sizeof(comm->planner));
comm->planner.peers = tmp;
memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0]));
}
if (!comm->config.blocking)
@@ -260,37 +321,10 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
return;
}
static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
int savedDev;
static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsMain, volatile bool *groupAbortFlag) {
ncclResult_t ret = ncclSuccess;
bool jobsDone = false;
bool errorJobAbortFlag = false;
struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_;
struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr;
struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr;
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsMain = gjob->asyncJobsPtr;
volatile bool *groupAbortFlag = gjob->abortFlagPtr;
CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail);
if (groupCommPreconnectHeadMain != nullptr) {
struct ncclComm* comm = groupCommPreconnectHeadMain;
do {
struct ncclPreconnectJob* job;
NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
job->base.func = ncclPreconnectFunc;
job->base.undo = nullptr;
job->base.destructor = free;
job->base.state = ncclGroupJobRunning;
job->base.abortFlag = comm->abortFlag;
job->comm = comm;
ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
struct ncclComm* next = comm->preconnectNext;
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
comm = next;
} while (comm != nullptr);
}
if (!ncclIntruQueueEmpty(asyncJobsMain)) {
struct ncclAsyncJob* job = ncclIntruQueueHead(asyncJobsMain);
@@ -321,9 +355,13 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
assert(state == ncclGroupJobJoined);
}
if (__atomic_load_n(groupAbortFlag, __ATOMIC_RELAXED) || errorJobAbortFlag == true) {
__atomic_store_n(job->abortFlag, 1, __ATOMIC_RELAXED);
if (job->childAbortFlag) __atomic_store_n(job->childAbortFlag, 1, __ATOMIC_RELAXED);
if (!job->destroyFlag && (__atomic_load_n(groupAbortFlag, __ATOMIC_ACQUIRE) || errorJobAbortFlag == true)) {
__atomic_store_n(job->abortFlag, 1, __ATOMIC_RELEASE);
__atomic_store_n(job->abortFlagDev, 1, __ATOMIC_RELEASE);
if (job->childAbortFlag) {
__atomic_store_n(job->childAbortFlag, 1, __ATOMIC_RELEASE);
__atomic_store_n(job->childAbortFlagDev, 1, __ATOMIC_RELEASE);
}
}
job = job->next;
@@ -335,17 +373,86 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
if (ret != ncclSuccess) goto fail;
}
if (groupCommHeadMain != nullptr) {
NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail);
}
while (!ncclIntruQueueEmpty(asyncJobsMain)) {
struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
if (job->comm && !job->comm->config.blocking)
if (!job->destroyFlag && job->comm && !job->comm->config.blocking)
(void) ncclCommSetAsyncError(job->comm, ret);
if (job->destructor) job->destructor((void*)job);
}
exit:
return ret;
fail:
goto exit;
}
static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInfo = NULL) {
int savedDev;
ncclResult_t ret = ncclSuccess;
struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_;
struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr;
struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr;
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsMain = gjob->asyncJobsPtr;
bool *groupAbortFlag = gjob->abortFlagPtr;
CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail);
if (!simInfo && groupCommPreconnectHeadMain != nullptr) {
struct ncclComm* comm = groupCommPreconnectHeadMain;
do {
struct ncclPreconnectJob* job;
NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
job->base.func = ncclP2PPreconnectFunc;
job->base.undo = nullptr;
job->base.destructor = free;
job->base.state = ncclGroupJobRunning;
job->base.abortFlag = comm->abortFlag;
job->base.abortFlagDev = comm->abortFlagDev;
job->comm = comm;
ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
struct ncclComm* next = comm->preconnectNext;
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
comm = next;
} while (comm != nullptr);
}
NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail);
/* Connect channels at runtime if cumem is supported */
if (groupCommHeadMain != nullptr) {
struct ncclComm* comm = groupCommHeadMain;
do {
bool needConnect = false;
bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);
NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);
if (comm->cuMemSupport && needConnect) {
struct ncclPreconnectJob* job;
NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
job->base.func = ncclCollPreconnectFunc;
job->base.undo = nullptr;
job->base.destructor = free;
job->base.state = ncclGroupJobRunning;
job->base.abortFlag = comm->abortFlag;
job->comm = comm;
NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
}
comm = comm->groupNext;
} while (comm);
NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail);
}
if ((!simInfo) && (groupCommHeadMain != nullptr)) {
NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail);
}
while (groupCommHeadMain != nullptr) {
struct ncclComm* comm = groupCommHeadMain;
struct ncclComm* next = comm->groupNext;
@@ -365,8 +472,17 @@ fail:
goto exit;
}
ncclResult_t ncclGroupEndInternal() {
// Single-argument adapter around groupLaunch() so that it can be installed
// as an async job function. groupLaunch() takes an extra, defaulted simInfo
// parameter; this wrapper always leaves it at its default (NULL).
static ncclResult_t groupLaunchNonBlocking(struct ncclAsyncJob *job_) {
return groupLaunch(job_ /* simInfo = NULL (default) */);
}
ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
ncclResult_t ret = ncclSuccess;
ncclSimInfo_t internalSimInfo = NCCL_SIM_INFO_INITIALIZER;
ncclSimInfo_t* internalSimInfoPtr = NULL;
size_t realSize = 0;
internalSimInfo.magic = 0;
if (ncclGroupDepth == 0) {
WARN("ncclGroupEnd: not in a group call.");
@@ -378,6 +494,18 @@ ncclResult_t ncclGroupEndInternal() {
if ((ret = ncclGroupError) != ncclSuccess) goto fail;
if (simInfo) {
memcpy((void*)&realSize, (void*)&simInfo->size, sizeof(size_t));
realSize = realSize > sizeof(ncclSimInfo_t) ? sizeof(ncclSimInfo_t) : realSize;
memcpy((void*)&internalSimInfo, (void*)simInfo, realSize);
if (internalSimInfo.magic != 0x74685283) {
WARN("ncclSimInfo_t argument not initialized via NCCL_SIM_INFO_INITIALIZER");
ret = ncclInvalidArgument;
goto fail;
}
internalSimInfoPtr = &internalSimInfo;
}
if (ncclGroupCommHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs) || ncclGroupCommPreconnectHead != nullptr) {
ncclGroupJobMain.groupCommHeadPtr = &ncclGroupCommHead;
ncclGroupJobMain.groupCommPreconnectHeadPtr = &ncclGroupCommPreconnectHead;
@@ -410,12 +538,13 @@ ncclResult_t ncclGroupEndInternal() {
} while (comm);
}
ncclGroupJobMainPtr->base.func = groupLaunch;
ncclGroupJobMainPtr->base.func = groupLaunchNonBlocking;
SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail);
ret = ncclInProgress;
} else {
/* blocking group */
NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base), ret, fail);
NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base, internalSimInfoPtr), ret, fail);
if (simInfo) memcpy((void*)simInfo, (void*)internalSimInfoPtr, realSize);
groupResetJobState(ncclGroupJobMainPtr);
}
}
@@ -438,7 +567,7 @@ ncclResult_t ncclGroupJobComplete(struct ncclGroupJob* groupJob) {
ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob) {
if (groupJob && groupJob->initialized) {
__atomic_store_n(groupJob->abortFlagPtr, true, __ATOMIC_RELAXED);
__atomic_store_n(groupJob->abortFlagPtr, true, __ATOMIC_RELEASE);
NCCLCHECK(ncclGroupJobComplete(groupJob));
}
return ncclSuccess;
-47
Zobrazit soubor
@@ -1,47 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_ALIGN_H_
#define NCCL_ALIGN_H_
#define DIVUP(x, y) \
(((x)+(y)-1)/(y))
#define ROUNDUP(x, y) \
(DIVUP((x), (y))*(y))
#define ALIGN_POWER(x, y) \
((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x))))
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
#if !__CUDA_ARCH__
#ifndef __host__
#define __host__
#endif
#ifndef __device__
#define __device__
#endif
#endif
template<typename X, typename Y, typename Z = decltype(X()+Y())>
__host__ __device__ constexpr Z divUp(X x, Y y) {
return (x+y-1)/y;
}
template<typename X, typename Y, typename Z = decltype(X()+Y())>
__host__ __device__ constexpr Z roundUp(X x, Y y) {
return (x+y-1) - (x+y-1)%y;
}
// assumes second argument is a power of 2
template<typename X, typename Z = decltype(X()+int())>
__host__ __device__ constexpr Z alignUp(X x, int a) {
return (x+a-1) & Z(-a);
}
#endif
+73 -52
Zobrazit soubor
@@ -9,7 +9,7 @@
#include "nccl.h"
#include "checks.h"
#include "align.h"
#include "bitops.h"
#include "utils.h"
#include "p2p.h"
#include <sys/mman.h>
@@ -19,18 +19,25 @@
uint64_t clockNano(); // from utils.h with which we have a circular dependency
// Element size used by the allocation helpers: sizeof(T) for ordinary types.
template<typename T>
constexpr size_t ncclSizeOfT() { return sizeof(T); }
// sizeof(void) is not valid C++; treat void as a 1-byte element so that
// element counts for void buffers are byte counts.
template<>
constexpr size_t ncclSizeOfT<void>() { return 1; }
template <typename T>
ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
ncclResult_t result = ncclSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
*ptr = nullptr;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish);
memset(*ptr, 0, nelem*sizeof(T));
if (nelem > 0) {
CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*ncclSizeOfT<T>(), cudaHostAllocMapped), result, finish);
memset(*ptr, 0, nelem*ncclSizeOfT<T>());
}
finish:
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem*sizeof(T));
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA host alloc %ld bytes", nelem*ncclSizeOfT<T>());
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
return result;
}
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
@@ -42,14 +49,18 @@ inline ncclResult_t ncclCudaHostFree(void* ptr) {
template <typename T>
ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
void* p = malloc(nelem*sizeof(T));
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
return ncclSystemError;
if (nelem > 0) {
void* p = malloc(nelem*ncclSizeOfT<T>());
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT<T>());
return ncclSystemError;
}
//INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), p);
memset(p, 0, nelem*ncclSizeOfT<T>());
*ptr = (T*)p;
} else {
*ptr = NULL;
}
//INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p);
memset(p, 0, nelem*sizeof(T));
*ptr = (T*)p;
return ncclSuccess;
}
#define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
@@ -60,16 +71,16 @@ ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
if (nelem == oldNelem) return ncclSuccess;
T* oldp = *ptr;
T* p = (T*)malloc(nelem*sizeof(T));
T* p = (T*)malloc(nelem*ncclSizeOfT<T>());
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT<T>());
return ncclSystemError;
}
memcpy(p, oldp, oldNelem*sizeof(T));
memcpy(p, oldp, oldNelem*ncclSizeOfT<T>());
free(oldp);
memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T));
memset(p+oldNelem, 0, (nelem-oldNelem)*ncclSizeOfT<T>());
*ptr = (T*)p;
INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr);
INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*ncclSizeOfT<T>(), nelem*ncclSizeOfT<T>(), *ptr);
return ncclSuccess;
}
@@ -111,7 +122,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
if (handlep) *handlep = handle;
TRACE(NCCL_ALLOC, "CuMem Alloc Size %zi pointer %p handle %llx", size, *ptr, handle);
TRACE(NCCL_ALLOC, "CuMem Alloc Size %zu pointer %p handle %llx", size, *ptr, handle);
return result;
}
@@ -123,7 +134,7 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) {
CUCHECK(cuMemRetainAllocationHandle(&handle, ptr));
CUCHECK(cuMemRelease(handle));
CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
TRACE(NCCL_ALLOC, "CuMem Free Size %zi pointer %p handle 0x%llx", size, ptr, handle);
TRACE(NCCL_ALLOC, "CuMem Free Size %zu pointer %p handle 0x%llx", size, ptr, handle);
CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size));
CUCHECK(cuMemRelease(handle));
CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
@@ -151,15 +162,17 @@ ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, in
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
*ptr = nullptr;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
if (ncclCuMemEnable()) {
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish);
} else {
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
if (nelem > 0) {
if (ncclCuMemEnable()) {
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT<T>()), result, finish);
} else {
CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT<T>()), result, finish);
}
}
finish:
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T));
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA malloc %ld bytes", nelem*ncclSizeOfT<T>());
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
return result;
}
#define ncclCudaMalloc(...) ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
@@ -170,21 +183,23 @@ ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, in
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
*ptr = nullptr;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
// Need a side stream so as not to interfere with graph capture.
cudaStream_t stream;
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
if (ncclCuMemEnable()) {
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish);
} else {
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
if (nelem > 0) {
// Need a side stream so as not to interfere with graph capture.
cudaStream_t stream;
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
if (ncclCuMemEnable()) {
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT<T>()), result, finish);
} else {
CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT<T>()), result, finish);
}
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT<T>(), stream), result, finish);
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
}
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
finish:
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T));
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc %ld bytes", nelem*ncclSizeOfT<T>());
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
return result;
}
#define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
@@ -195,16 +210,18 @@ ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
*ptr = nullptr;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
if (ncclCuMemEnable()) {
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish);
} else {
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
if (nelem > 0) {
if (ncclCuMemEnable()) {
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT<T>()), result, finish);
} else {
CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT<T>()), result, finish);
}
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT<T>(), stream), result, finish);
}
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
finish:
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem*sizeof(T));
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc async %ld bytes", nelem*ncclSizeOfT<T>());
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
return result;
}
#define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__)
@@ -230,7 +247,7 @@ ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stre
ncclResult_t result = ncclSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*sizeof(T), cudaMemcpyDefault, stream), result, finish);
CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*ncclSizeOfT<T>(), cudaMemcpyDefault, stream), result, finish);
finish:
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
return result;
@@ -256,13 +273,17 @@ finish:
// allocated on separate pages as those pages will be marked DONTFORK
// and if they are shared, that could cause a crash in a child process
inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
size_t page_size = sysconf(_SC_PAGESIZE);
void* p;
int size_aligned = ROUNDUP(size, page_size);
int ret = posix_memalign(&p, page_size, size_aligned);
if (ret != 0) return ncclSystemError;
memset(p, 0, size);
*ptr = p;
if (size > 0) {
size_t page_size = sysconf(_SC_PAGESIZE);
void* p;
int size_aligned = ROUNDUP(size, page_size);
int ret = posix_memalign(&p, page_size, size_aligned);
if (ret != 0) return ncclSystemError;
memset(p, 0, size);
*ptr = p;
} else {
*ptr = NULL;
}
INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr);
return ncclSuccess;
}
+277
Zobrazit soubor
@@ -0,0 +1,277 @@
/*************************************************************************
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_BITOPS_H_
#define NCCL_BITOPS_H_
#include <stdint.h>
#if !__NVCC__
#ifndef __host__
#define __host__
#endif
#ifndef __device__
#define __device__
#endif
#endif
// Ceiling division ((x)+(y)-1)/(y). y is expanded (and so evaluated) twice.
#define DIVUP(x, y) \
(((x)+(y)-1)/(y))

// x rounded up to a multiple of y. y is evaluated more than once.
#define ROUNDUP(x, y) \
(DIVUP((x), (y))*(y))

// When x > y: x rounded up to a multiple of y. Otherwise: y/(y/x).
// NOTE(review): presumably intended for power-of-two arguments, where the
// second case yields x itself; x == 0 divides by zero. Confirm with callers.
#define ALIGN_POWER(x, y) \
((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x))))

// Statement macro: reassigns `size` to itself rounded up to a multiple of
// `align`. The expansion already ends with ';', and `size` is used without
// parentheses, so it must be a plain lvalue.
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
// Ceiling division computed as (x+y-1)/y.
// The result type Z defaults to the type of X()+Y().
template<typename X, typename Y, typename Z = decltype(X()+Y())>
__host__ __device__ constexpr Z divUp(X x, Y y) {
return (x+y-1)/y;
}
// x rounded up to a multiple of y: takes x+y-1 and subtracts its remainder
// modulo y. The result type Z defaults to the type of X()+Y().
template<typename X, typename Y, typename Z = decltype(X()+Y())>
__host__ __device__ constexpr Z roundUp(X x, Y y) {
return (x+y-1) - (x+y-1)%y;
}
// x rounded down to a multiple of y (x minus its remainder modulo y).
// The result type Z defaults to the type of X()+Y().
template<typename X, typename Y, typename Z = decltype(X()+Y())>
__host__ __device__ constexpr Z roundDown(X x, Y y) {
return x - x%y;
}
// x rounded up to a multiple of a, done with a bit mask: adds a-1 and clears
// the low bits with Z(-a).
// assumes second argument is a power of 2
template<typename X, typename Z = decltype(X()+int())>
__host__ __device__ constexpr Z alignUp(X x, int a) {
return (x + a-1) & Z(-a);
}
// x rounded down to a multiple of a by clearing the low bits with Z(-a).
// assumes second argument is a power of 2
template<typename X, typename Z = decltype(X()+int())>
__host__ __device__ constexpr Z alignDown(X x, int a) {
return x & Z(-a);
}
// Number of set bits in x.
// The implementation is picked from sizeof(Int): device code (__CUDA_ARCH__)
// uses __popc/__popcll, host code uses the __builtin_popcount family.
// Integer types wider than unsigned long long are rejected at compile time;
// the `return -1` after each static_assert is never executed.
// NOTE(review): x is cast to an unsigned type at least as wide as Int, so a
// negative value of a narrower signed type would be sign-extended first;
// presumably callers pass unsigned masks - confirm.
template<typename Int>
inline __host__ __device__ int countOneBits(Int x) {
#if __CUDA_ARCH__
if (sizeof(Int) <= sizeof(unsigned int)) {
return __popc((unsigned int)x);
} else if (sizeof(Int) <= sizeof(unsigned long long)) {
return __popcll((unsigned long long)x);
} else {
static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size.");
return -1;
}
#else
if (sizeof(Int) <= sizeof(unsigned int)) {
return __builtin_popcount((unsigned int)x);
} else if (sizeof(Int) <= sizeof(unsigned long)) {
return __builtin_popcountl((unsigned long)x);
} else if (sizeof(Int) <= sizeof(unsigned long long)) {
return __builtin_popcountll((unsigned long long)x);
} else {
static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size.");
return -1;
}
#endif
}
// Returns index of first one bit or returns -1 if mask is zero.
// The ffs-style primitive is chosen from sizeof(Int): __ffs/__ffsll in device
// code (__CUDA_ARCH__), __builtin_ffs/ffsl/ffsll on the host. Types wider
// than long long are rejected at compile time by the static_assert.
template<typename Int>
inline __host__ __device__ int firstOneBit(Int mask) {
int i;
#if __CUDA_ARCH__
if (sizeof(Int) <= sizeof(int)) {
i = __ffs((int)mask);
} else if (sizeof(Int) <= sizeof(long long)) {
i = __ffsll((long long)mask);
} else {
static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size.");
}
#else
if (sizeof(Int) <= sizeof(int)) {
i = __builtin_ffs((int)mask);
} else if (sizeof(Int) <= sizeof(long)) {
i = __builtin_ffsl((long)mask);
} else if (sizeof(Int) <= sizeof(long long)) {
i = __builtin_ffsll((long long)mask);
} else {
static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size.");
}
#endif
// The ffs primitives are 1-based and yield 0 for an empty mask, hence the -1.
return i-1;
}
// Clear the lowest set bit of *mask and return the index that bit had.
// The index is computed by firstOneBit() on the value *mask held on entry,
// so an empty mask is left unchanged and reported the way firstOneBit()
// reports it.
template<typename Int>
inline __host__ __device__ int popFirstOneBit(Int* mask) {
  const Int before = *mask;
  // v & (v-1) drops the least significant one bit of v.
  *mask = before & (before - 1);
  return firstOneBit(before);
}
// Index of the highest set bit of x, i.e. floor(log2(x)) for x > 0, obtained
// as (bit width - 1) minus a count of leading zeros.
// The clz primitive and the width w are chosen from sizeof(Int); types wider
// than (unsigned) long long are rejected at compile time.
template<typename Int>
inline __host__ __device__ int log2Down(Int x) {
int w, n;
#if __CUDA_ARCH__
// NOTE(review): no explicit x == 0 case on the device path; presumably
// __clz/__clzll return the full width for 0, which would also give -1 -
// confirm against the CUDA intrinsics documentation.
if (sizeof(Int) <= sizeof(int)) {
w = 8*sizeof(int);
n = __clz((int)x);
} else if (sizeof(Int) <= sizeof(long long)) {
w = 8*sizeof(long long);
n = __clzll((long long)x);
} else {
static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size.");
}
#else
// Zero is answered with -1 before any __builtin_clz* call is made.
if (x == 0) {
return -1;
} else if (sizeof(Int) <= sizeof(unsigned int)) {
w = 8*sizeof(unsigned int);
n = __builtin_clz((unsigned int)x);
} else if (sizeof(Int) <= sizeof(unsigned long)) {
w = 8*sizeof(unsigned long);
n = __builtin_clzl((unsigned long)x);
} else if (sizeof(Int) <= sizeof(unsigned long long)) {
w = 8*sizeof(unsigned long long);
n = __builtin_clzll((unsigned long long)x);
} else {
static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size.");
}
#endif
return (w-1)-n;
}
// Returns log2 of `x` rounded up: the number of bits needed to represent
// `x-1`. On the host both 0 and 1 map to 0.
template<typename Int>
inline __host__ __device__ int log2Up(Int x) {
  static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size.");
  // Work on the predecessor so that exact powers of two are not rounded up.
  if (x != 0) x -= 1;
  int width;         // bit width of the type handed to the count-leading-zeros intrinsic
  int leadingZeros;  // zero bits above the most significant one bit of the predecessor
#if __CUDA_ARCH__
  // No explicit zero check on this path: the result relies on what the CUDA
  // intrinsic returns for a zero argument.
  if (sizeof(Int) > sizeof(int)) {
    width = 8*sizeof(long long);
    leadingZeros = __clzll((long long)x);
  } else {
    width = 8*sizeof(int);
    leadingZeros = __clz((int)x);
  }
#else
  // The GCC/Clang clz builtins are undefined for zero, so answer that first.
  if (x == 0) return 0;
  if (sizeof(Int) <= sizeof(unsigned int)) {
    width = 8*sizeof(unsigned int);
    leadingZeros = __builtin_clz((unsigned int)x);
  } else if (sizeof(Int) <= sizeof(unsigned long)) {
    width = 8*sizeof(unsigned long);
    leadingZeros = __builtin_clzl((unsigned long)x);
  } else {
    width = 8*sizeof(unsigned long long);
    leadingZeros = __builtin_clzll((unsigned long long)x);
  }
#endif
  return width-leadingZeros;
}
// Returns 1 shifted left by log2Up(x), i.e. `x` rounded up to a power of two.
template<typename Int>
inline __host__ __device__ Int pow2Up(Int x) {
  int const shift = log2Up(x);
  return Int(1)<<shift;
}
// Returns 1 shifted left by log2Down(x), i.e. `x` rounded down to a power of
// two.
// NOTE(review): log2Down(0) is -1, so x == 0 would shift by a negative amount;
// callers presumably never pass zero -- confirm.
template<typename Int>
inline __host__ __device__ Int pow2Down(Int x) {
  int const shift = log2Down(x);
  return Int(1)<<shift;
}
// Host-only helper that reverses the order of the bits inside every aligned
// group of `nSubBits` bits of `x`, leaving the groups themselves in place.
// Instantiated with nSubBits == 8*sizeof(UInt) it reverses the whole word.
// The halving recursion presumably expects nSubBits to be a power of two.
template<typename UInt, int nSubBits>
inline __host__ UInt reverseSubBits(UInt x) {
if (nSubBits >= 16 && 8*sizeof(UInt) == nSubBits) {
// Whole-word case for 16/32/64-bit types: reverse the byte order with a
// single builtin, then only the bits within each byte remain to be reversed.
switch (8*sizeof(UInt)) {
case 16: x = __builtin_bswap16(x); break;
case 32: x = __builtin_bswap32(x); break;
case 64: x = __builtin_bswap64(x); break;
default: static_assert(8*sizeof(UInt) <= 64, "Unsupported integer type.");
}
return reverseSubBits<UInt, 8>(x);
} else if (nSubBits == 1) {
// A one-bit group is its own reversal: end of the recursion.
return x;
} else {
// m selects the low half of every nSubBits-wide group (e.g. 0x0f0f... for
// nSubBits == 8). Swap the two halves of each group, then recurse to reverse
// the bits inside each half.
UInt m = UInt(-1)/((UInt(1)<<(nSubBits/2))+1);
x = (x & m)<<(nSubBits/2) | (x & ~m)>>(nSubBits/2);
return reverseSubBits<UInt, nSubBits/2>(x);
}
}
// Type trait mapping an integer type to the unsigned type of the same width
// (`ncclToUnsigned<T>::type`). The primary template is only declared, so the
// trait is usable solely for the types specialized below.
template<typename T> struct ncclToUnsigned;
// char, signed char and unsigned char are three distinct types and each needs
// its own specialization.
template<> struct ncclToUnsigned<char> { using type = unsigned char; };
template<> struct ncclToUnsigned<signed char> { using type = unsigned char; };
template<> struct ncclToUnsigned<unsigned char> { using type = unsigned char; };
template<> struct ncclToUnsigned<signed short> { using type = unsigned short; };
template<> struct ncclToUnsigned<unsigned short> { using type = unsigned short; };
template<> struct ncclToUnsigned<signed int> { using type = unsigned int; };
template<> struct ncclToUnsigned<unsigned int> { using type = unsigned int; };
template<> struct ncclToUnsigned<signed long> { using type = unsigned long; };
template<> struct ncclToUnsigned<unsigned long> { using type = unsigned long; };
template<> struct ncclToUnsigned<signed long long> { using type = unsigned long long; };
template<> struct ncclToUnsigned<unsigned long long> { using type = unsigned long long; };
// Reverse the bottom nBits bits of x. The top bits will be overwritten with 0's.
// The value is first reversed across the full width of the type and then
// shifted right so that only the reversed low nBits remain.
template<typename Int>
inline __host__ __device__ Int reverseBits(Int x, int nBits) {
using UInt = typename ncclToUnsigned<Int>::type;
// Anonymous union used to view the same storage as signed (sx) and unsigned
// (ux), so that the shifts below operate on an unsigned value.
union { UInt ux; Int sx; };
sx = x;
#if __CUDA_ARCH__
// Device path: hardware bit-reverse intrinsics.
// NOTE(review): for a UInt narrower than unsigned int the __brev result is
// converted back to UInt on assignment -- confirm narrow types are never used
// on the device.
if (sizeof(Int) <= sizeof(unsigned int)) {
ux = __brev(ux);
} else if (sizeof(Int) <= sizeof(unsigned long long)) {
ux = __brevll(ux);
} else {
static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer type.");
}
#else
// Host path: software reversal over the full width of UInt.
ux = reverseSubBits<UInt, 8*sizeof(UInt)>(ux);
#endif
// Bring the reversed bits down to the bottom. nBits == 0 is handled separately
// because the shift amount would otherwise equal the full width of UInt.
ux = nBits==0 ? 0 : ux>>(8*sizeof(UInt)-nBits);
return sx;
}
////////////////////////////////////////////////////////////////////////////////
// Custom 8 bit floating point format for approximating 32 bit uints. This format
// has nearly the full range of uint32_t except it only keeps the top 3 bits
// beneath the leading 1 bit and thus has a max value of 0xf0000000.
inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
int log2x;
#if __CUDA_ARCH__
log2x = 31-__clz(x|1);
#else
log2x = 31-__builtin_clz(x|1);
#endif
uint32_t mantissa = x>>(log2x >= bitsPerPow2 ? log2x-bitsPerPow2 : 0) & ((1u<<bitsPerPow2)-1);
uint32_t exponent = log2x >= bitsPerPow2 ? log2x-(bitsPerPow2-1) : 0;
return exponent<<bitsPerPow2 | mantissa;
}
// Inverse of u32fpEncode: expands an encoded value back into a uint32_t.
// `bitsPerPow2` must be the value that was given to the encoder. Because the
// encoder drops low-order bits, the result is the encoded value rounded down
// to the nearest representable number.
inline __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {
  uint32_t exponent = x>>bitsPerPow2;
  uint32_t mantissa = x & ((1u<<bitsPerPow2)-1);
  if (exponent != 0) {
    // A non-zero exponent implies a leading one just above the stored
    // mantissa bits. Its position depends on bitsPerPow2 (it is 0x8 only for
    // the 3-bit format), so derive it instead of hard-coding it.
    mantissa |= 1u<<bitsPerPow2;
    // Exponents 0 and 1 both mean "no shift"; higher ones shift by exponent-1.
    exponent -= 1;
  }
  return mantissa<<exponent;
}
// Largest value the 8 bit format (3 bits per power of two) can represent.
constexpr uint32_t u32fp8MaxValue() { return 0xf0000000; }
// Encodes `x` into the 8 bit format: u32fpEncode with 3 bits per power of two,
// with the result narrowed to uint8_t by the return type.
inline __host__ __device__ uint8_t u32fp8Encode(uint32_t x) {
return u32fpEncode(x, 3);
}
// Decodes a value produced by u32fp8Encode: u32fpDecode with 3 bits per power
// of two.
inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
return u32fpDecode(x, 3);
}
#endif
+12 -29
Zobrazit soubor
@@ -7,42 +7,25 @@
#ifndef NCCL_CHANNEL_H_
#define NCCL_CHANNEL_H_
#include "comm.h"
#include "utils.h"
#include <algorithm>
ncclResult_t initChannel(struct ncclComm* comm, int channelid);
ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);
static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) {
int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
int peerNode = comm->rankToNode[peer];
int peerIndex = comm->rankToLocalRank[peer];
int nsteps = comm->maxLocalRanks;
int rankIndex = comm->rankToLocalRank[comm->rank];
int step, delta;
if (coll == ncclFuncSend) {
step = (nsteps + peerIndex - rankIndex)%nsteps;
delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes;
} else if (coll == ncclFuncRecv) {
step = (nsteps + rankIndex - peerIndex)%nsteps;
delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes;
inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound) {
if (comm->nNodes > 1) {
int nodeDelta = p2pRound/comm->maxLocalRanks;
int localDelta = p2pRound%comm->maxLocalRanks;
int base = nodeDelta*divUp(comm->maxLocalRanks, NCCL_MAX_DEV_WORK_P2P_PER_BATCH);
base += localDelta/NCCL_MAX_DEV_WORK_P2P_PER_BATCH;
return base & 0xff;
} else {
return ncclInternalError;
return p2pRound & 0xff;
}
*channelBase = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step;
return ncclSuccess;
}
// Maps a channel base and a per-peer channel increment to a p2p channel id.
// Writes (comm->p2pChannels[base % p2pnChannels] + channelInc) % p2pnChannels
// to *channelId and always returns ncclSuccess.
static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) {
// Alternative mapping kept for reference (not compiled):
//*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels;
*channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels;
return ncclSuccess;
}
// Computes the p2p channel id for the given peer, channel increment and
// operation `coll` by chaining ncclChannelComputeBase and
// ncclChannelComputeFromBase. The result is written to *channelId.
// NCCLCHECK is defined elsewhere; presumably it returns early when the wrapped
// call fails -- confirm.
static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) {
int base;
NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base));
NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId));
return ncclSuccess;
}
#endif
+4 -4
Zobrazit soubor
@@ -123,23 +123,23 @@
} while (0);
#define NCCLWAIT(call, cond, abortFlagPtr) do { \
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
uint32_t* tmpAbortFlag = (abortFlagPtr); \
ncclResult_t RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
return ncclInternalError; \
} \
if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \
if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECK(*tmpAbortFlag, 0); \
} while (!(cond));
#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
uint32_t* tmpAbortFlag = (abortFlagPtr); \
RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
goto label; \
} \
if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
} while (!(cond));
#define NCCLCHECKTHREAD(a, args) do { \
+8
Zobrazit soubor
@@ -8,6 +8,8 @@
#define NCCL_COLLECTIVES_H_
#include "nccl.h"
#include "nccl_common.h"
#include "device.h"
// CHUNKSIZE must be a multiple of SLICESIZE
#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
@@ -22,6 +24,12 @@
#define REDUCE_CHUNKSTEPS 1
#define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above
const char* ncclFuncToString(ncclFunc_t op);
const char* ncclDevRedOpToString(ncclDevRedOp_t op);
const char* ncclDatatypeToString(ncclDataType_t type);
const char* ncclAlgoToString(int algo);
const char* ncclProtoToString(int proto);
inline int ncclTypeSize(ncclDataType_t type) {
switch (type) {
case ncclInt8:
+222 -60
Zobrazit soubor
@@ -7,7 +7,7 @@
#ifndef NCCL_COMM_H_
#define NCCL_COMM_H_
#include "transport.h"
//#include "transport.h"
#include "p2p.h"
#include "collectives.h"
#include "nccl_tuner.h"
@@ -15,6 +15,7 @@
#include "strongstream.h"
#include "nccl_net.h"
#include "register.h"
#include "graph.h"
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
@@ -144,7 +145,7 @@ struct ncclChannel {
struct ncclNvls nvls;
int id; // index of this channel
uint32_t workFifoSent; // last used work index+1
uint32_t workFifoProduced; // +1 successor of last used work fifo byte
/* comm split sharable resources */
struct ncclChannelPeer* collnetPeers;
@@ -153,22 +154,15 @@ struct ncclChannel {
struct ncclDevChannelPeer* nvlsDevPeers;
};
struct ncclWorkList {
struct ncclWorkBatchList {
struct ncclWorkBatchList* next;
struct ncclDevWorkBatch batch;
};
struct alignas(16) ncclWorkList {
struct ncclWorkList* next;
struct ncclWork work;
};
struct ncclPointerList {
struct ncclPointerList* next;
void *ptr;
};
struct ncclNvlsMcHandleList {
struct ncclNvlsMcHandleList *next;
CUmemGenericAllocationHandle mcHandle;
CUdeviceptr ptr;
int dev;
size_t size;
enum ncclDevWorkType workType;
int size; // Size of struct following this node
// ncclDevWorkColl, ncclDevWorkColLReg, ncclDevWorkP2p[]...
};
struct ncclCollnetHandleList {
@@ -188,33 +182,190 @@ struct ncclKernelPlan {
struct ncclKernelPlan* next;
bool persistent; // aka captured in a graph
enum ncclDevWorkStorageType workStorageType;
bool kernelSpecialized;
void *kernelFn;
int channelUbound; // only channels c < channelUbound are present
int channelCount; // number of channels present
uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask)
struct ncclDevKernelArgs* kernelArgs;
size_t kernelArgsSize;
uint64_t channelMask; // bitset of which channels are present
bool hasProxyOps; // does any channel have a non-empty proxyOpQueue
int threadPerBlock;
// workHeap fields are null until uploadWorkFifo() or preparePersistentKernel()
struct ncclWork* workHead;
int collOpCount; // zero based for this plan
int collOpCount; // Number of collectives in this plan.
int nWorkBatches; // Number of work batches.
size_t workBytes; // Sum size of all work (in the fifo) in bytes.
struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> workQueue;
struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> cleanupQueue;
void* workBufPersistent;
struct ncclIntruQueue<struct ncclPointerList, &ncclPointerList::next> ipcMemQueue;
struct ncclIntruQueue<struct ncclNvlsMcHandleList, &ncclNvlsMcHandleList::next> nvlsMcHandleQueue;
struct ncclIntruQueue<struct ncclCollnetHandleList, &ncclCollnetHandleList::next> collnetHandleQueue;
struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
};
struct Channel {
int nWork;
union {
int nWorkElem; // used for coll and reg coll
int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1
};
size_t collBytes;
struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> workQueue;
struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
} channels[MAXCHANNELS];
size_t maxBytesPerChannel;
////////////////////////////////////////////////////////////////////////////////
// One collective operation recorded as a task. Nodes are singly linked through
// `next`. The first group of fields holds the operation's arguments; the fields
// after "Computed later" are filled in after the task has been created.
struct ncclTaskColl {
struct ncclTaskColl* next;
// Which collective this is, its buffers, element count and root rank.
ncclFunc_t func;
void const* sendbuff;
void* recvbuff;
size_t count;
int root;
ncclDataType_t datatype;
// Reduction operation, kept in two forms (opHost / opDev); presumably the
// user-facing op and its device-side representation -- confirm.
ncclRedOp_t opHost;
struct ncclDevRedOpFull opDev;
int chunkSteps, sliceSteps;
// Computed later:
size_t trafficBytes;
// Packed into bit-fields: four 8-bit fields, then two flags plus a 30-bit
// device function id.
int32_t nMaxChannels:8;
int32_t nWarps:8;
int32_t algorithm:8, protocol:8;
uint32_t isCollnet:1, isNvls:1;
uint32_t devFuncId:30;
enum ncclRegBufferType regBufType;
// number of elements in planner->ipcMemQueue associated with this collective
// NOTE(review): the comment above names ipcMemQueue while the field is called
// nCleanupQueueElts -- confirm which queue is meant.
int nCleanupQueueElts;
void* sendMhandle;
void* recvMhandle;
};
// One point-to-point transfer recorded as a task: a buffer and its size in
// bytes. Nodes are singly linked through `next`. The struct carries no
// direction or peer; presumably those are implied by the queue the task is
// placed in -- confirm.
struct ncclTaskP2p {
struct ncclTaskP2p* next;
void* buff;
size_t bytes;
};
////////////////////////////////////////////////////////////////////////////////
// Roughly sorts ncclTaskColl's by their size descending. This structure is
// self-referential, meaning that pointers it contains internally may point
// into the structure itself. This means that it is NOT memcpy-moveable:
// `bins[]` entries can hold the address of this struct's own `head` field.
// Tasks are kept in one linked list (head..tail) that is partitioned into
// size bins; an all-zero struct is the empty state.
struct ncclTaskCollSorter {
// Sizes are measured in units of UnitSize and clamped to MaxSize before
// being binned.
static constexpr int UnitLog2 = 10; // 1K
static constexpr size_t UnitSize = 1<<UnitLog2;
static constexpr int MaxLog2 = 30; // 1GB
static constexpr size_t MaxSize = 1ull<<MaxLog2;
// Number of bins between powers of 2. For 4 bins, the worst case out-of-order
// relative magnitude is (5/4)-1 = 25%
static constexpr int BitsPerPow2 = 2;
static constexpr int BinsPerPow2 = 1<<BitsPerPow2;
// With the constants above: 1 + (30-10)*4 = 81 bins.
static constexpr int BinCount = 1 + (MaxLog2-UnitLog2)*BinsPerPow2;
// First and last node of the whole list.
struct ncclTaskColl* head;
struct ncclTaskColl* tail;
// Least bin such that it and all above are empty.
int binEdge;
// Pointer to the pointer to this bin's head node which is either the
// previous node's `next` field or `head`.
// A null entry means the bin is empty.
struct ncclTaskColl** bins[BinCount];
};
// Inserts task `x` into sorter `me`, binned by `size`.
// The bin index is derived from `size` clamped to MaxSize and expressed in
// units of 1<<UnitLog2, encoded with u32fpEncode, then flipped so that larger
// sizes land in lower-numbered bins. Within a bin the newest task becomes the
// bin's head.
inline void ncclTaskCollSorterInsert(
struct ncclTaskCollSorter* me, struct ncclTaskColl* x, size_t size
) {
constexpr int UnitLog2 = ncclTaskCollSorter::UnitLog2;
constexpr size_t MaxSize = ncclTaskCollSorter::MaxSize;
constexpr int BitsPerPow2 = ncclTaskCollSorter::BitsPerPow2;
constexpr int BinCount = ncclTaskCollSorter::BinCount;
int bin = u32fpEncode(std::min(MaxSize, size)>>UnitLog2, BitsPerPow2);
bin = BinCount-1 - bin; // descending bin
if (me->bins[bin] == nullptr) {
// First task for this bin: work out where the bin attaches to the list.
if (me->binEdge <= bin) {
// Every non-empty bin precedes this one, so the bin starts right after the
// current list tail (or at `head` if the list is empty) and `x` becomes the
// new tail.
me->binEdge = bin+1;
me->bins[bin] = me->tail ? &me->tail->next : &me->head;
me->tail = x;
} else {
// Find successor non-empty bin after this one.
// (binEdge > bin guarantees that such a bin exists.)
int succ = bin+1;
while (me->bins[succ] == nullptr) succ++;
// What was our successor's head's previous is now our head's previous.
me->bins[bin] = me->bins[succ];
// The first node we insert is our tail, so that becomes our successor's
// head's new previous.
me->bins[succ] = &x->next;
}
}
// Push a new head for this bin.
x->next = *me->bins[bin];
*me->bins[bin] = x;
}
// True when the sorter holds no tasks, i.e. its list head is null.
inline bool ncclTaskCollSorterEmpty(struct ncclTaskCollSorter* me) {
  return !me->head;
}
// Detaches and returns the sorter's whole task list (ordered by bin), leaving
// the sorter zeroed. Returns nullptr, without touching the sorter, when it
// holds no tasks.
inline struct ncclTaskColl* ncclTaskCollSorterDequeueAll(struct ncclTaskCollSorter* me) {
  struct ncclTaskColl* const sorted = me->head;
  if (sorted == nullptr) return nullptr;
  // All-zero is the empty state: null head/tail, binEdge 0 and null bins.
  memset(me, 0, sizeof(*me));
  return sorted;
}
////////////////////////////////////////////////////////////////////////////////
// Node of a singly linked list of CUDA streams.
struct ncclCudaStreamList {
struct ncclCudaStreamList *next;
cudaStream_t stream;
};
// Planning state grouped into four sections: tasks being accumulated, task
// lists awaiting assembly into plans, the plan currently being built, and the
// built plans awaiting launch.
struct ncclKernelPlanner {
//////////////////////////////////////////////////////////////////////////////
// State for accumulating tasks between ncclGroupStart/End()
//////////////////////////////////////////////////////////////////////////////
// Per-peer point-to-point state: separate send and receive task queues plus
// a "seen" flag for each direction.
struct Peer {
bool sendSeen, recvSeen;
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> sendQueue;
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> recvQueue;
};
// Collective tasks, roughly ordered by size (see ncclTaskCollSorter).
struct ncclTaskCollSorter collSorter;
struct Peer* peers/*[nRanks]*/;
// Task counters, one for collectives and one for point-to-point.
int nTasksColl, nTasksP2p;
// NOTE(review): presumably set when the tasks are being captured in a graph,
// like ncclKernelPlan::persistent -- confirm.
bool persistent;
// The list of user streams aggregated over all tasks present.
struct ncclCudaStreamList* streams;
// The most recent user stream. Ignored if streams==nullptr
cudaStream_t streamRecent;
// The graph capturing all user streams or invalid if none. Thus we restrict the
// user that all streams must be captured in the same graph or not captured
// at all. Technically we could probably relax this, but that would mean
// collecting a different `ncclTasks` per graph and one for non-graph.
struct ncclCudaGraph capturingGraph;
//////////////////////////////////////////////////////////////////////////////
// Lists of tasks to be assembled into plans.
//////////////////////////////////////////////////////////////////////////////
struct ncclIntruQueue<struct ncclTaskColl, &ncclTaskColl::next> collTaskQueue;
struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> collWorkQueue;
struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> collCleanupQueue;
//////////////////////////////////////////////////////////////////////////////
// State for building current (Work-In-Progress) plan:
//////////////////////////////////////////////////////////////////////////////
struct WipPlan {
// One entry per channel, MAXCHANNELS in total.
struct Channel {
struct {
int workBytes; // Sum size of work metadata referenced by this batch.
int nP2ps; // Number of p2p works in this batch
int p2pRounds[NCCL_MAX_DEV_WORK_P2P_PER_BATCH]; // which rounds are present in this batch.
} wipBatch; // work-in-progress batch which will be next tail of workBatchQueue
int nWorkBatchesP2p; // number of p2p batches for this channel.
struct ncclIntruQueue<struct ncclWorkBatchList, &ncclWorkBatchList::next> workBatchQueue;
struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
} channels[MAXCHANNELS];
} wipPlan;
//////////////////////////////////////////////////////////////////////////////
// State for launching built plans:
//////////////////////////////////////////////////////////////////////////////
// List of kernel plans built from tasks.
struct ncclIntruQueue<struct ncclKernelPlan, &ncclKernelPlan::next> planQueue;
// First of the unlaunched kernels in `planQueue`
struct ncclKernelPlan* unlaunchedPlansHead;
};
#define NCCL_MAGIC 0x0280028002800280 // Nickel atomic number is 28.
@@ -233,12 +384,18 @@ struct ncclComm {
struct ncclPeerInfo* peerInfo;
struct ncclTopoSystem* topo;
int netPluginLoaded;
ncclNet_t* ncclNet;
ncclNetDeviceType netDeviceType;
ncclCollNet_t* ncclCollNet;
void* bootstrap;
// Bitmasks for ncclTransportP2pSetup
uint64_t* connectSend;
uint64_t* connectRecv;
struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS];
bool initAlgoChannels[NCCL_NUM_ALGORITHMS];
bool runtimeConn; // if dynamic connection is supported
int cuMemSupport;
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
@@ -253,6 +410,9 @@ struct ncclComm {
cpu_set_t cpuAffinity; // CPU affinity of the GPU
int cudaArch; // matches __CUDA_ARCH__ of device
int cpuArch; // architecture - As defined in src/include/graph.h, e.g. x86/arm/ppc/mixed
int cpuVendor; // vendor - As defined in src/include/graph.h
int node;
int nNodes;
int localRank;
@@ -278,10 +438,11 @@ struct ncclComm {
int nChannels; // connection nChannels
int collChannels; // enqueue nChannels
int nvlsChannels; // enqueue nChannels
// all nvls heads stored to check if we can splitShare
int nvlsHeads[MAXCHANNELS];
// Channels (per peer) for p2p
int p2pnChannels;
int p2pnChannelsPerPeer;
int p2pChannels[MAXCHANNELS];
// Should this comm allocate LL buffers for network P2P connections?
bool allocP2pNetLLBuffers;
@@ -303,23 +464,28 @@ struct ncclComm {
ncclResult_t asyncResult;
// Flag to ask NCCL kernels to abort
volatile uint32_t *abortFlag;
volatile uint32_t *childAbortFlag;
uint32_t *abortFlagRefCount;
uint32_t* abortFlag;
uint32_t* abortFlagDev;
int* abortFlagRefCount;
uint32_t* childAbortFlag;
uint32_t* childAbortFlagDev;
uint32_t destroyFlag;
// Device side of the communicator (for cudaFree's)
struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
// Operation pool.
int workFifoDepth; // size of workFifoHeap[], power of 2
struct ncclWork* workFifoHeap;
struct ncclWork* devWorkFifoHeap;
void* workFifoHeapGdrHandle;
uint32_t workArgsBytes; // max size of kernel args
uint32_t workFifoBytes; // size of workFifoBuf, power of 2
void* workFifoBuf;
void* workFifoBufDev;
void* workFifoBufGdrHandle;
// Work completion notificaion
uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory
uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot.
uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels.
// Monotonic number of bytes (mod 1<<32) consumed per channel. In cudaHost memory.
uint32_t* workFifoConsumed/*[MAXCHANNELS]*/;
// Last observed value of: min(workFifoConsumed[c] for c < MAXCHANNELS)
uint32_t workFifoConsumedLeast;
// Monotonic number of bytes (mod 1<<32) sent to fifo.
uint32_t workFifoProduced;
// Intra-process sync
struct ncclComm* intraComm0; // leader of intra-process comms (self possible)
@@ -337,7 +503,7 @@ struct ncclComm {
// Whether this communicator uses collNet
int collNetSupport;
bool collNetRegSupport;
uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes];
uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes];
int intraHighestTransportType;
int* collNetHeads;
int collNetHeadsNum;
@@ -355,16 +521,16 @@ struct ncclComm {
// pools backed by comm->memPermanent
struct ncclMemoryPool memPool_ncclProxyOp;
struct ncclMemoryPool memPool_ncclKernelPlan;
struct ncclMemoryPool memPool_ncclPointerList;
struct ncclMemoryPool memPool_ncclNvlsHandleList;
struct ncclMemoryPool memPool_ncclCollnetHandleList;
// Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
// this comm is not yet in a group.
struct ncclComm* groupNext;
// Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
struct ncclComm* preconnectNext;
int persistentRefs; // number of persistent plan-lists capturing this comm
struct ncclTasks tasks;
struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule;
struct ncclKernelPlanner planner;
// user-created reduction ops
int userRedOpCapacity, userRedOpFreeHead;
@@ -373,11 +539,6 @@ struct ncclComm {
// Queue of things for the main thread to do
struct ncclIntruQueueMpsc<struct ncclCommCallback, &ncclCommCallback::next> callbackQueue;
// List of kernel plans built form tasks.
struct ncclIntruQueue<struct ncclKernelPlan, &ncclKernelPlan::next> planQueue;
// First of the unlaunched kernels in `planQueue`
struct ncclKernelPlan* unlaunchedPlansHead;
ncclConfig_t config;
// initState is to more conveniently reclaim resources when errors happen.
ncclResult_t initState;
@@ -389,6 +550,7 @@ struct ncclComm {
struct ncclGroupJob *groupJob;
// Tuning plugin
int tunerPluginLoaded;
ncclTuner_t* tuner;
void *tunerContext;
// buffer registration cache
+4
Zobrazit soubor
@@ -80,6 +80,10 @@ DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent);
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice);
DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute);
DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel);
#if CUDART_VERSION >= 11080
DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx);
#endif
// cuMem API support
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve);
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree);
+1 -8
Zobrazit soubor
@@ -10,21 +10,14 @@
#include "nccl.h"
#include "nccl_common.h"
#include <stdio.h>
#include <chrono>
#include <type_traits>
#include <limits.h>
#include <string.h>
#include <pthread.h>
// Conform to pthread and NVTX standard
#define NCCL_THREAD_NAMELEN 16
extern int ncclDebugLevel;
extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugLock;
extern FILE *ncclDebugFile;
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
@@ -32,13 +25,13 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
extern thread_local int ncclDebugNoWarn;
extern char ncclLastError[];
#define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
#define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__)
#ifdef ENABLE_TRACE
#define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
extern std::chrono::steady_clock::time_point ncclEpoch;
#else
#define TRACE(...)
#endif
+182 -95
Zobrazit soubor
@@ -9,8 +9,10 @@
#include "nccl.h"
#include "nccl_common.h"
#include "align.h"
#include "bitops.h"
#include <algorithm>
#include <stdint.h>
#include <sys/types.h>
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
@@ -21,6 +23,12 @@ extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];
#define NCCL_MAX_OPS 2048
#define NCCL_STEPS 8
#ifdef __CUDA_ARCH__
#define NCCL_CUDA_ARCH __CUDA_ARCH__
#else
#define NCCL_CUDA_ARCH 0
#endif
#include "net_device.h"
enum ncclDevRedOp_t {
@@ -52,8 +60,11 @@ union ncclLLFifoLine {
#define WARP_SIZE 32
#define MAXCHANNELS 32
#define NCCL_MAX_LOCAL_RANKS 64
#define NCCL_MAX_NTHREADS 640
#define NCCL_MIN_NTHREADS (4*WARP_SIZE)
#define NCCL_SIMPLE_MAX_NTHREADS 512
#define NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE (3*WARP_SIZE)
#define NCCL_LL_MAX_NTHREADS 512
#define NCCL_LL_LINES_PER_THREAD 8
#ifdef TEST_LL_CLEANUP
@@ -84,6 +95,9 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
#define NCCL_IPC_READ 0x10
#define NCCL_NVLS_MIN_POLL 0x20
// Number of named barriers supported by CUDA
#define NCCL_MAX_GROUPS 16
#define NCCL_MAX_COLLNET_SIZE (1L << 29)
enum ncclRegBufferType {
@@ -196,112 +210,155 @@ struct ncclChannelPeer {
struct ncclDevComm;
/* ncclWork is to be a power of two, currently 8x64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclWorkElem. */
#define NCCL_WORK_SIZE 512
struct alignas(16) ncclDevWorkP2p {
void *sendAddr, *recvAddr;
size_t sendBytes, recvBytes;
int sendRank, recvRank;
// From the part index, nP2pChannels, and channelBase the device code can
// calculate which part of the transfer a channel is responsible for.
uint8_t nP2pChannels; // Always equal to comm->p2pnChannels
uint8_t channelBase; // Channel owning first part.
// Zero channels indicates no work in that direction.
uint8_t nSendChannels, nRecvChannels;
// Chunk size stored in 8 bits via u32fp8Encode/Decode.
uint8_t sendChunkSize_u32fp8, recvChunkSize_u32fp8;
enum ncclWorkType : uint8_t {
ncclWorkTypeUnused=0,
ncclWorkTypeColl=1,
ncclWorkTypeP2p=2,
ncclWorkTypeRegColl=3
};
enum ncclWorkP2PType : uint8_t {
ncclWorkP2pTypeUnused=0,
ncclWorkP2pTypeSend,
ncclWorkP2pTypeRecv
uint8_t sendProtoLL:1, recvProtoLL:1;
uint8_t sendRegistered:1, recvRegistered:1;
};
struct ncclWorkHeader {
union {
int32_t workNext; // when isLast=0: Offset from kernel argument workHead
uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back.
};
uint16_t funcIndex;
uint8_t isLast:1; // last work for this kernel
uint8_t inFifo:1; // is this work in the fifo
enum ncclWorkType type;
};
// Computes the byte range [*partBeg, *partEnd) of a `bytes`-long transfer that
// belongs to part number `part` out of `nParts`. Every part has the same
// stride, divUp(bytes, nParts) passed through alignUp(..., 4<<10); both bounds
// are clamped to `bytes`, so trailing parts may be short or empty.
inline __host__ __device__ void ncclP2pPartBounds(int nParts, int part, size_t bytes, size_t* partBeg, size_t* partEnd) {
  size_t const stride = alignUp(divUp(bytes, nParts), 4<<10);
  size_t const rawBeg = (part+0)*stride;
  size_t const rawEnd = (part+1)*stride;
  // Device code uses the CUDA min() overload, host code uses std::min.
#if __CUDA_ARCH__
  *partBeg = min(rawBeg, bytes);
  *partEnd = min(rawEnd, bytes);
#else
  *partBeg = std::min<size_t>(rawBeg, bytes);
  *partEnd = std::min<size_t>(rawEnd, bytes);
#endif
}
struct ncclWorkElem {
union {
uint8_t flagBits;
struct {
uint8_t isUsed:1, redOpArgIsPtr:1, oneNode:1;
};
};
uint8_t regUsed;
uint8_t nWarps;
uint8_t direct;
// implemented in channel.h
inline __host__ uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound);
// ncclP2pChannelForPart and ncclP2pChannelToPart are inverses of each other.
// This host-side function returns the channel that carries part `part` of a
// transfer whose first part is owned by channel `base`; the device code uses
// ncclP2pChannelToPart to find the part "this" channel is responsible for.
inline __host__ int ncclP2pChannelForPart(int nP2pChannels, int base, int part) {
  // Only valid for a power-of-two nP2pChannels: nP2pChannels-1 is then an
  // all-ones mask whose popcount is log2(nP2pChannels).
  int const channelMask = nP2pChannels-1;
  int const log2Channels = countOneBits(channelMask);
  // The channel offset from `base` is the bit-reversed part index.
  int const offset = reverseBits(part, log2Channels);
  return (base + offset) & channelMask;
}
// Device-side inverse of ncclP2pChannelForPart: returns the part index that
// channel `channel` carries when the first part is owned by channel `base`.
inline __device__ int ncclP2pChannelToPart(int nP2pChannels, int base, int channel) {
  // Only valid for a power-of-two nP2pChannels: nP2pChannels-1 is then an
  // all-ones mask whose popcount is log2(nP2pChannels).
  int const channelMask = nP2pChannels-1;
  int const log2Channels = countOneBits(channelMask);
  // Offset of this channel from `base`, wrapped into [0, nP2pChannels).
  int const offset = (channel-base) & channelMask;
  // The part index is the bit-reversed channel offset.
  return reverseBits(offset, log2Channels);
}
struct alignas(16) ncclDevWorkColl {
// Running on channels [channelLo..channelHi], hi is inclusive.
// nChannels == (channelHi - channelLo) + 1
uint32_t channelLo:8, channelHi:8;
uint32_t nWarps:8;
uint32_t redOpArgIsPtr:1, regUsed:2, oneNode:1, direct:4;
uint32_t root;
const void *sendbuff;
void *recvbuff;
size_t count;
uint64_t redOpArg;
uint64_t chunkCount:25, workCount:39;
void* recvbuff;
void* sendbuff;
union {
// Continuous-byte-distribution scheduling. The lo and hi channels are of
// different size than the channels in the middle.
struct {
uint64_t lastChunkCount:25;
uint64_t workOffset:39;
};
size_t countLo, countMid, countHi;
// Chunk counts where units are ncclProtoGrainSize(protocol) bytes
uint64_t chunkGrainsLo:21, chunkGrainsMid:21, chunkGrainsHi:21;
} cbd;
// Collnet scheduling. All channels divide work evenly.
struct {
uint64_t bid:32;
uint64_t nChannels:32;
};
size_t count; // Total size, not divided per channel.
uint32_t chunkCount;
} collnet;
};
uint64_t redOpArg;
};
#define NCCL_MAX_WORK_ELEMENTS ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem))
static_assert(NCCL_MAX_WORK_ELEMENTS == 9, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 9");
struct ncclWorkElemP2p {
int peer : 30;
int proto : 2;
// Returns the grain size for protocol `proto`: 16 for LL, a value derived from
// the LL128 layout constants for LL128, 512 for SIMPLE, and -1 for any other
// value. Written as a single return expression, which keeps it usable as a
// constexpr function under C++11 rules.
__host__ __device__ constexpr int ncclProtoGrainSize(int proto) {
return proto == NCCL_PROTO_LL ? 16 :
proto == NCCL_PROTO_LL128 ? WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD/NCCL_LL128_LINEELEMS*NCCL_LL128_DATAELEMS*sizeof(uint64_t) :
proto == NCCL_PROTO_SIMPLE ? 512 :
-1;
}
enum ncclWorkP2PType p2pType;
uint8_t reg:1;
uint8_t nWarps:5;
uint8_t warpStart;
uint8_t ngroups;
// Important not to use any fields with greater than 4-byte alignment since
// we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if
// there were 8-byte fields.
//void* buff;
uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32;
//size_t count;
uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32;
int chunkSize;
};
// Looks up the share of a continuous-byte-distribution (cbd) collective that
// belongs to channel `channelId`.
//   count      - optional (may be nullptr); receives the total element count
//                summed over all channels.
//   partOffset - receives the element offset at which this channel's part begins.
//   partCount  - receives the number of elements in this channel's part.
//   chunkCount - receives this channel's chunk size in elements.
// The lowest and highest channels have their own sizes; every channel in
// between uses the shared "mid" values.
template<typename Int>
__host__ __device__ inline void ncclCollCbdPart(
    struct ncclDevWorkColl* work, uint32_t channelId, int proto, int eltSize,
    Int* count, Int* partOffset, Int* partCount, Int* chunkCount
  ) {
  // Chunk sizes are stored in grains; this is the number of elements per grain.
  int const grainElts = ncclProtoGrainSize(proto)/eltSize;
  // Number of channels strictly between channelLo and channelHi. We can assume
  // that nMid<0 implies countMid==0, which lets us assume countMid*nMid == 0.
  int const nMid = work->channelHi - work->channelLo - 1;

  if (count != nullptr) {
    *count = work->cbd.countLo + work->cbd.countMid*nMid + work->cbd.countHi;
  }

  if (channelId == work->channelLo) {
    // First channel: starts at the beginning of the data.
    *partOffset = 0;
    *partCount = work->cbd.countLo;
    *chunkCount = work->cbd.chunkGrainsLo*grainElts;
    return;
  }
  if (channelId == work->channelHi) {
    // Last channel: follows the low part and all middle parts.
    *partOffset = work->cbd.countLo + nMid*work->cbd.countMid;
    *partCount = work->cbd.countHi;
    *chunkCount = work->cbd.chunkGrainsHi*grainElts;
    return;
  }
  // Middle channel: follows the low part and the middle parts before it.
  int const midIndex = channelId - work->channelLo - 1;
  *partOffset = work->cbd.countLo + midIndex*work->cbd.countMid;
  *partCount = work->cbd.countMid;
  *chunkCount = work->cbd.chunkGrainsMid*grainElts;
}
static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) >= 16, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 16");
#define NCCL_MAX_WORK_ELEMENTS_P2P 16
struct ncclWorkElemReg {
struct ncclWorkElem elem;
// A ncclDevWorkColl extended with three pointer tables, each holding
// NCCL_MAX_DIRECT_ARITY+1 entries. The struct is 16-byte aligned.
// NOTE(review): "Reg" presumably refers to registered user buffers -- confirm.
struct alignas(16) ncclDevWorkCollReg {
// The plain collective work description.
struct ncclDevWorkColl coll;
// Per-peer pointer tables; presumably "dn"/"up" name the direction of the
// peer and Inputs/Outputs the buffer kind -- TODO confirm.
void* dnInputs[NCCL_MAX_DIRECT_ARITY+1];
void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1];
void* upOutputs[NCCL_MAX_DIRECT_ARITY+1];
};
#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg))
static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 2, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 2");
// Number of named barriers supported by CUDA
#define NCCL_MAX_GROUPS 16
struct ncclWork {
struct ncclWorkHeader header;
union {
char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)];
struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P];
struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG];
};
// Identifies which device work struct layout is in use. ncclDevWorkSize maps
// each value to the size of its struct. Enumerator values are implicit
// (0, 1, 2), so the order must not change.
// NOTE(review): presumably this is what the 2-bit workType field of
// ncclDevWorkBatch stores -- confirm.
enum ncclDevWorkType: uint8_t {
ncclDevWorkTypeP2p,
ncclDevWorkTypeColl,
ncclDevWorkTypeCollReg
};
// Size in bytes of the device work struct that corresponds to `type`.
// Any value other than P2p or Coll yields the size of ncclDevWorkCollReg.
constexpr size_t ncclDevWorkSize(enum ncclDevWorkType type) {
  return type == ncclDevWorkTypeColl ? sizeof(ncclDevWorkColl)
       : type == ncclDevWorkTypeP2p  ? sizeof(ncclDevWorkP2p)
       :                               sizeof(ncclDevWorkCollReg);
}
#define NCCL_MAX_DEV_WORK_BATCH_BYTES 1024
#define NCCL_MAX_DEV_WORK_BATCH_COLLS (NCCL_MAX_DEV_WORK_BATCH_BYTES/sizeof(ncclDevWorkColl))
#define NCCL_MAX_DEV_WORK_P2P_PER_BATCH 8
// Describes one batch of work structs for a channel. Batches belonging to the
// same channel are chained through `nextJump`. The struct is 16-byte aligned.
struct alignas(16) ncclDevWorkBatch {
union {
struct {
// nextExtends: should next one be merged into this one.
// nextJump=0: end of this channel's batch list
// nextJump>0: batches[thisIndex+nextJump] is next batch in this list
uint32_t nextJump:14, nextExtends:1;
// workType: kind of work struct in this batch; funcId: device function id.
uint32_t workType:2, funcId:15;
};
// Unioning bitfields with underlying type hints compiler to emit the best
// SASS LD/ST accesses.
uint32_t flags;
};
// Rolling offset in fifo where this batch's work structs begin
uint32_t offsetBase;
// Set of relative offsets from offsetBase for this channel's subset of the batch:
// For each bit index i in offsetBitset, find work at fifo offset: offsetBase + i*sizeof(WorkStructType)
uint64_t offsetBitset;
};
static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE");
static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0");
struct ncclDevChannelPeer {
// Stripped version of ncclChannelPeer where we only keep the ncclConnInfo
@@ -328,9 +385,8 @@ struct ncclDevComm {
int buffSizes[NCCL_NUM_PROTOCOLS];
int p2pChunkSize;
// Operation list for aggregation
int workFifoDepth;
struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory
// Work fifo return credits
uint32_t* workConsumed/*[MAXCHANNELS]*/;
int* collNetDenseToUserRank;
@@ -346,11 +402,37 @@ struct alignas(16) ncclDevCommAndChannels {
struct ncclDevChannel channels[MAXCHANNELS];
};
#ifdef __CUDA_ARCH__
#define NCCL_CUDA_ARCH __CUDA_ARCH__
#else
#define NCCL_CUDA_ARCH 0
#endif
// Identifies where the work structs of a kernel launch are stored.
// NOTE(review): meanings inferred from the names and the commit notes --
// Args: inside the kernel argument buffer; Fifo: in the work fifo;
// Persistent: presumably a buffer that outlives the launch. Confirm.
enum ncclDevWorkStorageType: uint8_t {
ncclDevWorkStorageTypeArgs=0,
ncclDevWorkStorageTypeFifo=1,
ncclDevWorkStorageTypePersistent=2
};
// Fixed-size header of the kernel argument buffer. An array of
// ncclDevWorkBatch follows it in memory (see the commented-out trailing
// member); ncclDevMaxChannelsForArgsBytes sizes that array from the bytes
// left after this header. The struct is 16-byte aligned.
struct alignas(16) ncclDevKernelArgs {
// Device communicator the kernel operates on.
struct ncclDevComm* comm;
// NOTE(review): presumably one bit per channel taking part in the launch -- confirm.
uint64_t channelMask;
// Where the work structs referenced by the batches are stored.
enum ncclDevWorkStorageType workStorageType;
// NOTE(review): presumably a wrap-around mask applied to work offsets -- confirm.
uint32_t workMask;
// Base pointer of the work storage.
void* workBuf;
// A channel's first batch is at `blockIdx.x`. Use `nextJump` to follow rest of list.
// struct ncclDevWorkBatch batches[];
};
// Maximum number of bytes usable for kernel arguments. Currently always 4 KiB
// (4<<10); `cudaArch` is accepted but not consulted. The commented-out code
// sketches a larger limit keyed on architecture and driver version.
__host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) {
//return (cudaArch < 700 || cudaDriver < 12010) ? 4<<10 : (32<<10)-4;
return 4<<10;
}
// Backing storage for a kernel argument buffer of `capacity` bytes. The union
// overlays the ncclDevKernelArgs header on an array of ulong2 that spans
// capacity/sizeof(ulong2) elements, so `args` sits at the start of the buffer.
// `capacity` should be a multiple of sizeof(ulong2), otherwise the integer
// division drops the remainder.
template<size_t capacity>
struct alignas(16) ncclDevKernelArgsStorage {
union {
struct ncclDevKernelArgs args;
ulong2 storage[capacity/sizeof(ulong2)];
};
};
typedef ncclDevKernelArgsStorage<(4<<10)> ncclDevKernelArgs4K;
//typedef ncclDevKernelArgsStorage<(32<<10)-4> ncclDevKernelArgs31K;
template<typename T>
__host__ __device__ constexpr T min_constexpr(T a) { return a; }
@@ -366,6 +448,10 @@ __host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) {
return max_constexpr<T>((a > b ? a : b), c...);
}
// Largest number of channels whose leading ncclDevWorkBatch entries fit in a
// kernel argument buffer of `argsBytes` bytes, after the fixed
// ncclDevKernelArgs header, capped at MAXCHANNELS.
//
// Returns 0 when `argsBytes` cannot even hold the header. Without that guard
// the unsigned subtraction wraps around to a huge value and the function
// would report MAXCHANNELS for a buffer that has room for no batch at all.
//
// Kept as a single return expression so it remains usable where constexpr
// functions are restricted to one statement.
constexpr int ncclDevMaxChannelsForArgsBytes(size_t argsBytes) {
  return argsBytes < sizeof(struct ncclDevKernelArgs)
    ? 0
    : min_constexpr<size_t>(MAXCHANNELS, (argsBytes - sizeof(struct ncclDevKernelArgs))/sizeof(struct ncclDevWorkBatch));
}
// Calculate the unroll factor given:
// * bytePerPack: number of bytes accessed per instruction
// * insns: max permissible unroll value
@@ -412,6 +498,7 @@ extern int const ncclDevKernelCount;
extern void* const ncclDevKernelList[/*ncclDevKernelCount*/];
// Table of most specialized kernel function to run given func index.
extern int const ncclDevFuncIdCount;
extern int const ncclDevFuncRowToId[];
extern void* const ncclDevKernelForFunc[/*funcIndex*/];
extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/];
+1
Zobrazit soubor
@@ -24,5 +24,6 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru
ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan);
ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
ncclResult_t ncclLaunchFinish(struct ncclComm* comm);
ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo);
#endif // End include guard
+5 -4
Zobrazit soubor
@@ -8,6 +8,7 @@
#define NCCL_GDRWRAP_H_
#include "nccl.h"
#include "alloc.h"
#include <stdint.h> // for standard [u]intX_t types
#include <stdio.h>
#include <stdlib.h>
@@ -194,7 +195,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
char *devMem;
void *gdrMap;
mapSize = sizeof(T)*nelem;
mapSize = ncclSizeOfT<T>()*nelem;
// GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE
ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
@@ -203,7 +204,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;
size_t align = alignedAddr - (uint64_t)devMem;
//TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize);
//TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zu size %zu", alignedAddr, devMem, align, mapSize);
NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh));
NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize));
@@ -226,7 +227,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
*ptr = (T *)((char *)gdrMap+off);
if (devPtr) *devPtr = (T *)(devMem+off+align);
TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p",
TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zu at %p",
md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);
return ncclSuccess;
@@ -235,7 +236,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
template <typename T>
static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T)));
NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*ncclSizeOfT<T>()));
return ncclSuccess;
}
+5 -2
Zobrazit soubor
@@ -29,6 +29,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
int ncclTopoPathAllNVLink(struct ncclTopoSystem* system);
ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm);
// Query topology
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank);
@@ -46,9 +47,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
#define NCCL_TOPO_CPU_ARCH_X86 1
#define NCCL_TOPO_CPU_ARCH_POWER 2
#define NCCL_TOPO_CPU_ARCH_ARM 3
#define NCCL_TOPO_CPU_ARCH_MIXED 4
#define NCCL_TOPO_CPU_VENDOR_INTEL 1
#define NCCL_TOPO_CPU_VENDOR_AMD 2
#define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3
#define NCCL_TOPO_CPU_VENDOR_MIXED 4
#define NCCL_TOPO_CPU_TYPE_BDW 1
#define NCCL_TOPO_CPU_TYPE_SKL 2
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
@@ -70,6 +73,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU
#define NCCL_TOPO_PATTERN_RING 4 // Ring
#define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree
#define NCCL_TOPO_PATTERN_COLLNET_DIRECT 6 // Collnet Direct
struct ncclTopoGraph {
// Input / output
int id; // ring : 0, tree : 1, collnet : 2
@@ -113,7 +117,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent);
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
#include "info.h"
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup = NULL);
ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup=nullptr);
#endif
+11 -4
Zobrazit soubor
@@ -35,9 +35,12 @@ struct ncclAsyncJob {
void(*undo)(struct ncclAsyncJob*);
void(*destructor)(void*);
ncclGroupJobState_t state;
volatile uint32_t *abortFlag; /* point to comm abortFlag */
volatile uint32_t *childAbortFlag; /* point to child abortFlag */
uint32_t* abortFlag; /* point to comm abortFlag */
uint32_t* abortFlagDev; /* point to comm abortFlagDev */
uint32_t* childAbortFlag; /* point to child abortFlag */
uint32_t* childAbortFlagDev; /* point to child abortFlagDev */
ncclComm_t comm;
int destroyFlag;
};
ncclResult_t ncclAsyncLaunch(
@@ -52,14 +55,14 @@ struct ncclGroupJob {
struct ncclComm **groupCommHeadPtr;
struct ncclComm **groupCommPreconnectHeadPtr;
ncclResult_t *groupErrorPtr;
volatile bool *abortFlagPtr;
bool *abortFlagPtr;
int *groupBlockingPtr;
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsPtr;
bool initialized;
};
ncclResult_t ncclGroupStartInternal();
ncclResult_t ncclGroupEndInternal();
ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo = NULL);
ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job);
////////////////////////////////////////////////////////////////////////////////
@@ -114,6 +117,10 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) {
// Comms gets a new memory stack scope upon joining. Each task batched for
// this comm is allocated there.
ncclMemoryStackPush(&comm->memScoped);
// Initialize planner
ncclKernelPlanner::Peer* tmp = comm->planner.peers;
memset(&comm->planner, 0, sizeof(comm->planner));
comm->planner.peers = tmp;
}
ncclGroupBlocking = comm->config.blocking;
-123
Zobrazit soubor
@@ -8,28 +8,9 @@
#define NCCL_INFO_H_
#include "nccl.h"
#include "device.h"
#include "collectives.h"
#include "core.h"
#include "utils.h"
#include "strongstream.h"
#define NCCL_MAX_LOCAL_RANKS 64
// Communication pattern identifiers, stored in ncclInfo::pattern.
// Enumerator values are implicit and start at 0, so the order is significant.
typedef enum : uint8_t {
ncclPatternRing,
ncclPatternRingTwice,
ncclPatternPipelineFrom,
ncclPatternPipelineTo,
ncclPatternTreeUp,
ncclPatternTreeDown,
ncclPatternTreeUpDown,
ncclPatternCollnetChain,
ncclPatternCollnetDirect,
ncclPatternNvls,
ncclPatternNvlsTree,
ncclPatternSend,
ncclPatternRecv
} ncclPattern_t;
// Used to pass NCCL call information between functions
struct ncclInfo {
@@ -47,110 +28,6 @@ struct ncclInfo {
// Algorithm details
int chunkSteps;
int sliceSteps;
// Computed later
ncclDevRedOpFull opFull;
ncclPattern_t pattern;
size_t nBytes;
size_t aggnBytes;
size_t workBytes;
size_t sendbuffSize;
size_t recvbuffSize;
int stepSize;
int chunkCount;
int chunkSize;
int channelId;
int workFuncIndex;
ncclRegBufferType regBufType;
void* regBufSend[NCCL_MAX_LOCAL_RANKS];
void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
// collnet buffer reg handles
void* sendMhandle;
void* recvMhandle;
// Need to initialize
int nThreads;
int nChannels;
int algorithm;
int protocol;
bool userTuned;
struct ncclInfo *next;
};
// Fills in the size fields of `info` that follow from the caller's count,
// datatype and collective type:
//   workBytes           - count * size of the datatype.
//   nBytes              - workBytes, multiplied by nRanks for AllGather and
//                         ReduceScatter.
//   send/recvbuffSize   - buffer sizes used for NVLS buffer registration.
// For AllGather and Broadcast the operation is re-expressed in bytes: count
// becomes workBytes and datatype becomes ncclInt8.
// Always returns ncclSuccess.
inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) {
  bool const isAllGather = (info->coll == ncclFuncAllGather);
  bool const isReduceScatter = (info->coll == ncclFuncReduceScatter);

  // Bytes described by the caller's (count, datatype) pair.
  info->workBytes = info->count * ncclTypeSize(info->datatype);
  info->nBytes = info->workBytes;

  if (isAllGather || info->coll == ncclFuncBroadcast) {
    info->datatype = ncclInt8;
    info->count = info->workBytes;
  }

  // count is per rank
  if (isAllGather || isReduceScatter) {
    info->nBytes *= nRanks;
  }

  /* compute buffer size for NVLS buffer registration */
  if (isAllGather) {
    info->sendbuffSize = info->workBytes;
    info->recvbuffSize = info->sendbuffSize * nRanks;
  } else if (isReduceScatter) {
    info->recvbuffSize = info->workBytes;
    info->sendbuffSize = info->recvbuffSize * nRanks;
  } else {
    info->recvbuffSize = info->workBytes;
    info->sendbuffSize = info->recvbuffSize;
  }
  return ncclSuccess;
}
// One queued collective operation, linked into a singly linked list via `next`.
struct ncclTaskColl {
// Next task in the list.
struct ncclTaskColl* next;
// Which collective to run.
ncclFunc_t func;
// Source and destination buffers.
void const* sendbuff;
void* recvbuff;
// Element count and, where applicable, the root rank.
size_t count;
int root;
// Element type and reduction operator.
ncclDataType_t datatype;
ncclDevRedOpFull op;
int chunkSteps, sliceSteps;
// Call information carried along with the task.
struct ncclInfo info;
};
// One queued point-to-point transfer, linked into a singly linked list via `next`.
struct ncclTaskP2p {
// Next task in the list.
ncclTaskP2p *next;
// Buffer to transfer and its size in bytes.
void *buff;
size_t bytes;
// Stateful chunk index. If a p2p gets "cut" over two plans this keeps track
// of where it left off.
int chunk;
};
// Node of a singly linked list of CUDA streams.
struct ncclCudaStreamList {
// Next node, or null at the end of the list.
struct ncclCudaStreamList *next;
// The stream held by this node.
cudaStream_t stream;
};
// Collection of queued tasks: several queues of collectives, per-peer
// send/recv queues, and the user streams the tasks were issued on.
struct ncclTasks {
// Send and receive task queues for one peer.
struct Peer {
bool sendSeen, recvSeen;
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> sendQueue;
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> recvQueue;
};
// Queue of collectives.
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collQueue;
// Queue for user-tuned executed collectives
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collTunedQueue;
// Queue for continuous bytes distribution (CBD) collectives
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collCBDQueue;
// Queue for collnet
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collnetQueue;
size_t workBytesTotal;
int usableChannels;
bool sorted;
// One Peer entry per rank.
struct Peer* peers/*[nRanks]*/;
int *p2pSendOrder, *p2pRecvOrder;
int p2pOrderSteps;
// Number of collective and point-to-point tasks.
int nTasksColl, nTasksP2p;
// The list of user streams aggregated over all tasks present.
struct ncclCudaStreamList* streams;
// The most recent user stream. Ignored if streams==nullptr
cudaStream_t streamRecent;
// The graph capturing all user streams or invalid if none. Thus we restrict the
// user that all streams must be captured in the same graph or not captured
// at all. Technically we could probably relax this, but that would mean
// collecting a different `ncclTasks` per graph and one for non-graph.
struct ncclCudaGraph capturingGraph;
};
#endif
+28 -2
Zobrazit soubor
@@ -7,8 +7,33 @@
#ifndef NCCL_DEBUG_H_
#define NCCL_DEBUG_H_
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
// Log levels; this is the type of the first argument of ncclDebugLogger_t.
// The numeric values are explicit and must stay stable.
typedef enum {
NCCL_LOG_NONE = 0,
NCCL_LOG_VERSION = 1,
NCCL_LOG_WARN = 2,
NCCL_LOG_INFO = 3,
NCCL_LOG_ABORT = 4,
NCCL_LOG_TRACE = 5
} ncclDebugLogLevel;
// Logging subsystem categories. Each category owns one bit so several can be
// OR-ed together; NCCL_ALL has every bit set.
typedef enum {
  NCCL_INIT      = 1 << 0,
  NCCL_COLL      = 1 << 1,
  NCCL_P2P       = 1 << 2,
  NCCL_SHM       = 1 << 3,
  NCCL_NET       = 1 << 4,
  NCCL_GRAPH     = 1 << 5,
  NCCL_TUNING    = 1 << 6,
  NCCL_ENV       = 1 << 7,
  NCCL_ALLOC     = 1 << 8,
  NCCL_CALL      = 1 << 9,
  NCCL_PROXY     = 1 << 10,
  NCCL_NVLS      = 1 << 11,
  NCCL_BOOTSTRAP = 1 << 12,
  NCCL_REG       = 1 << 13,
  NCCL_PROFILE   = 1 << 14,
  NCCL_ALL       = ~0
} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
@@ -40,4 +65,5 @@ typedef enum {
#define NCCL_PROTO_LL128 1
#define NCCL_PROTO_SIMPLE 2
#define NCCL_ALGO_PROTO_IGNORE -1.0
#endif
+50 -6
Zobrazit soubor
@@ -11,6 +11,54 @@
#include "nccl.h"
#include "nccl_common.h"
// API to be implemented by external tuner
// Version 3 of the tuner plugin interface: a table of callbacks (init,
// getCollInfo, destroy) plus the tuner's name.
typedef struct {
// Name of the tuner
const char* name;
// Initializes tuner states.
// Inputs:
// - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
// - nNodes: number of nodes in current communicator.
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
// Outputs:
// - context: tuner context object
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
// Inputs:
// - context: tuner context object
// - collType: collective type, e.g., allreduce, allgather...
// - nBytes: collective size in bytes
// - numPipeOps: number of operations in the group
// - numAlgo: number of algorithms in collCostTable
// - numProto: number of protocols in collCostTable
//
// Outputs:
// - nChannels: number of channels (hence SMs) to be used.
//
// InOut:
// - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
// NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
//
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
// default tuning for the given collective.
// Also, the plugin is allowed to not set any output, or set only the
// algorithm and protocol, but not only the algorithm or only the protocol.
// Unset fields will be set automatically by NCCL.
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
int* nChannels);
// Terminates the plugin and cleans up any resources that the plugin allocated.
// context: tuner context object
ncclResult_t (*destroy)(void* context);
} ncclTuner_v3_t;
typedef ncclTuner_v3_t ncclTuner_t;
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"
// API to be implemented by external tuner
typedef struct {
// Name of the tuner
@@ -36,7 +84,7 @@ typedef struct {
//
// Outputs:
// - algorithm: selected algorithm to be used for the given collective
// - protocol: selected protocol to be used for the given collective
// - protocol: selected protocol to be used for the given collective
// - nChannels: number of channels (hence SMs) to be used.
//
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
@@ -46,15 +94,11 @@ typedef struct {
// Unset fields will be set automatically by NCCL.
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
int collNetSupport, int nvlsSupport, int numPipeOps,
int *algorithm, int *protocol, int* nChannels);
int* algorithm, int* protocol, int* nChannels);
// Terminates the plugin and cleans up any resources that the plugin allocated.
// context: tuner context object
ncclResult_t (*destroy)(void* context);
} ncclTuner_v2_t;
typedef ncclTuner_v2_t ncclTuner_t;
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
#endif
+3 -1
Zobrazit soubor
@@ -14,8 +14,10 @@
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
ncclResult_t ncclNetPluginInit();
ncclResult_t ncclNetPluginLoad(struct ncclComm* comm);
ncclResult_t ncclNetPluginUnload(struct ncclComm* comm);
ncclResult_t ncclNetInit(struct ncclComm* comm);
ncclResult_t ncclNetFinalize(struct ncclComm* comm);
int ncclNetVersion(struct ncclComm* comm);
// Test whether the current GPU support GPU Direct RDMA.
+38
Zobrazit soubor
@@ -253,6 +253,38 @@ typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t;
*/
#define nvmlGpuFabricInfo_v2 NVML_STRUCT_VERSION(GpuFabricInfo, 2)
/**
* Confidential Compute Feature Status values
*/
#define NVML_CC_SYSTEM_FEATURE_DISABLED 0
#define NVML_CC_SYSTEM_FEATURE_ENABLED 1
/**
 * Confidential Compute system state (local copy of the nvml.h definition).
 * NOTE(review): ccFeature presumably holds one of the
 * NVML_CC_SYSTEM_FEATURE_* values -- confirm against nvml.h.
 */
typedef struct nvmlConfComputeSystemState_st {
unsigned int environment;
unsigned int ccFeature;
unsigned int devToolsMode;
} nvmlConfComputeSystemState_t;
/**
* Confidential Compute Multigpu mode values
*/
#define NVML_CC_SYSTEM_MULTIGPU_NONE 0
#define NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE 1
/**
* Confidential Compute System settings
*/
typedef struct {
/* Struct version; see nvmlSystemConfComputeSettings_v1. */
unsigned int version;
unsigned int environment;
/* NOTE(review): presumably one of NVML_CC_SYSTEM_FEATURE_* -- confirm. */
unsigned int ccFeature;
unsigned int devToolsMode;
/* NOTE(review): presumably one of NVML_CC_SYSTEM_MULTIGPU_* -- confirm. */
unsigned int multiGpuMode;
} nvmlSystemConfComputeSettings_v1_t;
typedef nvmlSystemConfComputeSettings_v1_t nvmlSystemConfComputeSettings_t;
#define nvmlSystemConfComputeSettings_v1 NVML_STRUCT_VERSION(SystemConfComputeSettings, 1)
/* End of nvml.h */
#endif // NCCL_NVML_DIRECT
@@ -268,6 +300,11 @@ extern int ncclNvmlDeviceCount;
extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices];
extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices];
// Confidential Compute status as filled in by ncclNvmlGetCCStatus.
struct ncclNvmlCCStatus {
// True when Confidential Compute is enabled.
bool CCEnabled;
// True when the multi-GPU Confidential Compute mode is enabled.
bool multiGpuCCEnabled;
};
// All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly.
// Outsiders need only call it if they want to inspect the ncclNvml global
// tables above.
@@ -283,5 +320,6 @@ ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* ma
ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus);
ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values);
ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo);
ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status);
#endif // End include guard
+1 -1
Zobrazit soubor
@@ -63,7 +63,7 @@ class payload_schema {
nullptr,
NVTX_PAYLOAD_SCHEMA_TYPE_STATIC,
NVTX_PAYLOAD_SCHEMA_FLAG_NONE,
nullptr, 0, 0, 0};
nullptr, 0, 0, 0, 0, nullptr};
};
// Create NVTX push/pop range with parameters
+50 -46
Zobrazit soubor
@@ -25,9 +25,9 @@
*
* \section INITIALIZATION_SECTION Initialization
*
* Typically the tool's library that plugs into NVTX is indirectly
* loaded via environmental properties that are platform specific.
* For some platforms or special cases, the user may be required
* Typically the tool's library that plugs into NVTX is indirectly
* loaded via environmental properties that are platform specific.
* For some platforms or special cases, the user may be required
* to explicitly initialize instead. This can also
* be helpful to control when the API loads a tool's library instead
* of what would typically be the first function call to emit info.
@@ -37,16 +37,16 @@
*
* Markers and ranges are used to describe events at a specific time (markers)
* or over a time span (ranges) during the execution of the application
* respectively.
* respectively.
*
* \subsection MARKERS Markers
*
*
* Markers denote specific moments in time.
*
*
*
*
* See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on
* how to specify the domain.
*
*
* \subsection THREAD_RANGES Thread Ranges
*
* Thread ranges denote nested time ranges. Nesting is maintained per thread
@@ -59,9 +59,9 @@
*
* \subsection PROCESS_RANGES Process Ranges
*
* Process ranges denote a time span that can expose arbitrary concurrency, as
* Process ranges denote a time span that can expose arbitrary concurrency, as
* opposed to thread ranges that only support nesting. In addition the range
* start event can happen on a different thread than the end marker. For the
* start event can happen on a different thread than the end marker. For the
* correlation of a start/end pair an unique correlation ID is used that is
* returned from the start API call and needs to be passed into the end API
* call.
@@ -87,15 +87,15 @@
*
* The function ::nvtxDomainCreateA or ::nvtxDomainCreateW is used to create
* a named domain.
*
*
* Each domain maintains its own
* - categories
* - thread range stacks
* - registered strings
*
* The function ::nvtxDomainDestroy marks the end of the domain. Destroying
* a domain unregisters and destroys all objects associated with it such as
* registered strings, resource objects, named categories, and started ranges.
* The function ::nvtxDomainDestroy marks the end of the domain. Destroying
* a domain unregisters and destroys all objects associated with it such as
* registered strings, resource objects, named categories, and started ranges.
*
* \section RESOURCE_NAMING Resource Naming
*
@@ -105,41 +105,41 @@
* The functions can be called multiple times during the execution of an
* application, however, in that case it is implementation dependent which
* name will be reported by the tool.
*
*
* \subsection CATEGORY_NAMING Category Naming
*
* Some functions in this library support associating an integer category
* to enable filtering and sorting. The category naming functions allow
* the application to associate a user friendly name with the integer
* category. Support for domains has been added in NVTX_VERSION_2 to
* avoid collisions when domains are developed independently.
* Some functions in this library support associating an integer category
* to enable filtering and sorting. The category naming functions allow
* the application to associate a user friendly name with the integer
* category. Support for domains has been added in NVTX_VERSION_2 to
* avoid collisions when domains are developed independently.
*
* \subsection RESOURCE_OBJECTS Resource Objects
*
* Resource objects are a generic mechanism for attaching data to an application
* resource. The identifier field makes the association to a pointer or handle,
* while the type field helps provide deeper understanding of the identifier as
* Resource objects are a generic mechanism for attaching data to an application
* resource. The identifier field makes the association to a pointer or handle,
* while the type field helps provide deeper understanding of the identifier as
* well as enabling differentiation in cases where handles generated by different
* APIs may collide. The resource object may also have an associated message to
* associate with the application resource, enabling further annotation of this
* associate with the application resource, enabling further annotation of this
* object and how it is used.
*
*
* The resource object was introduced in NVTX_VERSION_2 to supersede existing naming
* functions and allow the application resource identified by those functions to be
* associated to a domain. The other naming functions are still supported for backward
* compatibility but will be associated only to the default domain.
*
* \subsection RESOURCE_NAMING_OS Resource Naming
*
* Some operating system resource creation APIs do not support providing a user friendly
* name, such as some OS thread creation APIs. This API supports resource naming
* both through resource objects and functions following the pattern
* nvtxName[RESOURCE_TYPE][A|W](identifier, name). Resource objects introduced in NVTX_VERSION 2
*
* Some operating system resource creation APIs do not support providing a user friendly
* name, such as some OS thread creation APIs. This API supports resource naming
* both through resource objects and functions following the pattern
* nvtxName[RESOURCE_TYPE][A|W](identifier, name). Resource objects introduced in NVTX_VERSION 2
* supersede the other functions with a more general method of assigning names to OS resources,
* along with associating them to domains too. The older nvtxName* functions are only associated
* along with associating them to domains too. The older nvtxName* functions are only associated
* with the default domain.
* \section EXTENSIONS Optional Extensions
* Optional extensions will either appear within the existing sections they extend or appear
* Optional extensions will either appear within the existing sections they extend or appear
* in the "Related Pages" when they introduce new concepts.
*/
@@ -159,7 +159,11 @@
#define NVTX_INLINE_STATIC __inline static
#else /*defined(__GNUC__)*/
#define NVTX_API
#if defined(__cplusplus) || (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
#define NVTX_INLINE_STATIC inline static
#else
#define NVTX_INLINE_STATIC __inline__ static
#endif
#endif /* Platform */
#if defined(NVTX_NO_IMPL)
@@ -212,7 +216,7 @@
extern "C" {
#endif /* __cplusplus */
/**
/**
* Result Codes
*/
@@ -281,12 +285,12 @@ typedef enum nvtxColorType_t
* ------------------------------------------------------------------------- */
typedef enum nvtxMessageType_t
{
NVTX_MESSAGE_UNKNOWN = 0, /**< Message payload is unused. */
NVTX_MESSAGE_UNKNOWN = 0, /**< Message attribute is unused. */
NVTX_MESSAGE_TYPE_ASCII = 1, /**< A character sequence is used as payload. */
NVTX_MESSAGE_TYPE_UNICODE = 2, /**< A wide character sequence is used as payload. */
/* NVTX_VERSION_2 */
NVTX_MESSAGE_TYPE_REGISTERED = 3, /**< A unique string handle that was registered
with \ref nvtxDomainRegisterStringA() or
with \ref nvtxDomainRegisterStringA() or
\ref nvtxDomainRegisterStringW(). */
} nvtxMessageType_t;
@@ -338,7 +342,7 @@ NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved);
* ------------------------------------------------------------------------- */
typedef enum nvtxPayloadType_t
{
NVTX_PAYLOAD_UNKNOWN = 0, /**< Color payload is unused. */
NVTX_PAYLOAD_UNKNOWN = 0, /**< Payload attribute is unused. */
NVTX_PAYLOAD_TYPE_UNSIGNED_INT64 = 1, /**< A 64 bit unsigned integer value is used as payload. */
NVTX_PAYLOAD_TYPE_INT64 = 2, /**< A 64 bit signed integer value is used as payload. */
NVTX_PAYLOAD_TYPE_DOUBLE = 3, /**< A 64 bit floating point value is used as payload. */
@@ -714,10 +718,10 @@ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message);
/* ------------------------------------------------------------------------- */
/** \brief Ends a process range.
*
* \param domain - The domain
* \param domain - The domain
* \param id - The correlation ID returned from a nvtxRangeStart call.
*
* \remarks This function is offered for completeness but is an alias for ::nvtxRangeEnd.
* \remarks This function is offered for completeness but is an alias for ::nvtxRangeEnd.
* It does not need a domain param since that is associated with the range ID at ::nvtxDomainRangeStartEx
*
* \par Example:
@@ -929,10 +933,10 @@ NVTX_DECLSPEC int NVTX_API nvtxRangePop(void);
/* ------------------------------------------------------------------------- */
/** \cond SHOW_HIDDEN
* \brief Resource typing helpers.
* \brief Resource typing helpers.
*
* Classes are used to make it easy to create a series of resource types
* per API without collisions
* Classes are used to make it easy to create a series of resource types
* per API without collisions
*/
#define NVTX_RESOURCE_MAKE_TYPE(CLASS, INDEX) ((((uint32_t)(NVTX_RESOURCE_CLASS_ ## CLASS))<<16)|((uint32_t)(INDEX)))
#define NVTX_RESOURCE_CLASS_GENERIC 1
@@ -1062,7 +1066,7 @@ typedef struct nvtxResourceAttributes_v0
int32_t identifierType; /* values from enums following the pattern nvtxResource[name]Type_t */
/**
* \brief Identifier for the resource.
* \brief Identifier for the resource.
* \anchor RESOURCE_IDENTIFIER_FIELD
*
* An identifier may be a pointer or a handle to an OS or middleware API object.
@@ -1093,7 +1097,7 @@ typedef struct nvtxResourceAttributes_v0
typedef struct nvtxResourceAttributes_v0 nvtxResourceAttributes_t;
/* \cond SHOW_HIDDEN
/* \cond SHOW_HIDDEN
* \version \NVTX_VERSION_2
*/
#define NVTX_RESOURCE_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxResourceAttributes_v0) ) )
@@ -1106,7 +1110,7 @@ typedef struct nvtxResourceHandle* nvtxResourceHandle_t;
/** \brief Create a resource object to track and associate data with OS and middleware objects
*
* Allows users to associate an API handle or pointer with a user-provided name.
*
*
*
* \param domain - Domain to own the resource object
* \param attribs - Attributes to be associated with the resource
@@ -1240,7 +1244,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t*
* POSIX pthread_t type returned by pthread_self() may not comply with these
* expectations. Please use OS-specific thread ID instead of pthread_t.
*
* The thread name is associated to the default domain. To support domains
* The thread name is associated to the default domain. To support domains
* use resource objects via ::nvtxDomainResourceCreate.
*
* \param threadId - The ID of the thread to name.
@@ -1457,7 +1461,7 @@ NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain);
} /* extern "C" */
#endif /* __cplusplus */
#define NVTX_IMPL_GUARD /* Ensure other headers cannot included directly */
#define NVTX_IMPL_GUARD /* Ensure other headers cannot be included directly */
#include "nvtxDetail/nvtxTypes.h"
+335
Zobrazit soubor
@@ -0,0 +1,335 @@
/**
* The NVTX counters extension is intended to collect counter values of various
* sources. It uses the NVTX payload extension to specify the data layout of a
* counter group.
*
* A counter group is a set of counters that are collected together (at the same
* time). Counters are always registered as a group. Hence, a single counter is
* represented by a group with one counter.
*
* A sample refers to all values for a given timestamp. These values must
* include counter values and may include multiple instances of a counter group.
*
* The NVTX domain handle is the first argument to all counter collect
* functions. 0/NULL/nullptr represents the default domain (no domain).
*/
#include "nvToolsExtPayload.h"
#ifndef NVTOOLSEXT_COUNTERS_H
#define NVTOOLSEXT_COUNTERS_H
/**
* \brief The compatibility ID is used for versioning of this extension.
*
* The `#ifndef` guard keeps any value that was defined before this header
* is included.
*/
#ifndef NVTX_EXT_COUNTERS_COMPATID
#define NVTX_EXT_COUNTERS_COMPATID 0x0101
#endif
/**
* \brief The module ID identifies the counters extension. It has to be unique
* among the extension modules.
*
* The `#ifndef` guard keeps any value that was defined before this header
* is included.
*/
#ifndef NVTX_EXT_COUNTERS_MODULEID
#define NVTX_EXT_COUNTERS_MODULEID 4
#endif
/**
* Predefined scope IDs.
*
* NVTX_SCOPE_NONE identifies an invalid scope and indicates an error if
* returned by `nvtxScopeRegister`.
*/
#define NVTX_SCOPE_NONE 0 /* no scope */
#define NVTX_SCOPE_ROOT 1
/* Hardware scopes */
#define NVTX_SCOPE_CURRENT_HW_MACHINE 2 /* Node/machine name, Device? */
#define NVTX_SCOPE_CURRENT_HW_SOCKET 3
#define NVTX_SCOPE_CURRENT_HW_CPU 4
#define NVTX_SCOPE_CURRENT_HW_CPU_LOGICAL 5
/* Innermost HW execution context at registration time */
#define NVTX_SCOPE_CURRENT_HW_INNERMOST 6
/* Virtualized hardware, virtual machines, OS (if you don't know any better) */
#define NVTX_SCOPE_CURRENT_HYPERVISOR 7
#define NVTX_SCOPE_CURRENT_VM 8
#define NVTX_SCOPE_CURRENT_KERNEL 9
#define NVTX_SCOPE_CURRENT_CONTAINER 10
#define NVTX_SCOPE_CURRENT_OS 11
/* Software scopes */
#define NVTX_SCOPE_CURRENT_SW_PROCESS 12 /* Process scope */
#define NVTX_SCOPE_CURRENT_SW_THREAD 13 /* Thread scope */
#define NVTX_SCOPE_CURRENT_SW_FIBER 14
/* Innermost SW execution context at registration time */
#define NVTX_SCOPE_CURRENT_SW_INNERMOST 15
/** Static (user-provided) scope IDs (feed forward) start at this value. */
#define NVTX_SCOPE_ID_STATIC_START (1 << 24)
/** Dynamically (tool) generated scope IDs start at this value. */
#define NVTX_SCOPE_ID_DYNAMIC_START 4294967296 /* 1 << 32; does not fit in 32 bits */
/**
* Identifier of the semantic extension for counters.
* NOTE(review): presumably the ID placed in the `header` of
* `nvtxSemanticsCounter_t` - confirm against the payload extension header.
*/
#define NVTX_SEMANTIC_ID_COUNTERS_V1 5
/*** Flags to augment the counter value. ***/
#define NVTX_COUNTERS_FLAG_NONE 0
/**
* Convert the fixed point value to a normalized floating point.
* Use the sign/unsign from the underlying type this flag is applied to.
* Unsigned [0f : 1f] or signed [-1f : 1f]
*/
#define NVTX_COUNTERS_FLAG_NORM (1 << 1)
/**
* Tools should apply scale and limits when graphing, ideally in a "soft" way
* to see when limits are exceeded.
*/
#define NVTX_COUNTERS_FLAG_LIMIT_MIN (1 << 2)
#define NVTX_COUNTERS_FLAG_LIMIT_MAX (1 << 3)
/* Both limit flags combined. */
#define NVTX_COUNTERS_FLAG_LIMITS \
(NVTX_COUNTERS_FLAG_LIMIT_MIN | NVTX_COUNTERS_FLAG_LIMIT_MAX)
/**
* Counter time scope.
* These are the values 1..4 shifted to bit 5, i.e. an enumerated field rather
* than independent bits (3 << 5 overlaps both 1 << 5 and 2 << 5).
*/
#define NVTX_COUNTERS_FLAG_TIME_POINT (1 << 5)
#define NVTX_COUNTERS_FLAG_TIME_SINCE_LAST (2 << 5)
#define NVTX_COUNTERS_FLAG_TIME_UNTIL_NEXT (3 << 5)
#define NVTX_COUNTERS_FLAG_TIME_SINCE_START (4 << 5)
/** Counter value type (values shifted to bit 10). **/
#define NVTX_COUNTERS_FLAG_VALUE_ABSOLUTE (1 << 10)
#define NVTX_COUNTERS_FLAG_VALUE_DELTA (2 << 10) /* delta to previous counter sample */
/** Counter visualization hints **/
#define NVTX_COUNTERS_FLAG_INTERPOLATE (1 << 14)
/**
* Datatypes for limits union (value of `limitType`).
* Presumably I64, U64 and F64 select the `i64`, `u64` and `d` members of
* `nvtxSemanticsCounter_t::limits` respectively - TODO confirm.
*/
#define NVTX_COUNTERS_LIMIT_I64 0
#define NVTX_COUNTERS_LIMIT_U64 1
#define NVTX_COUNTERS_LIMIT_F64 2
/** Reasons for the missing sample value (see `nvtxCountersSampleNoValue`). */
#define NVTX_COUNTERS_SAMPLE_ZERO 0
#define NVTX_COUNTERS_SAMPLE_UNCHANGED 1
#define NVTX_COUNTERS_SAMPLE_UNAVAILABLE 2
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/**
* \brief Specify additional properties of a counter or counter group.
*/
typedef struct nvtxSemanticsCounter_v1
{
/** Header of the semantic extension (with identifier, version, etc.). */
struct nvtxSemanticsHeader_v1 header;
/**
* Flags indicating whether normalization, scale, limits, etc. should be
* applied to counter values. Use defines `NVTX_COUNTERS_FLAG_*`.
*/
uint64_t flags;
/** Unit of the counter value (case insensitive) */
const char* unit;
/** Should be 1 if not used. */
uint64_t unitScaleNumerator;
/** Should be 1 if not used. */
uint64_t unitScaleDenominator;
/** Determines the used union member. Use defines `NVTX_COUNTERS_LIMIT_*`. */
int64_t limitType;
/**
* Soft graph limit. Each member holds two values.
* NOTE(review): presumably [0] is the minimum and [1] the maximum - confirm.
*/
union limits_t {
int64_t i64[2];
uint64_t u64[2];
double d[2];
} limits;
} nvtxSemanticsCounter_t;
/**
* \brief Attributes of a counter (group), passed to `nvtxCountersRegister`.
*/
typedef struct nvtxCountersAttr_v1
{
/** Size of this structure (presumably `sizeof(nvtxCountersAttr_t)` - confirm). */
size_t structSize;
/**
* A schema ID referring to the data layout of the counter group or a
* predefined NVTX payloads number type.
*/
uint64_t schemaId;
/** Name of the counter group. */
const char* name;
/** Identifier of the scope of the counters. */
uint64_t scopeId;
/**
* (Optional) Specify additional semantics for a counter (group). The
* semantics provided are applied to all counters in a group. If the
* semantics should only refer to a single counter in a group, the semantics
* field of the payload entry has to be used. Accepted semantics are
* `nvtxSemanticsCounter_t` and `nvtxSemanticsTime_t`.
*/
const nvtxSemanticsHeader_t* semantics;
} nvtxCountersAttr_t;
/* Forward declaration of opaque counter group registration structure */
struct nvtxCountersRegistration_st;
typedef struct nvtxCountersRegistration_st nvtxCountersRegistration;
/** \brief Counters Handle Structure.
* \anchor COUNTERS_HANDLE_STRUCTURE
*
* This structure is opaque to the user and is used as a handle to reference a counter group.
* This type is returned from tools when using the NVTX API to create a counter group.
*/
typedef nvtxCountersRegistration* nvtxCountersHandle_t;
/**
* \brief Batch of counter samples with separately stored timestamps,
* passed to `nvtxCountersSubmitBatchEx`.
*/
typedef struct nvtxCountersBatch_v1
{
/** Handle to attributes (data layout, scope, etc.) of a counter (group). */
nvtxCountersHandle_t hCounter;
/** Array of counter samples. */
const void* counters;
/** Size of the `counters` array (in bytes). */
size_t cntArrSize;
/** Array of timestamps or reference-time plus delta pair. `NULL` is used if
* timestamps are part of the counter (group) layout. */
const void* timestamps;
/** Size of the `timestamps` array or definition (in bytes). */
size_t tsSize;
} nvtxCountersBatch_t;
/**
* \brief Register a counter group.
*
* @param hDomain NVTX domain handle.
* @param attr Pointer to the attributes of the counter (group).
*
* @return Counter handle identifying a counter or counter group.
* The counter handle is unique within the NVTX domain.
*/
NVTX_DECLSPEC nvtxCountersHandle_t NVTX_API nvtxCountersRegister(
nvtxDomainHandle_t hDomain,
const nvtxCountersAttr_t* attr);
/**
* \brief Sample one integer counter by value immediately (the NVTX tool determines the timestamp).
*
* No timestamp is passed to this function.
*
* @param hDomain handle of the NVTX domain.
* @param hCounter handle of the NVTX counter (group).
* @param value 64-bit integer counter value.
*/
NVTX_DECLSPEC void NVTX_API nvtxCountersSampleInt64(
nvtxDomainHandle_t hDomain,
nvtxCountersHandle_t hCounter,
int64_t value);
/**
* \brief Sample one floating point counter by value immediately (the NVTX tool determines the timestamp).
*
* No timestamp is passed to this function.
*
* @param hDomain handle of the NVTX domain.
* @param hCounter handle of the NVTX counter (group).
* @param value 64-bit floating-point counter value.
*/
NVTX_DECLSPEC void NVTX_API nvtxCountersSampleFloat64(
nvtxDomainHandle_t hDomain,
nvtxCountersHandle_t hCounter,
double value);
/**
* \brief Sample a counter group by reference immediately (the NVTX tool determines the timestamp).
*
* @param hDomain handle of the NVTX domain.
* @param hCounter handle of the NVTX counter (group).
* @param values pointer to one or more counter values.
* @param size size of the counter value(s) in bytes.
*/
NVTX_DECLSPEC void NVTX_API nvtxCountersSample(
nvtxDomainHandle_t hDomain,
nvtxCountersHandle_t hCounter,
void* values,
size_t size);
/**
* \brief Sample without value.
*
* @param hDomain handle of the NVTX domain.
* @param hCounter handle of the NVTX counter (group).
* @param reason reason for the missing sample value
* (see the `NVTX_COUNTERS_SAMPLE_*` defines).
*/
NVTX_DECLSPEC void NVTX_API nvtxCountersSampleNoValue(
nvtxDomainHandle_t hDomain,
nvtxCountersHandle_t hCounter,
uint8_t reason);
/**
* \brief Submit a batch of counters in the given domain.
* Timestamps are part of the counter sample data.
*
* The size of a data sampling point is defined by the `staticSize` field of the
* payload schema. An NVTX tool can assume that the counter samples are stored
* as an array with each entry being `staticSize` bytes.
*
* @param hDomain handle of the NVTX domain.
* @param hCounter handle of the counter group (includes counter data decoding schema).
* @param counters blob containing counter data and timestamps.
* @param size size of the counter data blob in bytes.
*/
NVTX_DECLSPEC void NVTX_API nvtxCountersSubmitBatch(
nvtxDomainHandle_t hDomain,
nvtxCountersHandle_t hCounter,
const void* counters,
size_t size);
/**
* \brief Submit a batch of counters in the given domain.
* Timestamps are separated from the counter data.
*
* @param hDomain handle of the NVTX domain.
* @param counterBatch Pointer to the counter data to be submitted
* (see `nvtxCountersBatch_t`).
*/
NVTX_DECLSPEC void NVTX_API nvtxCountersSubmitBatchEx(
nvtxDomainHandle_t hDomain,
const nvtxCountersBatch_t* counterBatch);
/*
* One ID per function of the counters extension, numbered consecutively from 0.
* NOTE(review): presumably used as callback/slot indices by
* nvtxDetail/nvtxExtImplCounters_v1.h - confirm there.
*/
#define NVTX3EXT_CBID_nvtxCountersRegister 0
#define NVTX3EXT_CBID_nvtxCountersSampleInt64 1
#define NVTX3EXT_CBID_nvtxCountersSampleFloat64 2
#define NVTX3EXT_CBID_nvtxCountersSample 3
#define NVTX3EXT_CBID_nvtxCountersSampleNoValue 4
#define NVTX3EXT_CBID_nvtxCountersSubmitBatch 5
#define NVTX3EXT_CBID_nvtxCountersSubmitBatchEx 6
#ifdef __GNUC__
#pragma GCC visibility push(internal)
#endif
#define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot be included directly */
#include "nvtxDetail/nvtxExtTypes.h"
#undef NVTX_EXT_TYPES_GUARD
#ifndef NVTX_NO_IMPL
#define NVTX_EXT_IMPL_COUNTERS_GUARD /* Ensure other headers cannot be included directly */
#include "nvtxDetail/nvtxExtImplCounters_v1.h"
#undef NVTX_EXT_IMPL_COUNTERS_GUARD
#endif /*NVTX_NO_IMPL*/
#ifdef __GNUC__
#pragma GCC visibility pop
#endif
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif /* NVTOOLSEXT_COUNTERS_H */
+2 -2
Zobrazit soubor
@@ -30,7 +30,7 @@ extern "C" {
*/
/* ------------------------------------------------------------------------- */
/* \cond SHOW_HIDDEN
/* \cond SHOW_HIDDEN
* \brief Used to build a non-colliding value for resource types separated class
* \version \NVTX_VERSION_2
*/
@@ -133,7 +133,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name)
#endif /* __cplusplus */
#ifndef NVTX_NO_IMPL
#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot included directly */
#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot be included directly */
#include "nvtxDetail/nvtxImplCuda_v3.h"
#undef NVTX_IMPL_GUARD_CUDA
#endif /*NVTX_NO_IMPL*/
+2 -2
Zobrazit soubor
@@ -31,7 +31,7 @@ extern "C" {
*/
/* ------------------------------------------------------------------------- */
/* \cond SHOW_HIDDEN
/* \cond SHOW_HIDDEN
* \brief Used to build a non-colliding value for resource types separated class
* \version \NVTX_VERSION_2
*/
@@ -109,7 +109,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t*
#endif /* __cplusplus */
#ifndef NVTX_NO_IMPL
#define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot included directly */
#define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot be included directly */
#include "nvtxDetail/nvtxImplCudaRt_v3.h"
#undef NVTX_IMPL_GUARD_CUDART
#endif /*NVTX_NO_IMPL*/
+694
Zobrazit soubor
@@ -0,0 +1,694 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "nvToolsExt.h"
#ifndef NVTOOLSEXTV3_MEM_V1
#define NVTOOLSEXTV3_MEM_V1
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* Module ID of the memory extension. */
#define NVTX_EXT_MODULEID_MEM 1
/* \cond SHOW_HIDDEN
* \brief A compatibility ID value used in structures and initialization to
* identify version differences.
*
* The `extCompatID` field of the structures in this header is set to this value.
*/
#define NVTX_EXT_COMPATID_MEM 0x0102
/* \cond SHOW_HIDDEN
* \brief This value is returned by functions that return `nvtxMemHeapHandle_t`,
* if a tool is not attached.
*
* The value is -1 converted to the handle type via `intptr_t`.
*/
#define NVTX_MEM_HEAP_HANDLE_NO_TOOL ((nvtxMemHeapHandle_t)(intptr_t)-1)
/* \cond SHOW_HIDDEN
* \brief This value is returned by functions that return `nvtxMemRegionHandle_t`
* if a tool is not attached.
*
* The value is -1 converted to the handle type via `intptr_t`.
*/
#define NVTX_MEM_REGION_HANDLE_NO_TOOL ((nvtxMemRegionHandle_t)(intptr_t)-1)
/* \cond SHOW_HIDDEN
* \brief This value is returned by functions that return `nvtxMemPermissionsHandle_t`
* if a tool is not attached.
*
* The constant -1 is converted through `intptr_t` first, like the other
* `*_NO_TOOL` handles, so that the integer already has pointer width when it
* is converted to the handle type.
*/
#define NVTX_MEM_PERMISSIONS_HANDLE_NO_TOOL ((nvtxMemPermissionsHandle_t)(intptr_t)-1)
/* \cond SHOW_HIDDEN
* \brief This should not be used and is considered an error but defined to
* detect an accidental use of zero or NULL.
*
* Relates to heap usage values (`NVTX_MEM_HEAP_USAGE_*`).
*/
#define NVTX_MEM_HEAP_USAGE_UNKNOWN 0x0
/* \cond SHOW_HIDDEN
* \brief This should not be used and is considered an error but defined to
* detect an accidental use of zero or NULL.
*
* Relates to memory type values (`NVTX_MEM_TYPE_*`).
*/
#define NVTX_MEM_TYPE_UNKNOWN 0x0
/* ------------------------------------------------------------------------- */
/** \defgroup MEMORY Memory
* See page \ref PAGE_MEMORY.
* @{
*/
/**
* \brief To indicate the full process virtual address space as a heap for
* functions where a nvtxMemHeapHandle_t is accepted.
*
* By default the heap always has read-write-execute permissions without creating regions.
* Regions created in this heap have read-write access by default but not execute.
*/
#define NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE ((nvtxMemHeapHandle_t)0)
/** \brief This heap is a sub-allocator.
*
* A heap created with this usage should not be accessed by the user until regions are registered.
* Regions from a heap with this usage have read-write access by default but not execute.
*/
#define NVTX_MEM_HEAP_USAGE_TYPE_SUB_ALLOCATOR 0x1
/**
* \brief This is a heap of memory that has an explicit layout.
*
* The layout could be static or dynamic (calculated). This often represents an algorithm's
* structures that are packed together. By default this heap is assumed to be accessible for
* scopes where the memory is naturally accessible by hardware. Regions may be used to further
* annotate or restrict access. A tool may have an option to be more strict, but special
* consideration must be made for `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`.
*
* The behavior of this usage is similar to NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE but
* a tool can use it to track special behaviors and reservation.
*
* Memory in a heap with this usage has read-write permissions by default but not execute without
* creating regions. Regions created in this heap have the same default permission access.
*/
#define NVTX_MEM_HEAP_USAGE_TYPE_LAYOUT 0x2
/**
* \brief Standard process userspace virtual addresses for linear allocations.
*
* APIs that map into this space, such as CUDA UVA, should use this type.
*
* Relevant functions: cudaMalloc, cudaMallocManaged, cudaHostAlloc, cudaMallocHost.
* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is supported.
*
* nvtxMemHeapRegister receives a heapDesc of type nvtxMemVirtualRangeDesc_t.
*/
#define NVTX_MEM_TYPE_VIRTUAL_ADDRESS 0x1
/**
* \brief To indicate you are modifying permissions of the process-wide
* full virtual address space.
*
* This is a companion object to `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`.
*/
#define NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE ((nvtxMemPermissionsHandle_t)0)
/*
* Creation flags for `nvtxMemPermissionsCreate` (its `creationflags` argument).
* The non-zero values are distinct bits; see `nvtxMemPermissionsCreate` for how
* the EXCLUDE_GLOBAL_* flags interact with global permissions.
*/
#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_NONE 0x0
#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_READ 0x1
#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE 0x2
#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_ATOMIC 0x4
/* \cond SHOW_HIDDEN
* \brief Forward declaration of opaque memory heap structure.
*
* Only the tag is declared in this header; users handle it through a pointer.
*/
struct nvtxMemHeap_v1;
typedef struct nvtxMemHeap_v1 nvtxMemHeap_t;
/** \brief A handle returned by a tool to represent a memory heap. */
typedef nvtxMemHeap_t* nvtxMemHeapHandle_t;
/* \cond SHOW_HIDDEN
* \brief Forward declaration of opaque memory region structure.
*/
struct nvtxMemRegion_v1;
typedef struct nvtxMemRegion_v1 nvtxMemRegion_t;
/** \brief A handle returned by a tool to represent a memory region. */
typedef nvtxMemRegion_t* nvtxMemRegionHandle_t;
/** \brief A reference to a memory region (by pointer or handle).
*
* Which member of the union is used is determined by a type or flag field
* outside of the union (see the `NVTX_MEM_REGION_REF_TYPE_*` values).
*/
typedef union nvtxMemRegionRef_t
{
void const* pointer;
nvtxMemRegionHandle_t handle;
} nvtxMemRegionRef_t;
/* \cond SHOW_HIDDEN
* \brief Forward declaration of opaque memory permissions structure.
*/
struct nvtxMemPermissions_v1;
typedef struct nvtxMemPermissions_v1 nvtxMemPermissions_t;
/** \brief A handle returned by a tool to represent a memory permissions mask. */
typedef nvtxMemPermissions_t* nvtxMemPermissionsHandle_t;
/**
* \brief Describes a range of virtual memory by a pointer and a size.
*
* This is the descriptor type used with `NVTX_MEM_TYPE_VIRTUAL_ADDRESS`.
*/
typedef struct nvtxMemVirtualRangeDesc_v1
{
size_t size; /* Size of the range (presumably in bytes - confirm). */
void const* ptr; /* Pointer to the range. */
} nvtxMemVirtualRangeDesc_v1 ;
typedef nvtxMemVirtualRangeDesc_v1 nvtxMemVirtualRangeDesc_t;
/** \brief Structure to describe a heap in process virtual memory. */
typedef struct nvtxMemHeapDesc_v1
{
uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
uint16_t structSize; /* Size of the structure. */
uint32_t reserved0;
/** \brief Usage characteristics of the heap
*
* Usage characteristics help tools like memcheckers, sanitizers,
* as well as other debugging and profiling tools to determine some
* special behaviors they should apply to the heap and its regions.
* The value follows the convention NVTX_MEM_HEAP_USAGE_*
*
* Default Value is 0, which is invalid.
*/
uint32_t usage;
/** \brief Memory type characteristics of the heap
*
* The 'type' indicates how to interpret the ptr field of the heapDesc.
* This is intended to support many additional types of memory, beyond
* standard process virtual memory, such as API specific memory only
* addressed by handles or multi-dimensional memory requiring more complex
* descriptions to handle features like strides, tiling, or interlace.
*
* The value conforms to NVTX_MEM_TYPE_*
*
* The value in the field 'type' identifies the descriptor type that will
* be in the field 'typeSpecificDesc'. 'typeSpecificDesc' is void* because
* it is extensible. Example usage is if type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS,
* then typeSpecificDesc points to a nvtxMemVirtualRangeDesc_t.
*
* Default Value is 0, which is invalid.
*/
uint32_t type;
/** \brief Size of the heap memory descriptor pointed to by typeSpecificDesc
*
* Default Value is 0, which is invalid.
*/
size_t typeSpecificDescSize;
/** \brief Pointer to the heap memory descriptor
*
* The value in the field 'type' identifies the descriptor type that will
* be in the field 'typeSpecificDesc'. 'typeSpecificDesc' is void* because
* it is extensible. Example usage is if type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS,
* then typeSpecificDesc points to a nvtxMemVirtualRangeDesc_t.
*
* Default Value is 0, which is invalid.
*/
void const* typeSpecificDesc;
/** \brief ID of the category the event is assigned to.
*
* A category is a user-controlled ID that can be used to group
* events. The tool may use category IDs to improve filtering or
* enable grouping of events in the same category. The functions
* \ref ::nvtxNameCategoryA or \ref ::nvtxNameCategoryW can be used
* to name a category.
*
* Default Value is 0.
*/
uint32_t category;
/** \brief Message type specified in this attribute structure.
*
* Defines the message format of the attribute structure's \ref MESSAGE_FIELD
* "message" field.
*
* Default Value is `NVTX_MESSAGE_UNKNOWN`.
*/
uint32_t messageType; /* nvtxMessageType_t */
/** \brief Message assigned to this attribute structure. \anchor MESSAGE_FIELD
*
* The text message that is attached to an event.
*/
nvtxMessageValue_t message;
} nvtxMemHeapDesc_v1 ;
typedef nvtxMemHeapDesc_v1 nvtxMemHeapDesc_t;
/**
* \brief Create a memory heap to represent an object or range of memory that will be further
* sub-divided into regions.
*
* The handle used to address the heap will depend on the heap's type. Where the heap is virtual
* memory accessible, the address of the heap's memory itself is its handle. This will likewise
* be returned from the function.
*
* For more advanced types, where the heap is not virtual memory accessible, the tools may be
* responsible for returning a void const * that uniquely identifies the object. Please see
* the description of each heap type for more details on whether this is expected to be
* uniquely generated by the tool or otherwise.
*
* \param domain - The domain.
* \param desc - Description of the heap (see nvtxMemHeapDesc_t).
*/
NVTX_DECLSPEC nvtxMemHeapHandle_t NVTX_API nvtxMemHeapRegister(
nvtxDomainHandle_t domain,
nvtxMemHeapDesc_t const* desc);
/** \brief Destroy a memory heap.
*
* \param domain - The domain.
* \param heap - Handle of the heap to destroy.
*/
NVTX_DECLSPEC void NVTX_API nvtxMemHeapUnregister(
nvtxDomainHandle_t domain,
nvtxMemHeapHandle_t heap);/* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported */
/**
* \brief Resetting the memory heap wipes out any changes, as if it were a fresh heap.
*
* This includes invalidating all regions and their handles.
*
* \param domain - The domain.
* \param heap - Handle of the heap to reset.
*/
NVTX_DECLSPEC void NVTX_API nvtxMemHeapReset(
nvtxDomainHandle_t domain,
nvtxMemHeapHandle_t heap); /* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is supported */
/**
* \brief Batch description for registering regions of memory inside of a heap.
*
* The heap field refers to the heap within which the regions reside. This can be from
* `nvtxMemHeapRegister`, `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`, or one provided
* from other extension API.
*
* The regionType field defines which type is used in regionDescElements.
* The most commonly used type is `NVTX_MEM_TYPE_VIRTUAL_ADDRESS`.
* In this case regionDescElements is an array of `nvtxMemVirtualRangeDesc_t`.
*
* The regionCount field is how many elements are in regionDescElements and
* regionHandleElementsOut.
*
* The regionHandleElementsOut field points to an array where the tool will provide region
* handles. If a pointer is provided, it is expected to have regionCount elements. This pointer
* can be NULL if regionType is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. In this case, the user can use the
* pointer to the virtual memory to reference the region in other related functions which accept
* nvtxMemRegionRef_t.
*/
typedef struct nvtxMemRegionsRegisterBatch_v1
{
uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
uint16_t structSize; /* Size of the structure. */
uint32_t regionType; /* NVTX_MEM_TYPE_* */
nvtxMemHeapHandle_t heap;
size_t regionCount;
size_t regionDescElementSize; /* Size of one element of regionDescElements (presumably in bytes - confirm). */
void const* regionDescElements; /* This will also become the handle for this region. */
nvtxMemRegionHandle_t* regionHandleElementsOut; /* Optional output array for the region handles (see above). */
} nvtxMemRegionsRegisterBatch_v1;
typedef nvtxMemRegionsRegisterBatch_v1 nvtxMemRegionsRegisterBatch_t;
/** \brief Register regions of memory inside of a heap of linear process virtual memory.
*
* \param domain - The domain.
* \param desc - Batch of regions to register (see nvtxMemRegionsRegisterBatch_t).
*/
NVTX_DECLSPEC void NVTX_API nvtxMemRegionsRegister(
nvtxDomainHandle_t domain,
nvtxMemRegionsRegisterBatch_t const* desc);
/**
* \brief Batch description for resizing regions of memory.
*
* The regionType field defines which type is used in regionDescElements.
* The most commonly used type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS.
*
* The regionDescCount field is how many elements are in regionDescElements.
*
* NOTE(review): the previous description of this structure was a copy of the
* one for the register batch; it referred to a heap and to an output array of
* region handles (which could be NULL for NVTX_MEM_TYPE_VIRTUAL_ADDRESS).
* This structure has neither field - confirm the intended semantics.
*/
typedef struct nvtxMemRegionsResizeBatch_v1
{
uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
uint16_t structSize; /* Size of the structure. */
uint32_t regionType; /* NVTX_MEM_TYPE_* */
size_t regionDescCount;
size_t regionDescElementSize; /* Size of one element of regionDescElements (presumably in bytes - confirm). */
void const* regionDescElements; /* This will also become the handle for this region. */
} nvtxMemRegionsResizeBatch_v1;
typedef nvtxMemRegionsResizeBatch_v1 nvtxMemRegionsResizeBatch_t;
/** \brief Resize regions of memory inside of a heap of linear process virtual memory.
*
* \param domain - The domain.
* \param desc - Batch of regions to resize (see nvtxMemRegionsResizeBatch_t).
*/
NVTX_DECLSPEC void NVTX_API nvtxMemRegionsResize(
nvtxDomainHandle_t domain,
nvtxMemRegionsResizeBatch_t const* desc);
/*
* Kinds of region reference, used in the `refType` / `regionRefType` fields.
* Presumably POINTER and HANDLE select the `pointer` and `handle` members of
* nvtxMemRegionRef_t respectively - confirm.
*/
#define NVTX_MEM_REGION_REF_TYPE_UNKNOWN 0x0
#define NVTX_MEM_REGION_REF_TYPE_POINTER 0x1
#define NVTX_MEM_REGION_REF_TYPE_HANDLE 0x2
/**
* \brief Batch description for unregistering regions of memory.
*
* The refType field states how the regions in refElements are referenced
* (NVTX_MEM_REGION_REF_TYPE_*).
*
* The refCount field is how many elements are in refElements.
*
* NOTE(review): the previous description of this structure was a copy of the
* one for the register batch; it referred to a heap, a regionType and an
* output array of region handles. This structure has none of these fields -
* confirm the intended semantics.
*/
typedef struct nvtxMemRegionsUnregisterBatch_v1
{
uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
uint16_t structSize; /* Size of the structure. */
uint32_t refType; /* NVTX_MEM_REGION_REF_TYPE_* */
size_t refCount; /* count of elements in refElements */
size_t refElementSize; /* Size of one element of refElements (presumably in bytes - confirm). */
nvtxMemRegionRef_t const* refElements; /* References to the regions to unregister. */
} nvtxMemRegionsUnregisterBatch_v1;
typedef nvtxMemRegionsUnregisterBatch_v1 nvtxMemRegionsUnregisterBatch_t;
/**
* \brief Unregistration for regions of process virtual memory
*
* This is not necessary if the heap destroy function (`nvtxMemHeapUnregister`)
* has been called for the heap that contains this object.
*
* \param domain - The domain.
* \param desc - Batch of regions to unregister (see nvtxMemRegionsUnregisterBatch_t).
*/
NVTX_DECLSPEC void NVTX_API nvtxMemRegionsUnregister(
nvtxDomainHandle_t domain,
nvtxMemRegionsUnregisterBatch_t const* desc);
/** \brief Describes the name to assign to one region (element of nvtxMemRegionsNameBatch_t). */
typedef struct nvtxMemRegionNameDesc_v1
{
uint32_t regionRefType; /* NVTX_MEM_REGION_REF_TYPE_* */
uint32_t nameType; /* nvtxMessageType_t */
nvtxMemRegionRef_t region; /* The region to name, referenced as stated by regionRefType. */
nvtxMessageValue_t name; /* The name, in the format stated by nameType. */
uint32_t category;
uint32_t reserved0;
} nvtxMemRegionNameDesc_v1;
typedef nvtxMemRegionNameDesc_v1 nvtxMemRegionNameDesc_t;
/** \brief Batch description for naming regions, passed to `nvtxMemRegionsName`. */
typedef struct nvtxMemRegionsNameBatch_v1
{
uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
uint16_t structSize; /* Size of the structure. */
uint32_t reserved0;
size_t regionCount; /* Presumably the number of elements in regionElements - confirm. */
size_t regionElementSize; /* Size of one element of regionElements (presumably in bytes - confirm). */
nvtxMemRegionNameDesc_t const* regionElements;
size_t reserved1;
} nvtxMemRegionsNameBatch_v1 ;
typedef nvtxMemRegionsNameBatch_v1 nvtxMemRegionsNameBatch_t;
/** \brief Name or rename a region.
*
* \param domain - The domain.
* \param desc - Batch of region names (see nvtxMemRegionsNameBatch_t).
*/
NVTX_DECLSPEC void NVTX_API nvtxMemRegionsName(
nvtxDomainHandle_t domain,
nvtxMemRegionsNameBatch_t const* desc);
/** \brief There are no permissions for this memory. */
#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_NONE 0x0
/** \brief The memory is readable. */
#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_READ 0x1
/** \brief The memory is writable. */
#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_WRITE 0x2
/** \brief The memory is for atomic RW. */
#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_ATOMIC 0x4
/**
* \brief The memory access permissions are reset for a region.
*
* This is as if never set, rather than documented defaults. As a result any flags
* indicating how unspecified regions are handled will affect this area.
*
* This should not be used with READ, WRITE, nor ATOMIC, as those flags would have no effect.
*/
#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_RESET 0x8
/** \brief Describes the permissions to assign to one region (element of nvtxMemPermissionsAssignBatch_t). */
typedef struct nvtxMemPermissionsAssignRegionDesc_v1
{
uint32_t flags; /* NVTX_MEM_PERMISSIONS_REGION_FLAGS_* */
uint32_t regionRefType; /* NVTX_MEM_REGION_REF_TYPE_* */
nvtxMemRegionRef_t region; /* The region, referenced as stated by regionRefType. */
} nvtxMemPermissionsAssignRegionDesc_v1 ;
typedef nvtxMemPermissionsAssignRegionDesc_v1 nvtxMemPermissionsAssignRegionDesc_t;
/** \brief Batch description for assigning permissions, passed to `nvtxMemPermissionsAssign`. */
typedef struct nvtxMemPermissionsAssignBatch_v1
{
uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
uint16_t structSize; /* Size of the structure. */
uint32_t reserved0;
nvtxMemPermissionsHandle_t permissions; /* Permissions object the assignments apply to. */
size_t regionCount; /* Presumably the number of elements in regionElements - confirm. */
size_t regionElementSize; /* Size of one element of regionElements (presumably in bytes - confirm). */
nvtxMemPermissionsAssignRegionDesc_t const* regionElements;
size_t reserved1;
} nvtxMemPermissionsAssignBatch_v1 ;
typedef nvtxMemPermissionsAssignBatch_v1 nvtxMemPermissionsAssignBatch_t;
/** \brief Change the permissions of a region of process virtual memory.
*
* \param domain - The domain.
* \param desc - Batch of permission assignments (see nvtxMemPermissionsAssignBatch_t).
*/
NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsAssign(
nvtxDomainHandle_t domain,
nvtxMemPermissionsAssignBatch_t const* desc);
/**
* \brief Create a permissions object for fine grain thread-local control in
* multi-threading scenarios
*
* Unlike the global permissions object (NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE), a new
* permissions object is empty. There are no regions registered to it, so more memory is accessible
* if bound(bind) without calls to nvtxMemPermissionsSetAccess* first. The permissions are not
* active until nvtxMemPermissionsBind. See `nvtxMemPermissionsBind` for more details.
*
* Use the flags NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_* to control how the regions in
* this permission object will interact with global permissions when bound. You may choose to
* either replace global memory region settings or overlay on top of them. The most common uses are
* as follows:
* * To limit tools to validate writing exclusively specified in this object but inherit all
* global read access regions use `NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE`
* * To limit tools to validate both read & write permissions exclusively specified in this
* object use NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_READ
* & NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE
*
* Also see `nvtxMemPermissionsBind` & `nvtxMemPermissionsSetAccess*`.
*
* \param domain - The domain.
* \param creationflags - Combination of NVTX_MEM_PERMISSIONS_CREATE_FLAGS_*.
*/
NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemPermissionsCreate(
nvtxDomainHandle_t domain,
int32_t creationflags); /* NVTX_MEM_PERMISSIONS_CREATE_FLAGS_* */
/**
* \brief Destroy the permissions object.
*
* If the object is currently bound, destroying it will also unbind it.
*
* \param domain - The domain.
* \param permissionsHandle - The permissions object to destroy.
*/
NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsDestroy(
nvtxDomainHandle_t domain,
nvtxMemPermissionsHandle_t permissionsHandle); /* only supported on objects from nvtxMemPermissionsCreate */
/** \brief Reset the permissions object back to its created state.
*
* \param domain - The domain.
* \param permissionsHandle - The permissions object to reset.
*/
NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsReset(
nvtxDomainHandle_t domain,
nvtxMemPermissionsHandle_t permissionsHandle);
/* NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE and other special handles are supported */
/* Bind flags for nvtxMemPermissionsBind, listed in ascending bit order. */

/** \brief No strict flag is set. */
#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_NONE 0

/** \brief Upon binding with the thread, exclude parent scope read regions instead of
* overlaying on top of them.
*
* EX: After eliminating any errors found when applying strict writes, a developer may then
* choose to annotate and enforce strict read behaviors in segments of code.
**/
#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_READ (1 << 0)

/** \brief Upon binding with the thread, exclude parent scope write regions instead of
* overlaying on top of them.
*
* EX: A developer may choose to first prevent all writes except the ones specified, to avoid
* out-of-bounds writes, since there are typically fewer regions written to than read from.
**/
#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_WRITE (1 << 1)

/** \brief Upon binding with the thread, exclude parent scope atomic RW regions instead of
* overlaying on top of them.
*
* EX: After eliminating any errors from reads and writes, a developer may choose to ensure
* that atomics are in their own region, removing standard read/write and replacing it with
* this strict atomic-only access. This way they know that conventional reads or writes
* will not cause unexpected issues.
**/
#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_ATOMIC (1 << 2)
/* Bind scopes for nvtxMemPermissionsBind / nvtxMemPermissionsUnbind. */

/** \brief Unknown scope. */
#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_UNKNOWN 0

/**
* \brief Bind to CPU thread scope.
*
* Tools should validate that the local thread's execution honors the bound permissions as
* well as the state of NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE at the time of binding.
* If nothing is bound, NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE should be used to validate
* the memory.
*
* Not all tools will support every scope (for example, a GPU sanitizer).
*/
#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_CPU_THREAD 1

/**
* \brief Bind to CUDA stream scope.
*
* Work enqueued to a CUDA stream should be validated by the tool, when it executes,
* against the permissions object as it was at the point of binding, as well as against
* the appropriate nvtxMemCudaGetDevicePermissions at the time of binding. If nothing is
* bound, nvtxMemCudaGetDevicePermissions at the time of stream enqueue should be used to
* validate the memory.
*
* This could apply to work done on the GPU, like a kernel launch, or to CPU-based
* callbacks like cudaStreamAddCallback, if the tool supports it.
*
* Binding is applied locally to a CPU thread, so that if N CPU threads are enqueuing
* work to the same stream (like the default stream) there cannot be a race condition
* between one thread binding and another launching its work. I.e. users should expect
* the permissions bound in the thread to be honored by the subsequent work (launches,
* copies, etc.) invoked from that CPU thread until unbound.
*/
#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM 2
/**
* \brief Bind the permissions object into a particular scope on the caller thread
*
* Permissions do not take effect until binding. Binding permissions is a thread-local
* activity that overrides global behaviors. This is to avoid multi-threaded race conditions.
*
* The scope dictates what type of processing the permissions apply to and, in some cases, when.
* EX1: NVTX_MEM_PERMISSIONS_BIND_SCOPE_CPU_THREAD applies to CPU code accessing memory while bound.
* EX2: NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM applies to CUDA streams, and the permissions
* must be recorded and applied when the work in the stream dequeues to execute. In this case
* it could be GPU or CPU, if the tool supports both.
*
* Bind can be called again on the same object and thread to pick up any updates to the
* specified permissions object or the inherited properties.
*
* Bind flags change how the binding process inherits region access control.
* In the case of thread scope this is NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE and from CUDA_STREAM
* this is nvtxMemCudaGetDevicePermissions. Choosing stricter modes allows the user to
* further reduce the access with less work, since memory, by default, behaves as natural
* until the NVTX annotations instruct a tool to treat it another way. See the strict flags
* (NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_*) for more details.
*
* \param domain NVTX domain handle.
* \param permissions Permissions object to bind.
* \param bindScope One of NVTX_MEM_PERMISSIONS_BIND_SCOPE_*.
* \param bindFlags NVTX_MEM_PERMISSIONS_BIND_FLAGS_* value(s).
*
* Also see nvtxMemPermissionsUnbind
*/
NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsBind(
nvtxDomainHandle_t domain,
nvtxMemPermissionsHandle_t permissions, /* special object like NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE are not supported */
uint32_t bindScope, /* NVTX_MEM_PERMISSIONS_BIND_SCOPE_* */
uint32_t bindFlags); /* NVTX_MEM_PERMISSIONS_BIND_FLAGS_* */
/**
* \brief Unbind the permissions object bound to the caller thread.
*
* Upon unbind, the thread-local permissions for a scope are restored to the default
* behavior defined by the scope.
*
* \param domain NVTX domain handle.
* \param bindScope Scope to unbind; presumably one of NVTX_MEM_PERMISSIONS_BIND_SCOPE_*,
* as for nvtxMemPermissionsBind.
*/
NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsUnbind(
nvtxDomainHandle_t domain,
uint32_t bindScope);
/** @} */ /*END defgroup*/
/* Callback IDs of the memory extension API functions. */
typedef enum NvtxExtMemCallbackId
{
    /* ID 0 is reserved as invalid. */

    /* Heap functions */
    NVTX3EXT_CBID_nvtxMemHeapRegister                  = 1,
    NVTX3EXT_CBID_nvtxMemHeapUnregister                = 2,
    NVTX3EXT_CBID_nvtxMemHeapReset                     = 3,

    /* Region functions */
    NVTX3EXT_CBID_nvtxMemRegionsRegister               = 4,
    NVTX3EXT_CBID_nvtxMemRegionsResize                 = 5,
    NVTX3EXT_CBID_nvtxMemRegionsUnregister             = 6,
    NVTX3EXT_CBID_nvtxMemRegionsName                   = 7,

    /* Permission functions */
    NVTX3EXT_CBID_nvtxMemPermissionsAssign             = 8,
    NVTX3EXT_CBID_nvtxMemPermissionsCreate             = 9,
    NVTX3EXT_CBID_nvtxMemPermissionsDestroy            = 10,
    NVTX3EXT_CBID_nvtxMemPermissionsReset              = 11,
    NVTX3EXT_CBID_nvtxMemPermissionsBind               = 12,
    NVTX3EXT_CBID_nvtxMemPermissionsUnbind             = 13,

    /* IDs 14-16 belong to the CUDA runtime part (nvtxExtImplMemCudaRt_v1.h). */
    NVTX3EXT_CBID_nvtxMemCudaGetProcessWidePermissions = 14,
    NVTX3EXT_CBID_nvtxMemCudaGetDeviceWidePermissions  = 15,
    NVTX3EXT_CBID_nvtxMemCudaSetPeerAccess             = 16,

    /* One past the highest callback ID. */
    NVTX3EXT_CBID_MEM_FN_NUM                           = 17
} NvtxExtMemCallbackId;
#ifdef __GNUC__
#pragma GCC visibility push(internal)
#endif
/* Extension types are required for the implementation and the NVTX handler. */
#define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot be included directly */
#include "nvtxDetail/nvtxExtTypes.h"
#undef NVTX_EXT_TYPES_GUARD
#ifndef NVTX_NO_IMPL
/* Ensure other headers cannot be included directly */
#define NVTX_EXT_IMPL_MEM_GUARD
#include "nvtxDetail/nvtxExtImplMem_v1.h"
#undef NVTX_EXT_IMPL_MEM_GUARD
#endif /*NVTX_NO_IMPL*/
#ifdef __GNUC__
#pragma GCC visibility pop
#endif
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif /* NVTOOLSEXTV3_MEM_V1 */
+150
Zobrazit soubor
@@ -0,0 +1,150 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTOOLSEXTV3_MEM_CUDART_V1
#define NVTOOLSEXTV3_MEM_CUDART_V1
#include "nvToolsExtMem.h"
#include "cuda.h"
#include "cuda_runtime.h"
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/** \brief The memory is from a CUDA runtime array.
*
* Relevant functions: cudaMallocArray, cudaMalloc3DArray
* Also cudaArray_t from other types such as cudaMipmappedArray_t
*
* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported
*
* nvtxMemHeapRegister receives a heapDesc of type cudaArray_t, because the description
* can be retrieved by tools through cudaArrayGetInfo().
* nvtxMemRegionRegisterEx receives a regionDesc of type nvtxMemCudaArrayRangeDesc_t
*/
#define NVTX_MEM_TYPE_CUDA_ARRAY 0x11
/** \brief Structure to describe a range of memory in a CUDA runtime array object
* (`cudaArray_t`).
*/
typedef struct nvtxMemCudaArrayRangeDesc_v1
{
uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
uint16_t structSize; /* Size of the structure. */
uint32_t reserved0;
/* Array the range refers to. */
cudaArray_t src;
/* Presumably the per-dimension start and size of the range; units are not
   visible in this header -- TODO confirm. */
size_t offset[3];
size_t extent[3];
} nvtxMemCudaArrayRangeDesc_v1;
typedef nvtxMemCudaArrayRangeDesc_v1 nvtxMemCudaArrayRangeDesc_t;
/** \brief The memory is from a CUDA driver API array (`CUarray`).
*
* Relevant functions: cuArrayCreate, cuArray3DCreate
* Also CUarray from other types such as CUmipmappedArray
*
* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported
*
* nvtxMemHeapRegister receives a heapDesc of type cudaArray_t because the description can be retrieved by tools through cudaArrayGetInfo()
* NOTE(review): the sentence above matches the CUDA runtime variant verbatim; for the driver
* API the counterpart is presumably CUarray / cuArrayGetDescriptor() -- confirm.
* nvtxMemRegionRegisterEx receives a regionDesc of type nvtxMemCuArrayRangeDesc_t
*/
#define NVTX_MEM_TYPE_CU_ARRAY 0x12
/** \brief Structure to describe a range of memory in a CUDA driver API array object
* (`CUarray`).
*/
typedef struct nvtxMemCuArrayRangeDesc_v1
{
uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
uint16_t structSize; /* Size of the structure. */
uint32_t reserved0;
/* Array the range refers to. */
CUarray src;
/* Presumably the per-dimension start and size of the range; units are not
   visible in this header -- TODO confirm. */
size_t offset[3];
size_t extent[3];
} nvtxMemCuArrayRangeDesc_v1;
typedef nvtxMemCuArrayRangeDesc_v1 nvtxMemCuArrayRangeDesc_t;
/* Reserving 0x2-0xF for more common types */
/**
* \brief Value for the `devicePeer` argument of nvtxMemCudaSetPeerAccess that selects
* all devices instead of a single device number.
*
* The expansion is parenthesized so the negative literal stays a single operand in any
* surrounding expression.
*/
#define NVTX_MEM_CUDA_PEER_ALL_DEVICES (-1)
/** \brief Get the permission object that represents the CUDA runtime device
* or CUDA driver context
*
* NOTE(review): this description is word-for-word the same as the one on
* nvtxMemCudaGetDeviceWidePermissions. Since this function takes no device argument, it
* presumably returns the process-wide CUDA permissions object instead -- confirm and
* adjust the text.
*
* This object will allow developers to adjust permissions applied to work executed
* on the GPU. It may be inherited or overridden by a permissions object bound
* with NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM, depending on the binding flags.
*
* Ex. change the peer-to-peer access permissions between devices in their entirety
* or punch through special holes
*
* By default, all memory that would naturally be accessible to a CUDA kernel is accessible,
* until modified otherwise by nvtxMemCudaSetPeerAccess or by changing regions.
*
* This object should also represent the CUDA driver API level context.
*
* \param domain NVTX domain handle.
* \return Handle of the permissions object.
*/
NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemCudaGetProcessWidePermissions(
nvtxDomainHandle_t domain);
/** \brief Get the permission object that represents the CUDA runtime device
* or CUDA driver context
*
* This object will allow developers to adjust permissions applied to work executed
* on the GPU. It may be inherited or overridden by a permissions object bound
* with NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM, depending on the binding flags.
*
* Ex. change the peer-to-peer access permissions between devices in their entirety
* or punch through special holes
*
* By default, all memory that would naturally be accessible to a CUDA kernel is accessible,
* until modified otherwise by nvtxMemCudaSetPeerAccess or by changing regions.
*
* This object should also represent the CUDA driver API level context.
*
* \param domain NVTX domain handle.
* \param device Device whose permissions object is requested (presumably a CUDA device
* number such as from cudaGetDevice() -- confirm).
* \return Handle of the permissions object.
*/
NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemCudaGetDeviceWidePermissions(
nvtxDomainHandle_t domain,
int device);
/** \brief Change the default behavior for all memory mapped in from a particular device.
*
* While typically all memory defaults to readable and writable, users may desire to limit
* access to reduced default permissions, such as read-only, on a per-device basis.
*
* Regions can be used to further override smaller windows of memory.
*
* devicePeer can be NVTX_MEM_CUDA_PEER_ALL_DEVICES
*
* \param domain NVTX domain handle.
* \param permissions Permissions object to modify.
* \param devicePeer Device number such as from cudaGetDevice(), or
* NVTX_MEM_CUDA_PEER_ALL_DEVICES.
* \param flags NVTX_MEM_PERMISSIONS_REGION_FLAGS_* value(s).
*/
NVTX_DECLSPEC void NVTX_API nvtxMemCudaSetPeerAccess(
nvtxDomainHandle_t domain,
nvtxMemPermissionsHandle_t permissions,
int devicePeer, /* device number such as from cudaGetDevice() or NVTX_MEM_CUDA_PEER_ALL_DEVICES */
uint32_t flags); /* NVTX_MEM_PERMISSIONS_REGION_FLAGS_* */
/** @} */ /*END defgroup*/
#ifdef __GNUC__
#pragma GCC visibility push(internal)
#endif
#ifndef NVTX_NO_IMPL
#define NVTX_EXT_IMPL_MEM_CUDART_GUARD /* Ensure other headers cannot be included directly */
#include "nvtxDetail/nvtxExtImplMemCudaRt_v1.h"
#undef NVTX_EXT_IMPL_MEM_CUDART_GUARD
#endif /*NVTX_NO_IMPL*/
#ifdef __GNUC__
#pragma GCC visibility pop
#endif
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif /* NVTOOLSEXTV3_MEM_CUDART_V1 */
+3 -3
Zobrazit soubor
@@ -30,11 +30,11 @@ extern "C" {
*/
/* ------------------------------------------------------------------------- */
/* \cond SHOW_HIDDEN
/* \cond SHOW_HIDDEN
* \brief Used to build a non-colliding value for resource types separated class
* \version \NVTX_VERSION_2
*/
#define NVTX_RESOURCE_CLASS_OPENCL 6
#define NVTX_RESOURCE_CLASS_OPENCL 6
/** \endcond */
/* ------------------------------------------------------------------------- */
@@ -183,7 +183,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name)
#endif /* __cplusplus */
#ifndef NVTX_NO_IMPL
#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot included directly */
#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot be included directly */
#include "nvtxDetail/nvtxImplOpenCL_v3.h"
#undef NVTX_IMPL_GUARD_OPENCL
#endif /*NVTX_NO_IMPL*/
Rozdílový obsah nebyl zobrazen, protože je příliš veliký Načíst rozdílové porovnání
+170
Zobrazit soubor
@@ -0,0 +1,170 @@
/*
* Copyright 2023 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "nvtxDetail/nvtxExtPayloadHelperInternal.h"
/*
 * This is just an empty marker (for readability), which can be omitted.
 * Because it expands to nothing, `NVTX_PAYLOAD_ENTRIES(a, b)` leaves just the
 * parenthesized list `(a, b)` for the consuming macro.
 */
/* TODO: Fix issue with trailing comma at end of entry list. */
#define NVTX_PAYLOAD_ENTRIES
/**
 * Use this macro for payload entries that are defined by a schema (nested
 * payload schema).
 *
 * @param schemaId ID of the schema describing the nested entry; forwarded
 *                 unchanged to the internal `_NVTX_PAYLOAD_NESTED` helper.
 */
#define NVTX_PAYLOAD_NESTED(schemaId) _NVTX_PAYLOAD_NESTED(schemaId)
/**
 * \brief Define a payload schema for an existing C `struct` definition.
 *
 * This macro does
 * 1) create schema description (array of schema entries).
 * 2) set the schema attributes for a static data layout.
 *
 * It can be used in static code or within a function context.
 *
 * Example:
 * NVTX_DEFINE_SCHEMA_FOR_STRUCT(your_struct, "SchemaName",
 * NVTX_PAYLOAD_ENTRIES(
 * (index, TYPE_INT, "integer value"),
 * (dpfloat, TYPE_DOUBLE, "fp64 value"),
 * (text, TYPE_CSTRING, "text", NULL, 24)
 * )
 * )
 *
 * It is required to at least provide the struct name and the payload entries.
 * The first two fields (member name and NVTX entry type) of each payload entry
 * are required.
 *
 * The optional parameters are positional: they may only be passed in the
 * predefined order listed below, so an optional parameter can only be given if
 * all optional parameters before it are given as well.
 * The payload entries are always the last parameter. A maximum of 16 schema
 * entries is supported.
 *
 * It is recommended to use `NVTX_PAYLOAD_SCHEMA_REGISTER` to register the schema.
 *
 * @param struct_id The name of the struct.
 * @param schema_name (Optional 1) name of the payload schema. Default is `NULL`.
 * @param prefix (Optional 2) prefix before the schema and attributes variables,
 * e.g. `static const`. Leave this empty, if no prefix is desired.
 * @param schema_flags (Optional 3) flags to augment the payload schema.
 * Default is `NVTX_PAYLOAD_SCHEMA_FLAG_NONE`.
 * @param schema_id (Optional 4) User-defined payload schema ID.
 * @param entries (Mandatory) Payload schema entries. This is always the last
 * parameter to the macro.
 */
#define NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, ...) \
_NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, __VA_ARGS__)
/**
 * \brief Define a C struct together with a matching schema.
 *
 * This macro does
 * 1) define the payload type (typedef struct).
 * 2) create schema description (array of schema entries).
 * 3) set the schema attributes for a static data layout.
 *
 * The macro can be used in static code or within a function context.
 *
 * It defines the schema attributes in `struct_id##Attr`. Thus, it is recommended
 * to use `NVTX_PAYLOAD_SCHEMA_REGISTER(domain, struct_id)` to register the schema.
 *
 * Example:
 * NVTX_DEFINE_STRUCT_WITH_SCHEMA(your_struct_name, "Your schema name",
 * NVTX_PAYLOAD_ENTRIES(
 * (int, index, TYPE_INT, "integer value"),
 * (double, dpfloat, TYPE_DOUBLE, "fp64 value"),
 * (const char, (text, 24), TYPE_CSTRING, "text", NULL, 24)
 * )
 * )
 *
 * The first three fields (C type, member, entry type) of each entry are required.
 * A fixed-size array or string requires a special notation: the member name and
 * the size, separated by a comma and enclosed in parentheses (see the last entry
 * in the example).
 *
 * The optional parameters are positional (only allowed to be passed in the
 * predefined order). A maximum of 16 schema entries is supported.
 *
 * @param struct_id The name of the struct.
 * @param schema_name (Optional 1) name of the payload schema. Default is `NULL`.
 * @param prefix (Optional 2) prefix before the schema and attributes variables,
 * e.g. `static const`. Leave this empty, if no prefix is desired.
 * @param schema_flags (Optional 3) flags to augment the payload schema.
 * Default is `NVTX_PAYLOAD_SCHEMA_FLAG_NONE`.
 * @param schema_id (Optional 4) User-defined payload schema ID.
 * @param entries (Mandatory) The schema entries. This is always the last
 * parameter to the macro.
 */
#define NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, ...) \
_NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, __VA_ARGS__)
/**
 * \brief Initialize and register the NVTX binary payload schema.
 *
 * This does essentially the same as `NVTX_DEFINE_STRUCT_WITH_SCHEMA`, but in
 * addition the schema is registered. The schema ID will be defined as follows:
 * `const uint64_t struct_id##_schemaId`.
 *
 * The expansion ends with a variable definition whose initializer is a call to
 * `nvtxPayloadSchemaRegister` and which already carries its terminating `;`.
 * NOTE(review): since the initializer is a function call, in C this presumably
 * restricts the macro to function scope -- confirm.
 *
 * @param domain The NVTX domain handle (0 for default domain).
 * All other parameters are similar to `NVTX_DEFINE_STRUCT_WITH_SCHEMA`.
 */
#define NVTX_DEFINE_STRUCT_WITH_SCHEMA_AND_REGISTER(domain, struct_id, ...) \
_NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, __VA_ARGS__) \
const uint64_t struct_id##_schemaId = nvtxPayloadSchemaRegister(domain, &struct_id##Attr);
/**
 * \brief Define payload schema for an existing `struct` and register the schema.
 *
 * This does essentially the same as `NVTX_DEFINE_SCHEMA_FOR_STRUCT`, but in
 * addition, the schema is registered and `const uint64_t struct_id##_schemaId`
 * is defined and set to the result of `nvtxPayloadSchemaRegister`.
 *
 * The expansion already carries its terminating `;`.
 *
 * @param domain The NVTX domain handle (0 for default domain).
 * All other parameters are similar to `NVTX_DEFINE_SCHEMA_FOR_STRUCT`.
 */
#define NVTX_DEFINE_SCHEMA_FOR_STRUCT_AND_REGISTER(domain, struct_id, ...) \
_NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, __VA_ARGS__) \
const uint64_t struct_id##_schemaId = nvtxPayloadSchemaRegister(domain, &struct_id##Attr);
/**
 * \brief Create a type definition for the given struct ID and members.
 *
 * This is a convenience macro. A normal `typedef` can be used instead.
 * All arguments are forwarded unchanged to the internal
 * `_NVTX_PAYLOAD_TYPEDEF_STRUCT` helper.
 *
 * Example usage:
 * NVTX_DEFINE_STRUCT(your_struct,
 * (double, fp64),
 * (uint8_t, u8),
 * (float, fp32[3])
 * )
 *
 * @param struct_id The name of the struct.
 * @param members The members of the struct, each given as a parenthesized
 * `(type, declarator)` pair as in the example above.
 */
#define NVTX_DEFINE_STRUCT(struct_id, ...) \
_NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, __VA_ARGS__)
/**
 * \brief Register an NVTX binary payload schema.
 *
 * This is a convenience macro, which takes the same `struct_id` that has been
 * used in other helper macros. Instead, `nvtxPayloadSchemaRegister` can also be
 * used, but `&struct_id##Attr` has to be passed.
 *
 * NOTE(review): the expansion ends with `;`, so the macro works as a statement
 * or as the right-hand side of an initialization, but not nested inside a
 * larger expression (e.g. as a function argument). Removing the semicolon would
 * break callers that rely on it, so it is left as is.
 *
 * @param domain The NVTX domain handle (0 for default domain).
 * @param struct_id The name of the struct.
 *
 * @return NVTX schema ID
 */
#define NVTX_PAYLOAD_SCHEMA_REGISTER(domain, struct_id) \
nvtxPayloadSchemaRegister(domain, &struct_id##Attr);
+88
Zobrazit soubor
@@ -0,0 +1,88 @@
/*
* Copyright 2024 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
/**
* NVTX semantic headers require nvToolsExtPayload.h to be included beforehand.
*/
#ifndef NVTX_SEMANTIC_ID_COUNTERS_V1
#define NVTX_SEMANTIC_ID_COUNTERS_V1 2
/**
 * Flags to extend the semantics of counters.
 */
#define NVTX_COUNTERS_FLAGS_NONE 0x0

/**
 * Convert the fixed point value to a normalized floating point value.
 * Whether the result is unsigned [0f : 1f] or signed [-1f : 1f] is determined
 * by the underlying type this flag is applied to.
 */
#define NVTX_COUNTERS_FLAG_NORMALIZE 0x2 /* bit 1 */

/**
 * Visual tools should apply scale and limits when graphing.
 */
#define NVTX_COUNTERS_FLAG_LIMIT_MIN 0x4 /* bit 2 */
#define NVTX_COUNTERS_FLAG_LIMIT_MAX 0x8 /* bit 3 */
#define NVTX_COUNTERS_FLAG_LIMITS 0xC /* LIMIT_MIN | LIMIT_MAX */

/**
 * Counter time scopes (values 1-4, stored starting at bit 5).
 */
#define NVTX_COUNTERS_FLAG_TIMESCOPE_POINT 0x20 /* 1 << 5 */
#define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_LAST 0x40 /* 2 << 5 */
#define NVTX_COUNTERS_FLAG_TIMESCOPE_UNTIL_NEXT 0x60 /* 3 << 5 */
#define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_START 0x80 /* 4 << 5 */

/**
 * Counter value types (values 1-2, stored starting at bit 10).
 */
#define NVTX_COUNTERS_FLAG_VALUETYPE_ABSOLUTE 0x400 /* 1 << 10 */
/** Delta to previous value of same counter type. */
#define NVTX_COUNTERS_FLAG_VALUETYPE_DELTA 0x800 /* 2 << 10 */

/**
 * Datatypes for the `limits` union.
 */
#define NVTX_COUNTERS_LIMIT_I64 0
#define NVTX_COUNTERS_LIMIT_U64 1
#define NVTX_COUNTERS_LIMIT_F64 2
/**
 *\brief Specify counter semantics.
 */
typedef struct nvtxSemanticsCounter_v1 {
/** Header of the semantic extensions (with identifier, version, etc.). */
struct nvtxSemanticsHeader_v1 header;
/** Flags to provide more context about the counter value (`NVTX_COUNTERS_FLAG_*`). */
uint64_t flags;
/** Unit of the counter value (case-insensitive). */
const char* unit;
/** Should be 1 if not used. */
uint64_t unitScaleNumerator;
/** Should be 1 if not used. */
uint64_t unitScaleDenominator;
/** Determines the used union member. Use defines `NVTX_COUNTERS_LIMIT_*`. */
int64_t limitType;
/**
 * Graph limits {minimum, maximum}.
 * Presumably `i64`, `u64` and `d` correspond to `NVTX_COUNTERS_LIMIT_I64`,
 * `NVTX_COUNTERS_LIMIT_U64` and `NVTX_COUNTERS_LIMIT_F64` respectively.
 */
union limits_t {
int64_t i64[2];
uint64_t u64[2];
double d[2];
} limits;
} nvtxSemanticsCounter_t;
#endif /* NVTX_SEMANTIC_ID_COUNTERS_V1 */
+30
Zobrazit soubor
@@ -0,0 +1,30 @@
/*
* Copyright 2024 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
/**
* NVTX semantic headers require nvToolsExtPayload.h to be included beforehand.
*/
#ifndef NVTX_SEMANTIC_ID_SCOPE_V1
#define NVTX_SEMANTIC_ID_SCOPE_V1 1
/**
 * \brief Specify the NVTX scope for a payload entry.
 *
 * This allows the scope to be set for a specific value or counter in a payload.
 * The scope must be known at schema registration time.
 */
typedef struct nvtxSemanticsScope_v1
{
/** Header of the semantic extension. */
struct nvtxSemanticsHeader_v1 header;
/** Specifies the scope of a payload entry, e.g. a counter or timestamp. */
uint64_t scopeId;
} nvtxSemanticsScope_t;
#endif /* NVTX_SEMANTIC_ID_SCOPE_V1 */
+13 -13
Zobrazit soubor
@@ -15,23 +15,23 @@
extern "C" {
#endif /* __cplusplus */
/* \cond SHOW_HIDDEN
/* \cond SHOW_HIDDEN
* \version \NVTX_VERSION_2
*/
#define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxSyncUserAttributes_v0) ) )
/** \endcond */
/**
/**
* \page PAGE_SYNCHRONIZATION Synchronization
*
* This section covers a subset of the API that allow users to track additional
* synchronization details of their application. Naming OS synchronization primitives
* may allow users to better understand the data collected by traced synchronization
* synchronization details of their application. Naming OS synchronization primitives
* may allow users to better understand the data collected by traced synchronization
* APIs. Additionally, a user defined synchronization object can allow the users to
* to tell the tools when the user is building their own synchronization system
* that do not rely on the OS to provide behaviors and instead use techniques like
* atomic operations and spinlocks.
* atomic operations and spinlocks.
*
* See module \ref SYNCHRONIZATION for details.
*
@@ -59,7 +59,7 @@ extern "C" {
*
* bool Lock() {
* nvtxDomainSyncUserAcquireStart(hSync);
* bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic
* bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic
* if (acquired) {
* nvtxDomainSyncUserAcquireSuccess(hSync);
@@ -76,12 +76,12 @@ extern "C" {
* }
* };
* \endcode
*
*
* \version \NVTX_VERSION_2
*/
/* ------------------------------------------------------------------------- */
/* \cond SHOW_HIDDEN
/* \cond SHOW_HIDDEN
* \brief Used to build a non-colliding value for resource types separated class
* \version \NVTX_VERSION_2
*/
@@ -154,8 +154,8 @@ typedef struct nvtxSyncUser* nvtxSyncUser_t;
/** \brief User Defined Synchronization Object Attributes Structure.
* \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE
*
* This structure is used to describe the attributes of a user defined synchronization
* object. The layout of the structure is defined by a specific version of the tools
* This structure is used to describe the attributes of a user defined synchronization
* object. The layout of the structure is defined by a specific version of the tools
* extension library and can change between different versions of the Tools Extension
* library.
*
@@ -259,7 +259,7 @@ typedef struct nvtxSyncUserAttributes_v0
typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
/* ------------------------------------------------------------------------- */
/** \brief Create a user defined synchronization object
/** \brief Create a user defined synchronization object
* This is used to track non-OS synchronization working with spinlocks and atomics
*
* \param domain - Domain to own the resource
@@ -317,7 +317,7 @@ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle
/* ------------------------------------------------------------------------- */
/** \brief Signal to tools of failure in acquiring a user defined synchronization object
* This should be called after \ref nvtxDomainSyncUserAcquireStart
*
*
* \param handle - A handle to the object to operate on.
*
* \sa
@@ -374,7 +374,7 @@ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle);
#endif /* __cplusplus */
#ifndef NVTX_NO_IMPL
#define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot included directly */
#define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot be included directly */
#include "nvtxDetail/nvtxImplSync_v3.h"
#undef NVTX_IMPL_GUARD_SYNC
#endif /*NVTX_NO_IMPL*/
+13 -8
Zobrazit soubor
@@ -12,6 +12,11 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
*
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
/* Temporary helper #defines, #undef'ed at end of header */
@@ -1937,9 +1942,9 @@ class event_attributes {
0, // color value
NVTX_PAYLOAD_UNKNOWN, // payload type
0, // reserved 4B
0, // payload value (union)
{0}, // payload value (union)
NVTX_MESSAGE_UNKNOWN, // message type
0 // message value (union)
{0} // message value (union)
}
{
}
@@ -2003,20 +2008,20 @@ class event_attributes {
attributes_.messageType = m.get_type();
}
/**
* @brief Variadic constructor where the first argument is a binary payload.
/**
* @brief Variadic constructor where the first argument is an extended payload.
*
* Sets the value of the `EventAttribute`s message based on `m` and forwards
* Sets the `ullValue` of the `EventAttribute`s payload and forwards
* the remaining variadic parameter pack to the next constructor.
*
*/
template <typename... Args>
NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(nvtxPayloadData_t const* bpl, Args const&... args) noexcept
NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(nvtxPayloadData_t const* p, Args const&... args) noexcept
: event_attributes(args...)
{
attributes_.payloadType = NVTX_PAYLOAD_TYPE_BINARY;
attributes_.payloadType = NVTX_PAYLOAD_TYPE_EXT;
attributes_.reserved0 = 1; // NCCL uses only a single binary payload per event.
attributes_.payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(bpl);
attributes_.payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p);
}
~event_attributes() = default;
+31
Zobrazit soubor
@@ -0,0 +1,31 @@
/*
* Copyright 2023 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_EXT_HELPER_MACROS_H
#define NVTX_EXT_HELPER_MACROS_H
/*
 * Combine tokens. The two-level definition makes sure that macro arguments are
 * fully expanded before they are pasted together.
 */
#define _NVTX_EXT_CONCAT(a, b) a##b
#define NVTX_EXT_CONCAT(a, b) _NVTX_EXT_CONCAT(a, b)
/*
 * Resolves to the number of arguments passed (1 to 15).
 * The arguments shift the descending number list so that the 16th position
 * holds the count. An empty argument list also yields 1.
 */
#define NVTX_EXT_NUM_ARGS(...) \
NVTX_EXT_SELECTA16(__VA_ARGS__, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, throwaway)
#define NVTX_EXT_SELECTA16(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, ...) a16
/*
 * Cast argument(s) to void to prevent unused variable warnings.
 * Each argument is parenthesized so that the cast applies to the whole
 * argument even if it is an expression. Every cast carries its own `;`, so the
 * expansion is a complete statement sequence.
 */
#define _NVTX_EXT_VOIDIFY1(a1) (void)(a1);
#define _NVTX_EXT_VOIDIFY2(a1, a2) (void)(a1); (void)(a2);
#define _NVTX_EXT_VOIDIFY3(a1, a2, a3) (void)(a1); (void)(a2); (void)(a3);
#define _NVTX_EXT_VOIDIFY4(a1, a2, a3, a4) (void)(a1); (void)(a2); (void)(a3); (void)(a4);
/*
 * Mark function arguments as unused. Supports 1 to 4 arguments (one
 * _NVTX_EXT_VOIDIFY<n> helper exists per supported count). Do not append a
 * semicolon at the call site; the expansion already ends with one.
 */
#define NVTX_EXT_HELPER_UNUSED_ARGS(...) \
NVTX_EXT_CONCAT(_NVTX_EXT_VOIDIFY, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
#endif /* NVTX_EXT_HELPER_MACROS_H */
@@ -14,7 +14,12 @@
#define NVTX_EXT_IMPL_H
/* ---- Include required platform headers ---- */
#if defined(_WIN32)
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <wchar.h>
#if defined(_WIN32)
#include <Windows.h>
@@ -22,27 +27,19 @@
#include <unistd.h>
#if defined(__ANDROID__)
#include <android/api-level.h>
#include <android/api-level.h>
#endif
#if defined(__linux__) || defined(__CYGWIN__)
#include <sched.h>
#endif
#include <sys/types.h>
#include <limits.h>
#include <dlfcn.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include <pthread.h>
#include <stdlib.h>
#include <wchar.h>
#endif
@@ -66,26 +63,35 @@
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
// #ifdef __GNUC__
// #pragma GCC visibility push(hidden)
// #endif
/*
#ifdef __GNUC__
#pragma GCC visibility push(hidden)
#endif
*/
#define NVTX_EXTENSION_FRESH 0
#define NVTX_EXTENSION_DISABLED 1
#define NVTX_EXTENSION_STARTING 2
#define NVTX_EXTENSION_LOADED 3
NVTX_LINKONCE_DEFINE_GLOBAL NvtxExtInitializeInjectionFunc_t NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = (NvtxExtInitializeInjectionFunc_t)0;
/* Function slots are local to each extension */
typedef struct nvtxExtGlobals1_t
{
NvtxExtInitializeInjectionFunc_t injectionFnPtr;
} nvtxExtGlobals1_t;
NVTX_LINKONCE_DEFINE_GLOBAL nvtxExtGlobals1_t NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1) =
{
(NvtxExtInitializeInjectionFunc_t)0
};
#define NVTX_EXT_INIT_GUARD
#include "nvtxExtInit.h"
#undef NVTX_EXT_INIT_GUARD
// #ifdef __GNUC__
// #pragma GCC visibility pop
// #endif
/*
#ifdef __GNUC__
#pragma GCC visibility pop
#endif
*/
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
+148
Zobrazit soubor
@@ -0,0 +1,148 @@
/*
* Copyright 2023-2024 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_EXT_IMPL_COUNTERS_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExtCounters.h (except when NVTX_NO_IMPL is defined).
#endif
#define NVTX_EXT_IMPL_GUARD
#include "nvtxExtImpl.h"
#undef NVTX_EXT_IMPL_GUARD
#ifndef NVTX_EXT_IMPL_COUNTERS_V1
#define NVTX_EXT_IMPL_COUNTERS_V1
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/*
 * Macros to create versioned symbols.
 *
 * NVTX_EXT_COUNTERS_VERSIONED_ID(NAME) yields NAME_v<NVTX_VERSION>_bpl<NVTX_EXT_COUNTERS_COMPATID>.
 * The _L2 level exists so that NVTX_VERSION and NVTX_EXT_COUNTERS_COMPATID are
 * macro-expanded before _L3 pastes them into the identifier.
 */
#define NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \
NAME##_v##VERSION##_bpl##COMPATID
#define NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \
NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID)
#define NVTX_EXT_COUNTERS_VERSIONED_ID(NAME) \
NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COUNTERS_COMPATID)
#ifdef NVTX_DISABLE
#include "nvtxExtHelperMacros.h"
/*
 * NVTX_DISABLE variant: defines `fn_name` as a stub that marks all of its
 * arguments as unused and returns -1 converted (via intptr_t) to `ret_val`.
 *
 * ret_val   - return type of the function
 * fn_name   - name of the function to define
 * signature - parenthesized parameter list, e.g. `(int a, int b)`
 * arg_names - parenthesized argument names, e.g. `(a, b)`
 *
 * NOTE(review): the stub always returns a value, so it presumably is meant for
 * non-void functions only -- confirm.
 */
#define NVTX_EXT_COUNTERS_IMPL_FN_V1(ret_val, fn_name, signature, arg_names) \
ret_val fn_name signature { \
NVTX_EXT_HELPER_UNUSED_ARGS arg_names \
return ((ret_val)(intptr_t)-1); \
}
#else /* NVTX_DISABLE */
/*
 * Function slots for the counters extension. First entry is the module state,
 * initialized to `0` (`NVTX_EXTENSION_FRESH`).
 *
 * The array therefore has NVTX_EXT_COUNTERS_SLOT_COUNT + 1 entries: index 0 is
 * the module state, followed by NVTX_EXT_COUNTERS_SLOT_COUNT function slots.
 * `= {0}` zero-initializes every entry.
 */
#define NVTX_EXT_COUNTERS_SLOT_COUNT 63
NVTX_LINKONCE_DEFINE_GLOBAL intptr_t
NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX_EXT_COUNTERS_SLOT_COUNT + 1]
= {0};
/* Avoid warnings about missing prototype. */
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)(void);
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)()
{
intptr_t* fnSlots = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots) + 1;
nvtxExtModuleSegment_t segment = {
0, /* unused (only one segment) */
NVTX_EXT_COUNTERS_SLOT_COUNT,
fnSlots
};
nvtxExtModuleInfo_t module = {
NVTX_VERSION, sizeof(nvtxExtModuleInfo_t),
NVTX_EXT_COUNTERS_MODULEID, NVTX_EXT_COUNTERS_COMPATID,
1, &segment, /* number of segments, segments */
NULL, /* no export function needed */
/* bake type sizes and alignment information into program binary */
NULL
};
NVTX_INFO( "%s\n", __FUNCTION__ );
NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module,
NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots));
}
/* Generates the public entry point `fn_name` of the counters extension.
   The function reads its slot from the slot table at index
   NVTX3EXT_CBID_<fn_name> + 1 (element 0 is the module state):
   - a slot that is neither NVTX_EXTENSION_DISABLED nor NVTX_EXTENSION_FRESH is
     called as a function pointer of type `fn_name##_impl_fntype`;
   - a fresh slot triggers nvtxExtCountersInitOnce, after which the slot is
     read again and called if it now holds a function;
   - otherwise NVTX_EXT_FN_RETURN_INVALID(ret_type) provides the result.
   NOTE: the void instantiations expand this macro while `return` is defined as
   an empty macro. The body therefore has to stay correct when every `return`
   keyword vanishes: each path performs at most one call and no statement
   depends on `return` to leave the function early. */
#define NVTX_EXT_COUNTERS_IMPL_FN_V1(ret_type, fn_name, signature, arg_names) \
typedef ret_type (*fn_name##_impl_fntype)signature; \
NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \
intptr_t slot = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
if (slot != NVTX_EXTENSION_DISABLED) { \
if (slot != NVTX_EXTENSION_FRESH) { \
return (*(fn_name##_impl_fntype)slot) arg_names; \
} else { \
NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)(); \
/* Re-read function slot after extension initialization. */ \
slot = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \
return (*(fn_name##_impl_fntype)slot) arg_names; \
} \
} \
} \
NVTX_EXT_FN_RETURN_INVALID(ret_type) \
}
#endif /*NVTX_DISABLE*/
/* Non-void functions. */
/* Fallback result when no implementation is attached: -1 converted to the
   function's return type. */
#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1);
NVTX_EXT_COUNTERS_IMPL_FN_V1(nvtxCountersHandle_t, nvtxCountersRegister,
(nvtxDomainHandle_t domain, const nvtxCountersAttr_t* attr),
(domain, attr))
#undef NVTX_EXT_FN_RETURN_INVALID
/* END: Non-void functions. */
/* void functions. */
/* void functions have no fallback value, so NVTX_EXT_FN_RETURN_INVALID expands
   to nothing. `return` is defined as an empty macro while they are generated,
   which turns every `return <call>;` of the implementation macro into a plain
   call statement; it is undefined again right after the last instantiation. */
#define NVTX_EXT_FN_RETURN_INVALID(rtype)
#define return
NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleInt64,
(nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, int64_t value),
(domain, hCounter, value))
NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleFloat64,
(nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, double value),
(domain, hCounter, value))
NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSample,
(nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, void* values, size_t size),
(domain, hCounter, values, size))
NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleNoValue,
(nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, uint8_t reason),
(domain, hCounter, reason))
NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSubmitBatch,
(nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounters,
const void* counters, size_t size), (domain, hCounters, counters, size))
NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSubmitBatchEx,
(nvtxDomainHandle_t domain, const nvtxCountersBatch_t* countersBatch),
(domain, countersBatch))
#undef return
#undef NVTX_EXT_FN_RETURN_INVALID
/* END: void functions. */
/* Keep NVTX_EXT_COUNTERS_IMPL_FN_V1 defined for a future version of this extension. */
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
#endif /* NVTX_EXT_IMPL_COUNTERS_V1 */
+74
Zobrazit soubor
@@ -0,0 +1,74 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_EXT_IMPL_MEM_CUDART_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExtMemCudaRt.h (except when NVTX_NO_IMPL is defined).
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#ifdef NVTX_DISABLE
#include "nvtxExtHelperMacros.h"
/* Stub used when NVTX_DISABLE is defined: `fn_name` is generated as a function
   that does no work and returns -1 converted to `ret_val`.
   NVTX_EXT_HELPER_UNUSED_ARGS comes from nvtxExtHelperMacros.h and is not
   visible here; judging by its name it marks the arguments as unused. */
#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \
ret_val fn_name signature { \
NVTX_EXT_HELPER_UNUSED_ARGS arg_names \
return ((ret_val)(intptr_t)-1); \
}
#else /* NVTX_DISABLE */
/* Generates the public entry point `fn_name` of the CUDA runtime part of the
   memory extension. It uses the memory extension's slot table (nvtxExtMemSlots)
   and initializer (nvtxExtMemInitOnce), addressed through
   NVTX_EXT_MEM_VERSIONED_ID, which this file does not define itself.
   The slot at index NVTX3EXT_CBID_<fn_name> + 1 is read:
   - a slot that is neither NVTX_EXTENSION_DISABLED nor NVTX_EXTENSION_FRESH is
     called as a function pointer of type `fn_name##_impl_fntype`;
   - a fresh slot triggers nvtxExtMemInitOnce, after which the slot is read
     again and called if it now holds a function;
   - otherwise NVTX_EXT_FN_RETURN_INVALID(ret_type) provides the result.
   NOTE: the void instantiations expand this macro while `return` is defined as
   an empty macro, so the body must stay correct when every `return` keyword
   vanishes (at most one call per path, no early exit through `return`). */
#define NVTX_EXT_FN_IMPL(ret_type, fn_name, signature, arg_names) \
typedef ret_type ( * fn_name##_impl_fntype )signature; \
NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \
intptr_t slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
if (slot != NVTX_EXTENSION_DISABLED) { \
if (slot != NVTX_EXTENSION_FRESH) { \
return (*(fn_name##_impl_fntype)slot) arg_names; \
} else { \
NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(); \
/* Re-read function slot after extension initialization. */ \
slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \
return (*(fn_name##_impl_fntype)slot) arg_names; \
} \
} \
} \
NVTX_EXT_FN_RETURN_INVALID(ret_type) \
}
#endif /*NVTX_DISABLE*/
/* Non-void functions. */
/* Fallback result when no implementation is attached: -1 converted to the
   function's return type. */
#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1);
NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetProcessWidePermissions, (nvtxDomainHandle_t domain), (domain))
NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetDeviceWidePermissions, (nvtxDomainHandle_t domain, int device), (domain, device))
#undef NVTX_EXT_FN_RETURN_INVALID
/* END: Non-void functions. */
/* void functions. */
/* void functions have no fallback value, so NVTX_EXT_FN_RETURN_INVALID expands
   to nothing. `return` is defined as an empty macro while they are generated,
   which turns every `return <call>;` of the implementation macro into a plain
   call statement; it is undefined again right after the last instantiation. */
#define NVTX_EXT_FN_RETURN_INVALID(rtype)
#define return
NVTX_EXT_FN_IMPL(void, nvtxMemCudaSetPeerAccess, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, int devicePeer, uint32_t flags), (domain, permissions, devicePeer, flags))
#undef return
#undef NVTX_EXT_FN_RETURN_INVALID
/* END: void functions. */
/* The generator macro is local to this file. */
#undef NVTX_EXT_FN_IMPL
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
+133
Zobrazit soubor
@@ -0,0 +1,133 @@
/*
* Copyright 2009-2020,2023 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_EXT_IMPL_MEM_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExtMem.h (except when NVTX_NO_IMPL is defined).
#endif
#define NVTX_EXT_IMPL_GUARD
#include "nvtxExtImpl.h"
#undef NVTX_EXT_IMPL_GUARD
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* Macros to create versioned symbols.
   NVTX_EXT_MEM_VERSIONED_ID(NAME) pastes NAME, `_v`, the expansion of
   NVTX_VERSION, `_mem` and the expansion of NVTX_EXT_COMPATID_MEM into a single
   identifier. The _L2 level exists because operands of `##` are not
   macro-expanded: it lets VERSION and COMPATID expand before _L3 pastes them. */
#define NVTXMEM_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) NAME##_v##VERSION##_mem##COMPATID
#define NVTXMEM_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) NVTXMEM_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID)
#define NVTX_EXT_MEM_VERSIONED_ID(NAME) NVTXMEM_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COMPATID_MEM)
#ifdef NVTX_DISABLE
#include "nvtxExtHelperMacros.h"
/* Stub used when NVTX_DISABLE is defined: `fn_name` is generated as a function
   that does no work and returns -1 converted to `ret_val`.
   NVTX_EXT_HELPER_UNUSED_ARGS comes from nvtxExtHelperMacros.h and is not
   visible here; judging by its name it marks the arguments as unused. */
#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \
ret_val fn_name signature { \
NVTX_EXT_HELPER_UNUSED_ARGS arg_names \
return ((ret_val)(intptr_t)-1); \
}
#else /* NVTX_DISABLE */
/*
 * Function slots for the memory extension. First entry is the module
 * state, initialized to `0` (`NVTX_EXTENSION_FRESH`).
 */
NVTX_LINKONCE_DEFINE_GLOBAL intptr_t
NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_MEM_FN_NUM + 2]
= {0};

/* Avoid warnings about missing prototype. */
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(void);

/*
 * Describe the memory extension as a module with a single segment that covers
 * NVTX3EXT_CBID_MEM_FN_NUM function slots, then pass that description and the
 * slot table (whose first element is the module state) to nvtxExtInitOnce.
 * Takes no arguments and returns nothing.
 */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(void)
{
    /* Function slots start right after the module state in element 0. */
    intptr_t* fnSlots = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots) + 1;
    nvtxExtModuleSegment_t segment = {
        0, /* unused (only one segment) */
        NVTX3EXT_CBID_MEM_FN_NUM,
        fnSlots
    };
    nvtxExtModuleInfo_t module = {
        NVTX_VERSION, sizeof(nvtxExtModuleInfo_t),
        NVTX_EXT_MODULEID_MEM, NVTX_EXT_COMPATID_MEM,
        1, &segment, /* number of segments, segments */
        NULL, /* no export function needed */
        NULL
    };
    NVTX_INFO( "%s\n", __FUNCTION__ );
    NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module,
        NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots));
}
/* Generates the public entry point `fn_name` of the memory extension.
   The function reads its slot from the slot table at index
   NVTX3EXT_CBID_<fn_name> + 1 (element 0 is the module state):
   - a slot that is neither NVTX_EXTENSION_DISABLED nor NVTX_EXTENSION_FRESH is
     called as a function pointer of type `fn_name##_impl_fntype`;
   - a fresh slot triggers nvtxExtMemInitOnce, after which the slot is read
     again and called if it now holds a function;
   - otherwise NVTX_EXT_FN_RETURN_INVALID(ret_type) provides the result.
   NOTE: the void instantiations expand this macro while `return` is defined as
   an empty macro. The body therefore has to stay correct when every `return`
   keyword vanishes: each path performs at most one call and no statement
   depends on `return` to leave the function early. */
#define NVTX_EXT_FN_IMPL(ret_type, fn_name, signature, arg_names) \
typedef ret_type ( * fn_name##_impl_fntype )signature; \
NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \
intptr_t slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
if (slot != NVTX_EXTENSION_DISABLED) { \
if (slot != NVTX_EXTENSION_FRESH) { \
return (*(fn_name##_impl_fntype)slot) arg_names; \
} else { \
NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(); \
/* Re-read function slot after extension initialization. */ \
slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \
return (*(fn_name##_impl_fntype)slot) arg_names; \
} \
} \
} \
NVTX_EXT_FN_RETURN_INVALID(ret_type) \
}
#endif /*NVTX_DISABLE*/
/* Non-void functions. */
/* Fallback result when no implementation is attached: -1 converted to the
   function's return type. */
#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1);
NVTX_EXT_FN_IMPL(nvtxMemHeapHandle_t, nvtxMemHeapRegister, (nvtxDomainHandle_t domain, nvtxMemHeapDesc_t const* desc), (domain, desc))
NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemPermissionsCreate, (nvtxDomainHandle_t domain, int32_t creationflags), (domain, creationflags))
#undef NVTX_EXT_FN_RETURN_INVALID
/* END: Non-void functions. */
/* void functions. */
/* void functions have no fallback value, so NVTX_EXT_FN_RETURN_INVALID expands
   to nothing. `return` is defined as an empty macro while they are generated,
   which turns every `return <call>;` of the implementation macro into a plain
   call statement; it is undefined again right after the last instantiation. */
#define NVTX_EXT_FN_RETURN_INVALID(rtype)
#define return
NVTX_EXT_FN_IMPL(void, nvtxMemHeapUnregister, (nvtxDomainHandle_t domain, nvtxMemHeapHandle_t heap), (domain, heap))
NVTX_EXT_FN_IMPL(void, nvtxMemHeapReset, (nvtxDomainHandle_t domain, nvtxMemHeapHandle_t heap), (domain, heap))
NVTX_EXT_FN_IMPL(void, nvtxMemRegionsRegister, (nvtxDomainHandle_t domain, nvtxMemRegionsRegisterBatch_t const* desc), (domain, desc))
NVTX_EXT_FN_IMPL(void, nvtxMemRegionsResize, (nvtxDomainHandle_t domain,nvtxMemRegionsResizeBatch_t const* desc), (domain, desc))
NVTX_EXT_FN_IMPL(void, nvtxMemRegionsUnregister, (nvtxDomainHandle_t domain,nvtxMemRegionsUnregisterBatch_t const* desc), (domain, desc))
NVTX_EXT_FN_IMPL(void, nvtxMemRegionsName, (nvtxDomainHandle_t domain,nvtxMemRegionsNameBatch_t const* desc), (domain, desc))
NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsAssign, (nvtxDomainHandle_t domain,nvtxMemPermissionsAssignBatch_t const* desc), (domain, desc))
NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsDestroy, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions), (domain, permissions))
NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsReset, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions), (domain, permissions))
NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsBind, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, uint32_t bindScope, uint32_t bindFlags), (domain, permissions, bindScope, bindFlags))
NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsUnbind, (nvtxDomainHandle_t domain, uint32_t bindScope), (domain, bindScope))
#undef return
#undef NVTX_EXT_FN_RETURN_INVALID
/* END: void functions. */
/* The generator macro is local to this file. */
#undef NVTX_EXT_FN_IMPL
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
+155
Zobrazit soubor
@@ -0,0 +1,155 @@
/*
* Copyright 2021-2023 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
#endif
#define NVTX_EXT_IMPL_GUARD
#include "nvtxExtImpl.h"
#undef NVTX_EXT_IMPL_GUARD
#ifndef NVTX_EXT_IMPL_PAYLOAD_V1
#define NVTX_EXT_IMPL_PAYLOAD_V1
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* Macros to create versioned symbols.
   NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) pastes NAME, `_v`, the expansion of
   NVTX_VERSION, `_bpl` and the expansion of NVTX_EXT_PAYLOAD_COMPATID into a
   single identifier. The _L2 level exists because operands of `##` are not
   macro-expanded: it lets VERSION and COMPATID expand before _L3 pastes them. */
#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \
NAME##_v##VERSION##_bpl##COMPATID
#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \
NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID)
#define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \
NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_PAYLOAD_COMPATID)
#ifdef NVTX_DISABLE
#include "nvtxExtHelperMacros.h"
/* Stub used when NVTX_DISABLE is defined: `fn_name` is generated as a function
   that does no work and returns -1 converted to `ret_val`.
   NVTX_EXT_HELPER_UNUSED_ARGS comes from nvtxExtHelperMacros.h and is not
   visible here; judging by its name it marks the arguments as unused. */
#define NVTX_EXT_PAYLOAD_IMPL_FN_V1(ret_val, fn_name, signature, arg_names) \
ret_val fn_name signature { \
NVTX_EXT_HELPER_UNUSED_ARGS arg_names \
return ((ret_val)(intptr_t)-1); \
}
#else /* NVTX_DISABLE */
#include "nvtxExtPayloadTypeInfo.h"
/*
 * Slot table of the payload extension. Element 0 is the module state and
 * starts out as `0` (`NVTX_EXTENSION_FRESH`); the function slots follow it.
 */
#define NVTX_EXT_PAYLOAD_SLOT_COUNT 63
NVTX_LINKONCE_DEFINE_GLOBAL intptr_t
    NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX_EXT_PAYLOAD_SLOT_COUNT + 1] = {0};

/* Prototype first, so that the definition does not warn about a missing prototype. */
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(void);

/*
 * Describe the payload extension as a module with a single segment that
 * covers the function slots, then pass that description and the slot table
 * (whose first element is the module state) to nvtxExtInitOnce.
 */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)()
{
    intptr_t* const moduleState = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots);

    nvtxExtModuleSegment_t slotSegment = {
        0, /* unused (only one segment) */
        NVTX_EXT_PAYLOAD_SLOT_COUNT,
        moduleState + 1 /* function slots start right after the module state */
    };

    nvtxExtModuleInfo_t moduleInfo = {
        NVTX_VERSION, sizeof(nvtxExtModuleInfo_t),
        NVTX_EXT_PAYLOAD_MODULEID, NVTX_EXT_PAYLOAD_COMPATID,
        1, &slotSegment, /* number of segments, segments */
        NULL, /* no export function needed */
        /* bake type sizes and alignment information into program binary */
        &(NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo))
    };

    NVTX_INFO( "%s\n", __FUNCTION__ );

    NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&moduleInfo, moduleState);
}
/* Generates the public entry point `fn_name` of the payload extension.
   The function reads its slot from the slot table at index
   NVTX3EXT_CBID_<fn_name> + 1 (element 0 is the module state):
   - a slot that is neither NVTX_EXTENSION_DISABLED nor NVTX_EXTENSION_FRESH is
     called as a function pointer of type `fn_name##_impl_fntype`;
   - a fresh slot triggers nvtxExtPayloadInitOnce, after which the slot is
     read again and called if it now holds a function;
   - otherwise NVTX_EXT_FN_RETURN_INVALID(ret_type) provides the result.
   NOTE: the void instantiations expand this macro while `return` is defined as
   an empty macro. The body therefore has to stay correct when every `return`
   keyword vanishes: each path performs at most one call and no statement
   depends on `return` to leave the function early. */
#define NVTX_EXT_PAYLOAD_IMPL_FN_V1(ret_type, fn_name, signature, arg_names) \
typedef ret_type (*fn_name##_impl_fntype)signature; \
NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \
intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
if (slot != NVTX_EXTENSION_DISABLED) { \
if (slot != NVTX_EXTENSION_FRESH) { \
return (*(fn_name##_impl_fntype)slot) arg_names; \
} else { \
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \
/* Re-read function slot after extension initialization. */ \
slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \
return (*(fn_name##_impl_fntype)slot) arg_names; \
} \
} \
} \
NVTX_EXT_FN_RETURN_INVALID(ret_type) \
}
#endif /*NVTX_DISABLE*/
/* Non-void functions. */
/* Fallback result when no implementation is attached: -1 converted to the
   function's return type. */
#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1);
NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxPayloadSchemaRegister,
(nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr),
(domain, attr))
NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxPayloadEnumRegister,
(nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr),
(domain, attr))
NVTX_EXT_PAYLOAD_IMPL_FN_V1(int, nvtxRangePushPayload,
(nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count),
(domain, payloadData, count))
NVTX_EXT_PAYLOAD_IMPL_FN_V1(int, nvtxRangePopPayload,
(nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count),
(domain, payloadData, count))
NVTX_EXT_PAYLOAD_IMPL_FN_V1(nvtxRangeId_t, nvtxRangeStartPayload,
(nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count),
(domain, payloadData, count))
NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint8_t, nvtxDomainIsEnabled, (nvtxDomainHandle_t domain), (domain))
NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxScopeRegister, (nvtxDomainHandle_t domain,
const nvtxScopeAttr_t* attr), (domain, attr))
#undef NVTX_EXT_FN_RETURN_INVALID
/* END: Non-void functions. */
/* void functions. */
/* void functions have no fallback value, so NVTX_EXT_FN_RETURN_INVALID expands
   to nothing. `return` is defined as an empty macro while they are generated,
   which turns every `return <call>;` of the implementation macro into a plain
   call statement; it is undefined again right after the last instantiation. */
#define NVTX_EXT_FN_RETURN_INVALID(rtype)
#define return
NVTX_EXT_PAYLOAD_IMPL_FN_V1(void, nvtxMarkPayload, (nvtxDomainHandle_t domain,
const nvtxPayloadData_t* payloadData, size_t count), (domain, payloadData, count))
NVTX_EXT_PAYLOAD_IMPL_FN_V1(void, nvtxRangeEndPayload, (nvtxDomainHandle_t domain,
nvtxRangeId_t id, const nvtxPayloadData_t* payloadData, size_t count),
(domain, id, payloadData, count))
#undef return
#undef NVTX_EXT_FN_RETURN_INVALID
/* END: void functions. */
/* Keep NVTX_EXT_PAYLOAD_IMPL_FN_V1 defined for a future version of this extension. */
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
#endif /* NVTX_EXT_IMPL_PAYLOAD_V1 */
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
* Copyright 2009-2023 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
@@ -22,7 +22,7 @@ extern "C" {
#define NVTX_PATHCHAR wchar_t
#define NVTX_STR(x) L##x
#define NVTX_GETENV _wgetenv
#define NVTX_BUFSIZE MAX_PATH
#define NVTX_BUFSIZE 16384
#define NVTX_DLLHANDLE HMODULE
#define NVTX_DLLOPEN(x) LoadLibraryW(x)
#define NVTX_DLLFUNC GetProcAddress
@@ -39,14 +39,14 @@ extern "C" {
#define NVTX_PATHCHAR char
#define NVTX_STR(x) x
#define NVTX_GETENV getenv
#define NVTX_BUFSIZE PATH_MAX
#define NVTX_BUFSIZE 16384
#define NVTX_DLLHANDLE void*
#define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
#define NVTX_DLLFUNC dlsym
#define NVTX_DLLCLOSE dlclose
#define NVTX_YIELD() sched_yield()
#define NVTX_MEMBAR() __sync_synchronize()
/* Ensure full memory barrier for atomics, to match Windows functions */
/* Ensure full memory barrier for atomics, to match Windows functions. */
#define NVTX_ATOMIC_WRITE_32(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value)
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand)
#define NVTX_ATOMIC_WRITE_PTR(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value)
@@ -63,7 +63,7 @@ extern "C" {
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
#endif
/* Define this to 1 for platforms that support environment variables */
/* Define this to 1 for platforms that support environment variables. */
/* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */
/* Try: #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */
#define NVTX_SUPPORT_ENV_VARS 1
@@ -72,16 +72,16 @@ extern "C" {
#define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1
/* Injection libraries implementing InitializeInjectionNvtxExtension may be statically linked,
* and this will override any dynamic injection. Useful for platforms where dynamic
* injection is not available. Since weak symbols not explicitly marked extern are
* guaranteed to be initialized to zero if no definitions are found by the linker, the
* dynamic injection process proceeds normally if pfnInitializeInjectionNvtx2 is 0. */
* which will override any dynamic injection. This is useful for platforms, where dynamic
* injection is not available. Since weak symbols, not explicitly marked extern, are
* guaranteed to be initialized to zero, if no definitions are found by the linker, the
* dynamic injection process proceeds normally, if InitializeInjectionNvtxExtension_fnptr is 0. */
#if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__)
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1
/* To statically inject an NVTX library, define InitializeInjectionNvtxExtension_fnptr as a normal
* symbol (not weak) pointing to the implementation of InitializeInjectionNvtxExtension (which
* does not need to be named "InitializeInjectionNvtxExtension" as is necessary in a dynamic
* injection library. */
* symbol (not weak) pointing to the implementation of InitializeInjectionNvtxExtension, which
* does not need to be named "InitializeInjectionNvtxExtension" as it is necessary in a dynamic
* injection library. */
__attribute__((weak)) NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxExtension_fnptr;
#else
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0
@@ -89,35 +89,37 @@ __attribute__((weak)) NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxEx
/* This function tries to find or load an NVTX injection library and get the
* address of its InitializeInjectionExtension function. If such a function pointer
* is found, it is called, and passed the address of this NVTX instance's
* nvtxGetExportTable function, so the injection can attach to this instance.
* If the initialization fails for any reason, any dynamic library loaded will
* be freed, and all NVTX implementation functions will be set to no-ops. If
* initialization succeeds, NVTX functions not attached to the tool will be set
* to no-ops. This is implemented as one function instead of several small
* functions to minimize the number of weak symbols the linker must resolve.
* Order of search is:
* - Pre-injected library exporting InitializeInjectionNvtxExtension
* - Loadable library exporting InitializeInjectionNvtxExtension
* - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64)
* - On Android, libNvtxInjection??.so within the package (?? is 32 or 64)
* - Statically-linked injection library defining InitializeInjectionNvtx2_fnptr
*/
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr);
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr)
/* This function tries to find or load an NVTX injection library and get the address of its
* `InitializeInjectionNvtxExtension` function. If such a function pointer is found, it is called and
* passed the address of this NVTX instance's `nvtxGetExportTable` function, so that the injection
* can attach to this instance.
* If the initialization fails for any reason, any dynamic library loaded will be freed, and all
* NVTX implementation functions will be set to no-ops. If the initialization succeeds, NVTX
* functions that are not attached to the tool will be set to no-ops. This is implemented as one
* function instead of several small functions to minimize the number of weak symbols the linker
* must resolve. The order of search is:
* 1) Pre-injected library exporting InitializeInjectionNvtxExtension
* 2) Loadable library exporting InitializeInjectionNvtxExtension
* - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64)
* - On Android, libNvtxInjection??.so within the package (?? is 32 or 64)
* 3) Statically-linked injection library defining InitializeInjectionNvtxExtension_fnptr
*/
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(
NvtxExtInitializeInjectionFunc_t* out_init_fnptr);
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(
NvtxExtInitializeInjectionFunc_t* out_init_fnptr)
{
const char* const initFuncName = "InitializeInjectionNvtxExtension";
NvtxExtInitializeInjectionFunc_t init_fnptr = (NvtxExtInitializeInjectionFunc_t)0;
NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0;
if(out_init_fnptr){
if (out_init_fnptr)
{
*out_init_fnptr = (NvtxExtInitializeInjectionFunc_t)0;
}
#if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY
/* Use POSIX global symbol chain to query for init function from any module */
/* Use POSIX global symbol chain to query for init function from any module. */
init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(0, initFuncName);
#endif
@@ -127,7 +129,7 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
{
#if NVTX_SUPPORT_ENV_VARS
/* If env var NVTX_INJECTION64_PATH is set, it should contain the path
* to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */
to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */
const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4)
? NVTX_STR("NVTX_INJECTION32_PATH")
: NVTX_STR("NVTX_INJECTION64_PATH");
@@ -135,12 +137,12 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE];
const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0;
/* Refer to this variable explicitly in case all references to it are #if'ed out */
/* Refer to this variable explicitly in case all references to it are #if'ed out. */
(void)injectionLibraryPathBuf;
#if NVTX_SUPPORT_ENV_VARS
/* Disable the warning for getenv & _wgetenv -- this usage is safe because
* these functions are not called again before using the returned value. */
these functions are not called again before using the returned value. */
#if defined(_MSC_VER)
#pragma warning( push )
#pragma warning( disable : 4996 )
@@ -188,7 +190,8 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
pkgName[bytesRead] = 0;
/* String can contain colon as a process separator. In this case the package name is before the colon. */
/* String can contain colon as a process separator. In this case the
package name is before the colon. */
pos = 0;
while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0')
{
@@ -223,8 +226,8 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
}
#endif
/* At this point, injectionLibraryPath is specified if a dynamic
* injection library was specified by a tool. */
/* At this point, `injectionLibraryPath` is specified if a dynamic
injection library was specified by a tool. */
if (injectionLibraryPath)
{
/* Load the injection library */
@@ -236,7 +239,7 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
}
else
{
/* Attempt to get the injection library's entry-point */
/* Attempt to get the injection library's entry-point. */
init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName);
if (!init_fnptr)
{
@@ -252,8 +255,8 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
#if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY
if (!init_fnptr)
{
/* Check weakly-defined function pointer. A statically-linked injection can define this as
* a normal symbol and it will take precedence over a dynamic injection. */
/* Check weakly-defined function pointer. A statically-linked injection can define
this as a normal symbol and it will take precedence over a dynamic injection. */
if (InitializeInjectionNvtxExtension_fnptr)
{
init_fnptr = InitializeInjectionNvtxExtension_fnptr;
@@ -261,13 +264,13 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
}
#endif
if(out_init_fnptr){
if (out_init_fnptr)
{
*out_init_fnptr = init_fnptr;
}
/* At this point, if init_fnptr is not set, then no tool has specified
* an NVTX injection library -- return non-success result so all NVTX
* API functions will be set to no-ops. */
/* At this point, if `init_fnptr` is not set, no tool has specified an NVTX injection library.
Non-success result is returned, so that all NVTX API functions will be set to no-ops. */
if (!init_fnptr)
{
return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE;
@@ -276,16 +279,19 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
return NVTX_SUCCESS;
}
/* Avoid warnings about missing prototypes. */
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) (
nvtxExtModuleInfo_t* moduleInfo, intptr_t* moduleState);
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) (
nvtxExtModuleInfo_t* moduleInfo,
intptr_t* moduleState
)
nvtxExtModuleInfo_t* moduleInfo, intptr_t* moduleState)
{
intptr_t old;
NVTX_INFO( "%s\n", __FUNCTION__ );
if( *moduleState == NVTX_EXTENSION_LOADED) {
if (*moduleState == NVTX_EXTENSION_LOADED)
{
NVTX_INFO("Module loaded\n");
return;
}
@@ -296,45 +302,55 @@ NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) (
NVTX_EXTENSION_FRESH);
if (old == NVTX_EXTENSION_FRESH)
{
NvtxExtInitializeInjectionFunc_t init_fnptr = NVTX_VERSIONED_IDENTIFIER(injectionFnPtr);
NvtxExtInitializeInjectionFunc_t init_fnptr =
NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1).injectionFnPtr;
int entryPointStatus = 0;
int forceAllToNoops = 0;
size_t s;
/* Load & initialize injection library -- it will assign the function pointers */
if(init_fnptr == 0){
/* Load and initialize injection library, which will assign the function pointers. */
if (init_fnptr == 0)
{
int result = 0;
/* try to load vanilla NVTX first*/
/* Try to load vanilla NVTX first. */
nvtxInitialize(0);
result = NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(&init_fnptr);
/*at this point init_fnptr will be either 0 or a real function*/
/* At this point `init_fnptr` will be either 0 or a real function. */
if(result == NVTX_SUCCESS) {
NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = init_fnptr;
if (result == NVTX_SUCCESS)
{
NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1).injectionFnPtr = init_fnptr;
}
else {
else
{
NVTX_ERR("Failed to load injection library\n");
}
}
if(init_fnptr != 0) {
/* Invoke injection library's initialization function. If it returns
* 0 (failure) and a dynamic injection was loaded, unload it. */
if (init_fnptr != 0)
{
/* Invoke injection library's initialization function. If it returns
0 (failure) and a dynamic injection was loaded, unload it. */
entryPointStatus = init_fnptr(moduleInfo);
if (entryPointStatus == 0) {
if (entryPointStatus == 0)
{
NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n");
}
}
/* Clean up any functions that are still uninitialized so that they are skipped.
* Set all to null if injection init function failed as well.
*/
/* Clean up any functions that are still uninitialized so that they are
skipped. Set all to null if injection init function failed as well. */
forceAllToNoops = (init_fnptr == 0) || (entryPointStatus == 0);
for(size_t s = 0; s < moduleInfo->segmentsCount; ++s){
nvtxExtModuleSegment_t* segment = moduleInfo->segments+s;
for(size_t i = 0; i < segment->slotCount; ++i){
if(forceAllToNoops || (segment->functionSlots[i] == NVTX_EXTENSION_FRESH)){
for (s = 0; s < moduleInfo->segmentsCount; ++s)
{
nvtxExtModuleSegment_t* segment = moduleInfo->segments + s;
size_t i;
for (i = 0; i < segment->slotCount; ++i)
{
if (forceAllToNoops || (segment->functionSlots[i] == NVTX_EXTENSION_FRESH))
{
segment->functionSlots[i] = NVTX_EXTENSION_DISABLED;
}
}
@@ -342,12 +358,11 @@ NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) (
NVTX_MEMBAR();
/* Signal that initialization has finished, so now the assigned function pointers will be used */
NVTX_ATOMIC_WRITE_PTR(
moduleState,
NVTX_EXTENSION_LOADED);
/* Signal that initialization has finished and the assigned function
pointers will be used. */
NVTX_ATOMIC_WRITE_PTR(moduleState, NVTX_EXTENSION_LOADED);
}
else /* Spin-wait until initialization has finished */
else /* Spin-wait until initialization has finished. */
{
NVTX_MEMBAR();
while (*moduleState != NVTX_EXTENSION_LOADED)
+272
Zobrazit soubor
@@ -0,0 +1,272 @@
/*
* Copyright 2023 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H
#define NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H
/* General helper macros */
#include "nvtxExtHelperMacros.h"
/* Get variable name with line number (almost unique per file): `nvtxDFDB`
   combined with `__LINE__` through NVTX_EXT_CONCAT (from nvtxExtHelperMacros.h). */
#define _NVTX_PAYLOAD_DATA_VAR NVTX_EXT_CONCAT(nvtxDFDB,__LINE__)
/* Create real arguments from just pasting tokens next to each other: the
   macro expands to exactly the arguments it was given. */
#define _NVTX_PAYLOAD_PASS_THROUGH(...) __VA_ARGS__
/* Avoid prefixing `NVTX_PAYLOAD_ENTRY_` for nested payloads.
   `_NVTX_PAYLOAD_NESTED(id)` expands to `THROWAWAY id`. When that text is
   pasted after `NVTX_PAYLOAD_ENTRY_`, the result is
   `NVTX_PAYLOAD_ENTRY_THROWAWAY id`; NVTX_PAYLOAD_ENTRY_THROWAWAY is defined
   empty, so only `id` remains. */
#define NVTX_PAYLOAD_ENTRY_THROWAWAY
#define _NVTX_PAYLOAD_NESTED(id) THROWAWAY id
/*
* Create the NVTX binary payloads schema attributes.
*
* Defines a variable `<struct_id>Attr` of type `nvtxPayloadSchemaAttr_t`. The
* expansion references `<struct_id>Schema` as the entry array and uses
* `sizeof(struct_id)` as the static payload size. It already ends with a
* semicolon.
*
* @param struct_id The name of the struct.
* @param schema_name The name of the schema.
* @param schema_flags Additional schema flags
* @param schema_id The value stored in the `schemaId` field.
* @param mask_add Fields to be added to the mask. The macro inserts no
*                 operator after this argument, so it has to be either empty
*                 or end with a trailing `|`.
* @param num_entries The number of schema entries.
*/
#define NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, mask_add, num_entries) \
nvtxPayloadSchemaAttr_t struct_id##Attr = { \
/*.fieldMask = */NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | mask_add \
NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | \
NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | \
NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE, \
/*.name = */schema_name, \
/*.type = */NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, \
/*.flags = */schema_flags, \
/*.entries = */struct_id##Schema, /*.numEntries = */num_entries, \
/*.payloadStaticSize = */sizeof(struct_id), \
/*.packAlign = */0, /*.schemaId = */schema_id};
/*****************************************************************/
/*** Helper for `NVTX_DEFINE_SCHEMA_FOR_STRUCT[_AND_REGISTER]` ***/
/* First part of schema entry for different number of arguments. */
#define _NVTX_PAYLOAD_SCHEMA_EF2(member, etype) \
0, NVTX_PAYLOAD_ENTRY_##etype, NULL, NULL, 0,
#define _NVTX_PAYLOAD_SCHEMA_EF3(member, etype, name) \
0, NVTX_PAYLOAD_ENTRY_##etype, name, NULL, 0,
#define _NVTX_PAYLOAD_SCHEMA_EF4(member, etype, name, desc) \
0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, 0,
#define _NVTX_PAYLOAD_SCHEMA_EF5(member, etype, name, desc, arraylen) \
0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen,
#define _NVTX_PAYLOAD_SCHEMA_EF6(member, etype, name, desc, arraylen, flags) \
NVTX_PAYLOAD_ENTRY_FLAG_##flags, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen,
#define _NVTX_PAYLOAD_SCHEMA_ENTRY_FRONT(...) \
NVTX_EXT_CONCAT(_NVTX_PAYLOAD_SCHEMA_EF, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
/* Second part of schema entry (append struct member).
At least two arguments are passed (`member` and `etype`). */
#define _NVTX_PAYLOAD_SCHEMA_ENTRY_END(member, ...) member
/* Resolve to schema entry. `entry` is `(member, etype, ...)`. */
#define _NVTX_PAYLOAD_SCHEMA_ENTRY(struct_id, entry) \
{_NVTX_PAYLOAD_SCHEMA_ENTRY_FRONT entry \
offsetof(struct_id, _NVTX_PAYLOAD_SCHEMA_ENTRY_END entry)},
/* Handle up to 16 schema entries. */
#define _NVTX_PAYLOAD_SME1(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1)
#define _NVTX_PAYLOAD_SME2(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME1(s,__VA_ARGS__)
#define _NVTX_PAYLOAD_SME3(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME2(s,__VA_ARGS__)
#define _NVTX_PAYLOAD_SME4(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME3(s,__VA_ARGS__)
#define _NVTX_PAYLOAD_SME5(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME4(s,__VA_ARGS__)
#define _NVTX_PAYLOAD_SME6(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME5(s,__VA_ARGS__)
#define _NVTX_PAYLOAD_SME7(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME6(s,__VA_ARGS__)
#define _NVTX_PAYLOAD_SME8(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME7(s,__VA_ARGS__)
#define _NVTX_PAYLOAD_SME9(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME8(s,__VA_ARGS__)
#define _NVTX_PAYLOAD_SME10(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME9(s,__VA_ARGS__)
#define _NVTX_PAYLOAD_SME11(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME10(s,__VA_ARGS__)
#define _NVTX_PAYLOAD_SME12(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME11(s,__VA_ARGS__)
#define _NVTX_PAYLOAD_SME13(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME12(s,__VA_ARGS__)
#define _NVTX_PAYLOAD_SME14(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME13(s,__VA_ARGS__)
#define _NVTX_PAYLOAD_SME15(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME14(s,__VA_ARGS__)
#define _NVTX_PAYLOAD_SME16(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME15(s,__VA_ARGS__)
#define _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, ...) \
nvtxPayloadSchemaEntry_t struct_id##Schema[] = { \
NVTX_EXT_CONCAT(_NVTX_PAYLOAD_SME, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(struct_id, __VA_ARGS__) \
{0, 0} \
};
/*
* Handle optional parameters for `NVTX_DEFINE_SCHEMA_FOR_STRUCT[_AND_REGISTER]`.
*/
#define _NVTX_DEFINE_S4S_6(struct_id, schema_name, prefix, schema_flags, schema_id, entries) \
prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, \
NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID |,\
NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
#define _NVTX_DEFINE_S4S_5(struct_id, schema_name, prefix, schema_flags, entries) \
prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, 0, \
NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS |, \
NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
#define _NVTX_DEFINE_S4S_4(struct_id, schema_name, prefix, entries) \
prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, \
NVTX_PAYLOAD_SCHEMA_ATTR_NAME |, \
NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
#define _NVTX_DEFINE_S4S_3(struct_id, schema_name, entries) \
_NVTX_DEFINE_S4S_4(struct_id, schema_name, /*prefix*/, entries)
#define _NVTX_DEFINE_S4S_2(struct_id, entries) \
_NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, NULL, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, ,\
NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
#define _NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, ...) \
NVTX_EXT_CONCAT(_NVTX_DEFINE_S4S_, \
NVTX_EXT_NUM_ARGS(struct_id, __VA_ARGS__))(struct_id, __VA_ARGS__)
/*** END: Helper for `NVTX_DEFINE_SCHEMA_FOR_STRUCT[_AND_REGISTER]` ***/
/******************************************************************/
/*** Helper for `NVTX_DEFINE_STRUCT_WITH_SCHEMA[_AND_REGISTER]` ***/
/* Extract struct member for fixed-size arrays. */
#define _NVTX_PAYLOAD_STRUCT_ARR_MEM1(name) name
#define _NVTX_PAYLOAD_STRUCT_ARR_MEM2(name, count) name[count]
/* Extract type and member name and handle special case of fixed-size array. */
#define _NVTX_PAYLOAD_STRUCT_E2(type, member) type member;
#define _NVTX_PAYLOAD_STRUCT_E3(type, member, etype) type member;
#define _NVTX_PAYLOAD_STRUCT_E4(type, member, etype, name) type member;
#define _NVTX_PAYLOAD_STRUCT_E5(type, member, etype, name, desc) type member;
#define _NVTX_PAYLOAD_STRUCT_E6(type, member, etype, name, desc, arraylen) \
type NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT_ARR_MEM, NVTX_EXT_NUM_ARGS member) member;
#define _NVTX_PAYLOAD_STRUCT_E7(type, member, etype, name, desc, arraylen, flags) \
_NVTX_PAYLOAD_STRUCT_E6(type, member, etype, name, desc, arraylen)
/* Handle different number of arguments per struct entry. */
#define _NVTX_PAYLOAD_STRUCT_ENTRY_(...) \
NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT_E, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
/* Handle up to 16 struct members. */
#define _NVTX_PAYLOAD_STRUCT_ENTRY(entry) _NVTX_PAYLOAD_STRUCT_ENTRY_ entry
#define _NVTX_PAYLOAD_STRUCT1(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1)
#define _NVTX_PAYLOAD_STRUCT2(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT1(__VA_ARGS__)
#define _NVTX_PAYLOAD_STRUCT3(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT2(__VA_ARGS__)
#define _NVTX_PAYLOAD_STRUCT4(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT3(__VA_ARGS__)
#define _NVTX_PAYLOAD_STRUCT5(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT4(__VA_ARGS__)
#define _NVTX_PAYLOAD_STRUCT6(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT5(__VA_ARGS__)
#define _NVTX_PAYLOAD_STRUCT7(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT6(__VA_ARGS__)
#define _NVTX_PAYLOAD_STRUCT8(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT7(__VA_ARGS__)
#define _NVTX_PAYLOAD_STRUCT9(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT8(__VA_ARGS__)
#define _NVTX_PAYLOAD_STRUCT10(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT9(__VA_ARGS__)
#define _NVTX_PAYLOAD_STRUCT11(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT10(__VA_ARGS__)
#define _NVTX_PAYLOAD_STRUCT12(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT11(__VA_ARGS__)
#define _NVTX_PAYLOAD_STRUCT13(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT12(__VA_ARGS__)
#define _NVTX_PAYLOAD_STRUCT14(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT13(__VA_ARGS__)
#define _NVTX_PAYLOAD_STRUCT15(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT14(__VA_ARGS__)
#define _NVTX_PAYLOAD_STRUCT16(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT15(__VA_ARGS__)
/* Generate the typedef. */
#define _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, ...) \
typedef struct { \
NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) \
} struct_id;
/* Generate first part of the schema entry. */
#define _NVTX_PAYLOAD_INIT_SCHEMA_N3(type, memberId, etype) \
0, NVTX_PAYLOAD_ENTRY_##etype, NULL, NULL, 0,
#define _NVTX_PAYLOAD_INIT_SCHEMA_N4(type, memberId, etype, name) \
0, NVTX_PAYLOAD_ENTRY_##etype, name, NULL, 0,
#define _NVTX_PAYLOAD_INIT_SCHEMA_N5(type, memberId, etype, name, desc) \
0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, 0,
#define _NVTX_PAYLOAD_INIT_SCHEMA_N6(type, memberId, etype, name, desc, arraylen) \
0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen,
#define _NVTX_PAYLOAD_INIT_SCHEMA_N7(type, memberId, etype, name, desc, arraylen, flags) \
NVTX_PAYLOAD_ENTRY_FLAG_##flags, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen,
#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_FRONT(...) \
NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SCHEMA_N, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
#define _NVTX_PAYLOAD_ARRAY_MEMBER1(name) name
#define _NVTX_PAYLOAD_ARRAY_MEMBER2(name, count) name
/* Resolve to last part of schema entry (append struct member). */
#define _NVTX_PAYLOAD_INIT_SCHEMA_NX3(type, memberId, ...) memberId
#define _NVTX_PAYLOAD_INIT_SCHEMA_NX4(type, memberId, ...) memberId
#define _NVTX_PAYLOAD_INIT_SCHEMA_NX5(type, memberId, ...) memberId
#define _NVTX_PAYLOAD_INIT_SCHEMA_NX6(type, memberId, ...) \
NVTX_EXT_CONCAT(_NVTX_PAYLOAD_ARRAY_MEMBER, NVTX_EXT_NUM_ARGS memberId) memberId
#define _NVTX_PAYLOAD_INIT_SCHEMA_NX7(type, memberId, ...) \
_NVTX_PAYLOAD_INIT_SCHEMA_NX6(type, memberId, __VA_ARGS__)
#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_END(...) \
NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SCHEMA_NX, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
/* Resolve to schema entry. `entry` is `(type, memberId, etype, ...)`. */
#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(struct_id, entry) \
{_NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_FRONT entry \
offsetof(struct_id, _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_END entry)},
/* Handle up to 16 schema entries. */
#define _NVTX_PAYLOAD_INIT_SME1(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1)
#define _NVTX_PAYLOAD_INIT_SME2(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME1(s, __VA_ARGS__)
#define _NVTX_PAYLOAD_INIT_SME3(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME2(s, __VA_ARGS__)
#define _NVTX_PAYLOAD_INIT_SME4(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME3(s, __VA_ARGS__)
#define _NVTX_PAYLOAD_INIT_SME5(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME4(s, __VA_ARGS__)
#define _NVTX_PAYLOAD_INIT_SME6(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME5(s, __VA_ARGS__)
#define _NVTX_PAYLOAD_INIT_SME7(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME6(s, __VA_ARGS__)
#define _NVTX_PAYLOAD_INIT_SME8(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME7(s, __VA_ARGS__)
#define _NVTX_PAYLOAD_INIT_SME9(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME8(s, __VA_ARGS__)
#define _NVTX_PAYLOAD_INIT_SME10(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME9(s, __VA_ARGS__)
#define _NVTX_PAYLOAD_INIT_SME11(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME10(s, __VA_ARGS__)
#define _NVTX_PAYLOAD_INIT_SME12(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME11(s, __VA_ARGS__)
#define _NVTX_PAYLOAD_INIT_SME13(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME12(s, __VA_ARGS__)
#define _NVTX_PAYLOAD_INIT_SME14(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME13(s, __VA_ARGS__)
#define _NVTX_PAYLOAD_INIT_SME15(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME14(s, __VA_ARGS__)
#define _NVTX_PAYLOAD_INIT_SME16(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME15(s, __VA_ARGS__)
#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, ...) \
nvtxPayloadSchemaEntry_t struct_id##Schema[] = { \
NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SME, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(struct_id, __VA_ARGS__) \
{0, 0} \
};
/*
* Handle optional parameters for `NVTX_DEFINE_STRUCT_WITH_SCHEMA[_AND_REGISTER]`.
*/
#define _NVTX_DEFINE_SWS_6(struct_id, schema_name, prefix, schema_flags, schema_id, entries) \
_NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, \
NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS | \
NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID |, \
NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
#define _NVTX_DEFINE_SWS_5(struct_id, schema_name, prefix, schema_flags, entries) \
_NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, 0, \
NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS |, \
NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
#define _NVTX_DEFINE_SWS_4(struct_id, schema_name, prefix, entries) \
_NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, \
NVTX_PAYLOAD_SCHEMA_ATTR_NAME |, \
NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
#define _NVTX_DEFINE_SWS_3(struct_id, schema_name, entries) \
_NVTX_DEFINE_SWS_4(struct_id, schema_name, /* no prefix */, entries)
#define _NVTX_DEFINE_SWS_2(struct_id, entries) \
_NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
_NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, NULL, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, , \
NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
#define _NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, ...) \
NVTX_EXT_CONCAT(_NVTX_DEFINE_SWS_, \
NVTX_EXT_NUM_ARGS(struct_id, __VA_ARGS__))(struct_id, __VA_ARGS__)
/*** END: Helper for `NVTX_DEFINE_STRUCT_WITH_SCHEMA[_AND_REGISTER]` ***/
#endif /* NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H */
@@ -10,14 +10,14 @@
#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
#endif
typedef void* pointer_type;
typedef void* nvtx_payload_pointer_type;
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
#include <uchar.h>
#include <stdalign.h>
#endif
/* `alignof` is available as of C11 or C++11 */
/* `alignof` is available as of C11 or C++11. */
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || (defined(__cplusplus) && __cplusplus >= 201103L)
#define nvtx_alignof(type) alignof(type)
@@ -54,7 +54,7 @@ MKTYPEDEF(double);
MKTYPEDEF2(long double, longdouble);
MKTYPEDEF(size_t);
MKTYPEDEF(pointer_type);
MKTYPEDEF(nvtx_payload_pointer_type);
MKTYPEDEF(wchar_t);
@@ -85,8 +85,16 @@ MKTYPEDEF(wchar_t);
/*
* Helper array to get the alignment for each predefined C/C++ language type.
* The order of entries must match the values in `enum nvtxPayloadSchemaEntryType`.
*
* In C++, `const` variables use internal linkage by default, but we need it to
* be public (extern) since weak declarations must be public.
*/
const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] =
NVTX_LINKONCE_DEFINE_GLOBAL
#ifdef __cplusplus
extern
#endif
const nvtxPayloadEntryTypeInfo_t
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo)[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] =
{
/* The first entry contains this array's length and the size of each entry in this array. */
{NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE, sizeof(nvtxPayloadEntryTypeInfo_t)},
@@ -119,7 +127,7 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_
/* NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE */ {sizeof(long double), nvtx_alignof2(long double, longdouble)},
/* NVTX_PAYLOAD_ENTRY_TYPE_SIZE */ {sizeof(size_t), nvtx_alignof(size_t)},
/* NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS */ {sizeof(pointer_type), nvtx_alignof(pointer_type)},
/* NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS */ {sizeof(nvtx_payload_pointer_type), nvtx_alignof(nvtx_payload_pointer_type)},
/*** Special character types ***/
/* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)},
@@ -140,4 +148,4 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_
};
#undef nvtx_alignof
#undef nvtx_alignof2
#undef nvtx_alignof2
+9 -12
Zobrazit soubor
@@ -10,37 +10,34 @@
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
#endif
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <wchar.h>
/* ---- Include required platform headers ---- */
#if defined(_WIN32)
#if defined(_WIN32)
#include <Windows.h>
#include <windows.h>
#else
#include <unistd.h>
#if defined(__ANDROID__)
#include <android/api-level.h>
#include <android/api-level.h>
#endif
#if defined(__linux__) || defined(__CYGWIN__)
#include <sched.h>
#endif
#include <sys/types.h>
#include <limits.h>
#include <dlfcn.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include <pthread.h>
#include <stdlib.h>
#include <wchar.h>
#endif
+3 -3
Zobrazit soubor
@@ -14,11 +14,11 @@
/* Prefer macros over inline functions to reduce symbol resolution at link time */
#if defined(_WIN32)
#if defined(_WIN32)
#define NVTX_PATHCHAR wchar_t
#define NVTX_STR(x) L##x
#define NVTX_GETENV _wgetenv
#define NVTX_BUFSIZE MAX_PATH
#define NVTX_BUFSIZE 16384
#define NVTX_DLLHANDLE HMODULE
#define NVTX_DLLOPEN(x) LoadLibraryW(x)
#define NVTX_DLLFUNC GetProcAddress
@@ -31,7 +31,7 @@
#define NVTX_PATHCHAR char
#define NVTX_STR(x) x
#define NVTX_GETENV getenv
#define NVTX_BUFSIZE PATH_MAX
#define NVTX_BUFSIZE 16384
#define NVTX_DLLHANDLE void*
#define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
#define NVTX_DLLFUNC dlsym
+1 -1
Zobrazit soubor
@@ -23,7 +23,7 @@
* In some situations it is desirable to declare a variable without initializing
* it, refer to it in code or other variables' initializers, and then initialize
* it later. Similarly, functions can be prototyped, have their address taken,
* and then have their body defined later. In such cases, use the FWDDECL macros
* and then have their body defined later. In such cases, use the FWDDECL macros
* when forward-declaring LINKONCE global variables without initializers and
* function prototypes, and then use the DEFINE macros when later defining them.
* Although in many cases the FWDDECL macro is equivalent to the DEFINE macro,
-86
Zobrazit soubor
@@ -1,86 +0,0 @@
/*
* Copyright 2021 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
#endif
#define NVTX_EXT_IMPL_GUARD
#include "nvtxExtImpl.h"
#undef NVTX_EXT_IMPL_GUARD
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \
NAME##_v##VERSION##_mem##COMPATID
#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \
NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID)
#define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \
NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COMPATID_PAYLOAD)
/*
* Function slots for the binary payload extension. First entry is the module
* state, initialized to `0` (`NVTX_EXTENSION_FRESH`).
*/
NVTX_LINKONCE_DEFINE_GLOBAL intptr_t
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_PAYLOAD_FN_NUM + 1]
= {0};
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)()
{
intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1;
nvtxExtModuleSegment_t segment = {
0, // unused (only one segment)
NVTX3EXT_CBID_PAYLOAD_FN_NUM,
fnSlots
};
nvtxExtModuleInfo_t module = {
NVTX_VERSION, sizeof(nvtxExtModuleInfo_t),
NVTX_EXT_MODULEID_PAYLOAD, NVTX_EXT_COMPATID_PAYLOAD,
1, &segment, // number of segments, segments
NULL, // no export function needed
// bake type sizes and alignment information into program binary
&nvtxExtPayloadTypeInfo
};
NVTX_INFO( "%s\n", __FUNCTION__ );
NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module,
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots));
}
#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \
typedef ret_val ( * fn_name##_impl_fntype )signature; \
NVTX_LINKONCE_DEFINE_FUNCTION ret_val fn_name signature { \
intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
if (slot != NVTX_EXTENSION_DISABLED) { \
if (slot) { \
return (*(fn_name##_impl_fntype)slot) arg_names; \
} else { \
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \
slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
if (slot != NVTX_EXTENSION_DISABLED && slot) { \
return (*(fn_name##_impl_fntype)slot) arg_names; \
} \
} \
} \
return ((ret_val)(intptr_t)-1); \
}
NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadSchemaRegister, (nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr), (domain, attr))
NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadEnumRegister, (nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr), (domain, attr))
#undef NVTX_EXT_FN_IMPL
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
+3
Zobrazit soubor
@@ -10,6 +10,9 @@
#define NCCL_P2P_H_
#include <cuda.h>
#include <cuda_runtime.h>
#include "core.h"
#if CUDART_VERSION < 12030
// MNNVL: FABRIC handle support lifted from CUDA 12.3
+21 -5
Zobrazit soubor
@@ -16,13 +16,29 @@
#include "shm.h"
#include "p2p.h"
// Pattern identifiers. The underlying type is fixed to uint8_t, so a value
// occupies a single byte. Enumerators are numbered consecutively from 0
// (ncclPatternRing) in declaration order.
typedef enum : uint8_t {
ncclPatternRing,
ncclPatternRingTwice,
ncclPatternPipelineFrom,
ncclPatternPipelineTo,
ncclPatternTreeUp,
ncclPatternTreeDown,
ncclPatternTreeUpDown,
ncclPatternCollnetChain,
ncclPatternCollnetDirect,
ncclPatternNvls,
ncclPatternNvlsTree,
ncclPatternSend,
ncclPatternRecv
} ncclPattern_t;
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
struct ncclProxyArgs;
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*);
#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
static_assert(2*NCCL_MAX_DEV_WORK_P2P_PER_BATCH <= MAXCHANNELS, "Not enough sub space for max work elements");
union ncclProxyOpSpecifics {
struct {
@@ -124,8 +140,9 @@ struct ncclProxyArgs {
// ProxyOps are used to communicate between main thread and service thread
// Make sure we have enough to store two full rounds of operations on all channels.
// Otherwise we'd be unable to post half of them to free new elements.
#define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P)
// Otherwise we'd be unable to post half of them to free new elements. Each
// p2p work contains a send and recv proxy op hence the 2x before it.
#define MAX_OPS_PER_PEER (2*MAXCHANNELS*2*NCCL_MAX_DEV_WORK_P2P_PER_BATCH)
struct ncclProxyOpsPool {
struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS];
@@ -243,7 +260,7 @@ struct ncclProxyState {
bool dmaBufSupport;
ncclNet_t* ncclNet;
ncclCollNet_t* ncclCollNet;
volatile uint32_t* abortFlag;
uint32_t* abortFlag;
// Service threads
pthread_t thread;
pthread_t threadUDS;
@@ -301,7 +318,6 @@ enum proxyMode {
};
ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire);
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp, int reg);
ncclResult_t ncclProxyStart(struct ncclComm* comm);
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS);
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
+5
Zobrazit soubor
@@ -1,6 +1,11 @@
#ifndef NCCL_REGISTER_H_
#define NCCL_REGISTER_H_
#include "device.h"
#include <cuda.h>
#include <stdint.h>
enum {
NET_REG_COMPLETE = 0x01,
NVLS_REG_COMPLETE = 0x02,
+27 -9
Zobrazit soubor
@@ -13,12 +13,14 @@
#include "core.h"
#define NTRANSPORTS 4
#define TRANSPORT_UNDEFINED -1
#define TRANSPORT_P2P 0
#define TRANSPORT_SHM 1
#define TRANSPORT_NET 2
#define TRANSPORT_COLLNET 3
#include "proxy.h"
#include "comm.h"
extern struct ncclTransport p2pTransport;
extern struct ncclTransport shmTransport;
@@ -45,6 +47,7 @@ struct ncclPeerInfo {
int cudaCompCap;
// MNNVL support
nvmlGpuFabricInfoV_t fabricInfo;
int cuMemSupport;
};
#define CONNECT_SIZE 128
@@ -57,17 +60,21 @@ struct ncclConnect {
#define NVLS_HANDLE_SIZE 64
struct ncclNvlsSharedRes {
int refCount;
CUmulticastObjectProp properties;
bool inited;
CUmulticastObjectProp bufProp;
CUmulticastObjectProp signalProp;
CUmemAccessDesc accessDesc;
int dev;
size_t size;
size_t granularity;
CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer
size_t buffSize;
size_t creditSize;
CUmemGenericAllocationHandle mcBuffHandle; // Multicast handle for NVLS buffer
CUmemGenericAllocationHandle mcCreditHandle; // Multicast handle for NVLS credit buffer
char* mcBuff; // Multicast NVLS buffer address
CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
char* mcCredit; // Multicast NVLS credit address
CUmemGenericAllocationHandle ucBuffHandle; // Unicast Handle for NVLS buffer
CUmemGenericAllocationHandle ucCreditHandle; // Unicast Handle for NVLS credit buffer
char* ucBuff; // Unicast NVLS buffer address
char shareableHandle[NVLS_HANDLE_SIZE];
size_t ucGran;
char* ucCredit; // Unicast NVLS credit address
int nChannels;
struct ncclShmemCollBuff nvlsShmem;
void *nvlsShmemHandle;
@@ -84,6 +91,7 @@ struct ncclCollNetSharedRes {
void* resources;
int nChannels;
size_t buffSize;
int intraHighestTransportType;
};
struct ncclTransportComm {
@@ -111,7 +119,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
ncclResult_t ncclNvlsInit(struct ncclComm* comm);
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm);
ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm);
ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts);
ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size);
ncclResult_t ncclNvlsFree(struct ncclComm* comm);
@@ -121,6 +131,14 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm);
ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufUsed, void** outHandle);
ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, struct ncclKernelPlan *plan, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle);
ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts);
ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyconn, void* handle);
ncclResult_t ncclTransportRingConnect(struct ncclComm* comm);
ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm);
ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* graphs[]);
ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm);
ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm);
#endif
+3 -2
Zobrazit soubor
@@ -9,14 +9,15 @@
#define NCCL_INT_TUNER_H_
#include "nccl_tuner.h"
#include "comm.h"
// Tuning plugin to override NCCL's default algorithm/protocol tuning.
// Attempts to load NCCL tuner from environmental variable.
// Returns ncclSuccess if the correct tuner symbol has been found and
// successfully loaded. Otherwise returns an error and also logs the error.
ncclResult_t ncclTunerPluginLoad(ncclTuner_t** tuner);
ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm);
// Cleans up NCCL tuner plugin.
ncclResult_t ncclTunerPluginUnload(ncclTuner_t** tuner);
ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm);
#endif
+39 -48
Zobrazit soubor
@@ -9,12 +9,14 @@
#include "nccl.h"
#include "alloc.h"
#include "bitops.h"
#include "checks.h"
#include <stdint.h>
#include <time.h>
#include <sched.h>
#include <algorithm>
#include <new>
#include <type_traits>
int ncclCudaCompCap();
@@ -30,11 +32,6 @@ uint64_t getHostHash();
uint64_t getPidHash();
ncclResult_t getRandomData(void* buffer, size_t bytes);
const char* ncclOpToString(ncclRedOp_t op);
const char* ncclDatatypeToString(ncclDataType_t type);
const char* ncclAlgoToString(int algo);
const char* ncclProtoToString(int proto);
struct netIf {
char prefix[64];
int port;
@@ -44,9 +41,7 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList);
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
static long log2i(long n) {
long l = 0;
while (n>>=1) l++;
return l;
return log2Down(n);
}
inline uint64_t clockNano() {
@@ -96,8 +91,11 @@ void ncclMemoryStackConstruct(struct ncclMemoryStack* me);
void ncclMemoryStackDestruct(struct ncclMemoryStack* me);
void ncclMemoryStackPush(struct ncclMemoryStack* me);
void ncclMemoryStackPop(struct ncclMemoryStack* me);
void* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t size, size_t align);
template<typename T>
T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n=1);
template<typename Header, typename Element>
inline Header* ncclMemoryStackAllocInlineArray(struct ncclMemoryStack* me, size_t nElt);
////////////////////////////////////////////////////////////////////////////////
/* ncclMemoryPool: A free-list of same-sized allocations. It is an invalid for
@@ -140,11 +138,14 @@ T* ncclIntruQueueHead(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
void ncclIntruQueueEnqueue(ncclIntruQueue<T,next> *me, T *x);
template<typename T, T *T::*next>
void ncclIntruQueueEnqueueFront(ncclIntruQueue<T,next> *me, T *x);
template<typename T, T *T::*next>
T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *memPool);
void ncclIntruQueueTransfer(ncclIntruQueue<T,next> *dst, ncclIntruQueue<T,next> *src);
////////////////////////////////////////////////////////////////////////////////
/* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex"
@@ -233,6 +234,12 @@ inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size,
return obj;
}
// Untyped allocation: obtains `size` bytes with the requested alignment from
// the stack `me` via ncclMemoryStack::allocate and returns them zero-filled.
inline void* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t size, size_t align) {
  // memset returns its destination pointer, so the cleared storage can be
  // handed straight back to the caller.
  return memset(ncclMemoryStack::allocate(me, size, align), 0, size);
}
template<typename T>
inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) {
void *obj = ncclMemoryStack::allocate(me, n*sizeof(T), alignof(T));
@@ -240,6 +247,17 @@ inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) {
return (T*)obj;
}
// Allocates a single zero-filled chunk large enough for one Header followed
// by nElt Elements. The chunk is sized as if the element array starts at
// sizeof(Header) rounded up to alignof(Element), and it is aligned to the
// stricter of the two types' alignments. Returns the chunk as a Header*.
template<typename Header, typename Element>
inline Header* ncclMemoryStackAllocInlineArray(struct ncclMemoryStack* me, size_t nElt) {
  const size_t eltAlign = alignof(Element);
  const size_t hdrAlign = alignof(Header);
  // Header size padded up to the next multiple of the element alignment.
  const size_t arrayOffset = (sizeof(Header) + eltAlign - 1) & ~(eltAlign - 1);
  const size_t totalBytes = arrayOffset + nElt*sizeof(Element);
  const size_t chunkAlign = hdrAlign < eltAlign ? eltAlign : hdrAlign;
  void *chunk = ncclMemoryStack::allocate(me, totalBytes, chunkAlign);
  memset(chunk, 0, totalBytes);
  return (Header*)chunk;
}
inline void ncclMemoryStackPush(struct ncclMemoryStack* me) {
using Frame = ncclMemoryStack::Frame;
Frame tmp = me->topFrame;
@@ -343,6 +361,13 @@ inline void ncclIntruQueueEnqueue(ncclIntruQueue<T,next> *me, T *x) {
me->tail = x;
}
// Pushes x onto the front of the queue: x links to the previous head and
// becomes the new head. When the queue was empty, x also becomes the tail.
template<typename T, T *T::*next>
inline void ncclIntruQueueEnqueueFront(ncclIntruQueue<T,next> *me, T *x) {
  T *previousHead = me->head;
  x->*next = previousHead;
  me->head = x;
  if (previousHead == nullptr) {
    me->tail = x;
  }
}
template<typename T, T *T::*next>
inline T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me) {
T *ans = me->head;
@@ -388,45 +413,11 @@ inline T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me) {
}
// Empties the queue and returns every node it held to `pool`.
// The queue's head/tail are cleared first, then the detached chain is walked
// and each node is released with ncclMemoryPoolFree.
template<typename T, T *T::*next>
void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *pool) {
  T *node = me->head;
  me->head = nullptr;
  me->tail = nullptr;
  while (node != nullptr) {
    // Read the link before releasing the node: once freed it must not be
    // dereferenced again.
    T *following = node->*next;
    // Free the current node itself (not its successor), so the first node is
    // released too and a null pointer is never handed to the pool.
    ncclMemoryPoolFree(pool, node);
    node = following;
  }
}
/* cmp function determines the sequence of objects in the queue. If cmp returns value >= 0, it means a > b,
* and we should put a before b; otherwise, b should be put ahead of a. */
template<typename T, T *T::*next>
inline void ncclIntruQueueSortEnqueue(ncclIntruQueue<T,next> *me, T *x, int (*cmp)(T *a, T *b)) {
T *cur = me->head;
T *prev = NULL;
if (cur == NULL) {
x->*next = nullptr;
me->tail = me->head = x;
} else {
while (cur) {
if (cmp(cur, x) > 0) {
prev = cur;
cur = cur->next;
} else {
break;
}
}
x->*next = cur;
if (prev) {
prev->*next = x;
if (cur == NULL) me->tail = x;
} else {
me->head = x;
}
}
void ncclIntruQueueTransfer(ncclIntruQueue<T,next> *dst, ncclIntruQueue<T,next> *src) {
(dst->tail ? dst->tail->next : dst->head) = src->head;
if (src->tail) dst->tail = src->tail;
src->head = nullptr;
src->tail = nullptr;
}
////////////////////////////////////////////////////////////////////////////////
+379 -540
Zobrazit soubor
Rozdílový obsah nebyl zobrazen, protože je příliš veliký Načíst rozdílové porovnání
+7 -6
Zobrazit soubor
@@ -2,11 +2,11 @@
#include "nvtx.h"
// Name/value table for the ncclRedOp_t reduction operators, registered with
// NVTX as an enum payload schema. Each operator is listed exactly once and
// all three members of every entry are initialized explicitly.
// NOTE(review): the third member is presumably nvtxPayloadEnum_t's flag
// field - confirm against the NVTX payload header.
static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = {
  {"Sum", ncclSum, 0},
  {"Product", ncclProd, 0},
  {"Max", ncclMax, 0},
  {"Min", ncclMin, 0},
  {"Avg", ncclAvg, 0}
};
// Must be called before the first call to any reduction operation.
@@ -19,7 +19,8 @@ void initNvtxRegisteredEnums() {
.entries = NvtxEnumRedSchema,
.numEntries = std::extent<decltype(NvtxEnumRedSchema)>::value,
.sizeOfEnum = sizeof(ncclRedOp_t),
.schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP
.schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP,
.extension = nullptr
};
nvtxPayloadEnumRegister(nvtx3::domain::get<nccl_domain>(), &eAttr);
-2
Zobrazit soubor
@@ -52,8 +52,6 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
WARN("%s : invalid type %d", info->opName, info->datatype);
return ncclInvalidArgument;
}
// Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars.
NCCLCHECK(ncclInfoSetDerived(info, info->comm->nRanks));
if (info->op < 0 || ncclMaxRedOp < info->op) {
WARN("%s : invalid reduction operation %d", info->opName, info->op);
+8
Zobrazit soubor
@@ -59,6 +59,10 @@ DECLARE_CUDA_PFN(cuGetErrorString);
DECLARE_CUDA_PFN(cuGetErrorName);
/* enqueue.cc */
DECLARE_CUDA_PFN(cuMemGetAddressRange);
DECLARE_CUDA_PFN(cuLaunchKernel);
#if CUDA_VERSION >= 11080
DECLARE_CUDA_PFN(cuLaunchKernelEx);
#endif
/* proxy.cc */
DECLARE_CUDA_PFN(cuCtxCreate);
DECLARE_CUDA_PFN(cuCtxDestroy);
@@ -137,6 +141,10 @@ static ncclResult_t cudaPfnFuncLoader(void) {
LOAD_SYM(cuCtxGetCurrent, 1);
LOAD_SYM(cuCtxSetCurrent, 1);
LOAD_SYM(cuCtxGetDevice, 1);
LOAD_SYM(cuLaunchKernel, 1);
#if CUDA_VERSION >= 11080
LOAD_SYM(cuLaunchKernelEx, 1);
#endif
/* cuMem API support */
LOAD_SYM(cuMemAddressReserve, 1);
LOAD_SYM(cuMemAddressFree, 1);
+5 -5
Zobrazit soubor
@@ -130,7 +130,7 @@ ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint6
int ret;
GDRLOCKCALL(gdr_internal_pin_buffer(g, addr, size, p2p_token, va_space, handle), ret);
if (ret != 0) {
WARN("gdr_pin_buffer(addr %lx, size %zi) failed: %d", addr, size, ret);
WARN("gdr_pin_buffer(addr %lx, size %zu) failed: %d", addr, size, ret);
return ncclSystemError;
}
return ncclSuccess;
@@ -172,7 +172,7 @@ ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) {
int ret;
GDRLOCKCALL(gdr_internal_map(g, handle, va, size), ret);
if (ret != 0) {
WARN("gdr_map(handle %lx, size %zi) failed: %d", handle.h, size, ret);
WARN("gdr_map(handle %lx, size %zu) failed: %d", handle.h, size, ret);
return ncclSystemError;
}
return ncclSuccess;
@@ -186,7 +186,7 @@ ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) {
int ret;
GDRLOCKCALL(gdr_internal_unmap(g, handle, va, size), ret);
if (ret != 0) {
WARN("gdr_unmap(handle %lx, va %p, size %zi) failed: %d", handle.h, va, size, ret);
WARN("gdr_unmap(handle %lx, va %p, size %zu) failed: %d", handle.h, va, size, ret);
return ncclSystemError;
}
return ncclSuccess;
@@ -218,7 +218,7 @@ ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const vo
int ret;
GDRLOCKCALL(gdr_internal_copy_to_mapping(handle, map_d_ptr, h_ptr, size), ret);
if (ret != 0) {
WARN("gdr_copy_to_mapping(handle %lx, map_d_ptr %p, h_ptr %p, size %zi) failed: %d", handle.h, map_d_ptr, h_ptr, size, ret);
WARN("gdr_copy_to_mapping(handle %lx, map_d_ptr %p, h_ptr %p, size %zu) failed: %d", handle.h, map_d_ptr, h_ptr, size, ret);
return ncclSystemError;
}
return ncclSuccess;
@@ -232,7 +232,7 @@ ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void
int ret;
GDRLOCKCALL(gdr_internal_copy_from_mapping(handle, h_ptr, map_d_ptr, size), ret);
if (ret != 0) {
WARN("gdr_copy_from_mapping(handle %lx, h_ptr %p, map_d_ptr %p, size %zi) failed: %d", handle.h, h_ptr, map_d_ptr, size, ret);
WARN("gdr_copy_from_mapping(handle %lx, h_ptr %p, map_d_ptr %p, size %zu) failed: %d", handle.h, h_ptr, map_d_ptr, size, ret);
return ncclSystemError;
}
return ncclSuccess;
+2 -2
Zobrazit soubor
@@ -132,7 +132,7 @@ ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
WARN("UDS: Receiving data over socket failed : %d", errno);
return ncclSystemError;
}
if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;
if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError;
}
if (recvFd != NULL) {
@@ -221,7 +221,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno);
return ncclSystemError;
}
if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;
if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError;
}
return ncclSuccess;
+41
Zobrazit soubor
@@ -41,11 +41,19 @@ namespace {
NCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values))
// MNNVL support
NCCL_NVML_FN(nvmlDeviceGetGpuFabricInfoV, nvmlReturn_t, (nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo))
// CC support
NCCL_NVML_FN(nvmlSystemGetConfComputeState, nvmlReturn_t, (nvmlConfComputeSystemState_t *state));
NCCL_NVML_FN(nvmlSystemGetConfComputeSettings, nvmlReturn_t, (nvmlSystemConfComputeSettings_t *setting));
std::mutex lock; // NVML has had some thread safety bugs
bool initialized = false;
thread_local bool threadInitialized = false;
ncclResult_t initResult;
// Scratch storage for the confidential-computing query in ncclNvmlGetCCStatus.
// Only one member is used per call, depending on which NVML entry point was
// resolved at load time.
// NOTE(review): the V12020 / V12040 suffixes presumably name the NVML release
// that introduced each structure - confirm.
union nvmlCCInfoInternal {
// Filled by nvmlSystemGetConfComputeState (fallback query).
nvmlConfComputeSystemState_t settingV12020;
// Filled by nvmlSystemGetConfComputeSettings (preferred query; carries multiGpuMode).
nvmlSystemConfComputeSettings_t settingV12040;
};
}
ncclResult_t ncclNvmlEnsureInitialized() {
@@ -87,6 +95,9 @@ ncclResult_t ncclNvmlEnsureInitialized() {
{(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"},
// MNNVL support
{(void**)&pfn_nvmlDeviceGetGpuFabricInfoV, "nvmlDeviceGetGpuFabricInfoV"},
// CC support
{(void**)&pfn_nvmlSystemGetConfComputeState, "nvmlSystemGetConfComputeState"},
{(void**)&pfn_nvmlSystemGetConfComputeSettings, "nvmlSystemGetConfComputeSettings"}
};
for(Symbol sym: symbols) {
*sym.ppfn = dlsym(libhandle, sym.name);
@@ -282,3 +293,33 @@ ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricI
NVMLTRY(nvmlDeviceGetGpuFabricInfoV, device, gpuFabricInfo);
return ncclSuccess;
}
// Fills `status` with the system's confidential-computing (CC) state as
// reported by NVML.
//   - If nvmlSystemGetConfComputeSettings was resolved, it provides both the
//     CC feature flag and the multi-GPU mode.
//   - Otherwise, if nvmlSystemGetConfComputeState was resolved, it provides
//     only the CC feature flag; multi-GPU CC is reported as disabled.
//   - If neither symbol is available, both flags are reported as disabled.
// Returns ncclSuccess once `status` is filled; NCCLCHECK / NVMLTRY failures
// return early before `status` is written in that branch.
ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status) {
  NCCLCHECK(ncclNvmlEnsureInitialized());
  std::lock_guard<std::mutex> locked(lock);
  nvmlCCInfoInternal ccInfo;

  if (pfn_nvmlSystemGetConfComputeSettings != NULL) {
    // The settings structure is versioned; set the version before the query.
    ccInfo.settingV12040.version = nvmlSystemConfComputeSettings_v1;
    NVMLTRY(nvmlSystemGetConfComputeSettings, &ccInfo.settingV12040);
    status->CCEnabled =
        (ccInfo.settingV12040.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED);
    status->multiGpuCCEnabled =
        (ccInfo.settingV12040.multiGpuMode == NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE);
    return ncclSuccess;
  }

  if (pfn_nvmlSystemGetConfComputeState != NULL) {
    NVMLTRY(nvmlSystemGetConfComputeState, &ccInfo.settingV12020);
    status->CCEnabled =
        (ccInfo.settingV12020.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED);
    // This query carries no multi-GPU information.
    status->multiGpuCCEnabled = false;
    return ncclSuccess;
  }

  // No CC query entry point is available in the loaded NVML library.
  status->CCEnabled = false;
  status->multiGpuCCEnabled = false;
  return ncclSuccess;
}
+1 -1
Zobrazit soubor
@@ -84,4 +84,4 @@ const char *ncclGetEnv(const char *name) {
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, initEnv);
return getenv(name);
}
}
+25 -9
Zobrazit soubor
@@ -63,13 +63,28 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
* goes down to 0, unlink should be called in order to delete shared memory file. */
if (shmPath[0] == '\0') {
sprintf(shmPath, "/dev/shm/nccl-XXXXXX");
retry_mkstemp:
fd = mkstemp(shmPath);
if (fd < 0) {
if (errno == EINTR) {
INFO(NCCL_ALL, "mkstemp: Failed to create %s, error: %s (%d) - retrying", shmPath, strerror(errno), errno);
goto retry_mkstemp;
}
WARN("Error: failed to create shared memory file %p, error %s (%d)", shmPath, strerror(errno), errno);
ret = ncclSystemError;
goto fail;
}
} else {
SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
}
retry_fallocate:
if (fallocate(fd, 0, 0, realShmSize) != 0) {
WARN("Error: failed to extend %s to %ld bytes", shmPath, realShmSize);
if (errno == EINTR) {
INFO(NCCL_ALL, "fallocate: Failed to extend %s to %ld bytes, error: %s (%d) - retrying", shmPath, realShmSize, strerror(errno), errno);
goto retry_fallocate;
}
WARN("Error: failed to extend %s to %ld bytes, error: %s (%d)", shmPath, realShmSize, strerror(errno), errno);
ret = ncclSystemError;
goto fail;
}
@@ -80,7 +95,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
hptr = (char*)mmap(NULL, realShmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (hptr == MAP_FAILED) {
WARN("Could not map %s size %zi, error: %s", shmPath, realShmSize, strerror(errno));
WARN("Error: Could not map %s size %zu, error: %s (%d)", shmPath, realShmSize, strerror(errno), errno);
ret = ncclSystemError;
hptr = NULL;
goto fail;
@@ -93,7 +108,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
if (remref == 0) {
/* the last peer has completed attachment, it should unlink the shm mem file. */
if (unlink(shmPath) != 0) {
WARN("unlink shared memory %s failed, error: %s", shmPath, strerror(errno));
INFO(NCCL_ALLOC, "unlink shared memory %s failed, error: %s (%d)", shmPath, strerror(errno), errno);
}
}
}
@@ -110,7 +125,8 @@ exit:
*handle = (ncclShmHandle_t)tmphandle;
return ret;
fail:
WARN("Error while %s shared memory segment %s (size %ld)", create ? "creating" : "attaching to", shmPath, shmSize);
WARN("Error while %s shared memory segment %s (size %ld), error: %s (%d)", create ? "creating" : "attaching to",
shmPath, shmSize, strerror(errno), errno);
if (tmphandle) {
shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle);
ncclShmClose((ncclShmHandle_t)tmphandle);
@@ -129,7 +145,7 @@ ncclResult_t ncclShmClose(ncclShmHandle_t handle) {
close(tmphandle->fd);
if (tmphandle->shmPath != NULL && tmphandle->refcount != NULL && *tmphandle->refcount > 0) {
if (unlink(tmphandle->shmPath) != 0) {
WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno));
WARN("unlink shared memory %s failed, error: %s (%d)", tmphandle->shmPath, strerror(errno), errno);
ret = ncclSystemError;
}
}
@@ -139,7 +155,7 @@ ncclResult_t ncclShmClose(ncclShmHandle_t handle) {
if (tmphandle->shmPtr) {
if (tmphandle->devShmPtr) CUDACHECK(cudaHostUnregister(tmphandle->shmPtr));
if (munmap(tmphandle->shmPtr, tmphandle->realShmSize) != 0) {
WARN("munmap of shared memory %p size %ld failed, error: %s", tmphandle->shmPtr, tmphandle->realShmSize, strerror(errno));
WARN("munmap of shared memory %p size %ld failed, error: %s (%d)", tmphandle->shmPtr, tmphandle->realShmSize, strerror(errno), errno);
ret = ncclSystemError;
}
}
@@ -152,9 +168,9 @@ ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) {
ncclResult_t ret = ncclSuccess;
struct shmHandleInternal* tmphandle = (struct shmHandleInternal*)handle;
if (tmphandle) {
if (tmphandle->shmPath != NULL) {
if (tmphandle->shmPath != NULL && tmphandle->refcount != NULL && *tmphandle->refcount > 0) {
if (unlink(tmphandle->shmPath) != 0) {
WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno));
WARN("unlink shared memory %s failed, error: %s (%d)", tmphandle->shmPath, strerror(errno), errno);
ret = ncclSystemError;
}
free(tmphandle->shmPath);
@@ -184,7 +200,7 @@ ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff
uint64_t t0 = clockNano();
while(__atomic_load_n(shmem->cnt[curRound], __ATOMIC_ACQUIRE) != comm->localRanks + 1) {
if (clockNano() - t0 >= 5 * 1000) sched_yield();
if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 1) {
if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE) == 1) {
ret = ncclInternalError;
goto exit;
}

Některé soubory nejsou zobrazeny, neboť je v této revizi změněno mnoho souborů Zobrazit více