Merge remote-tracking branch 'nccl/master' into no-target-id

This commit is contained in:
Wenkai Du
2020-12-01 11:33:47 -05:00
bovenliggende 2e8b3a0857 920dbe5b35
commit d469947641
106 gewijzigde bestanden met toevoegingen van 11943 en 4104 verwijderingen
+3 -1
Bestand weergeven
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -16,6 +16,8 @@ ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commSt
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr);
ncclResult_t bootstrapRemFree(int id, int rank, void* commState);
ncclResult_t bootstrapClose(void* commState);
ncclResult_t bootstrapAbort(void* commState);
#endif
+1 -1
Bestand weergeven
@@ -24,7 +24,7 @@ static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, voi
static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; }
static ncclResult_t collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
static ncclResult_t collNetFlush(void* collComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclCollNet->flush(collComm, data, size, mhandle)); return ncclSuccess; }
static ncclResult_t collNetIflush(void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; }
static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; }
static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
+44 -56
Bestand weergeven
@@ -8,63 +8,60 @@
#ifndef NCCL_COLLECTIVES_H_
#define NCCL_COLLECTIVES_H_
#define FUNC_INDEX_P2P (4+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps)
#define FUNC_INDEX(coll, redop, dtype, al, pr) ((coll >= NCCL_NUM_FUNCTIONS) \
? (coll-NCCL_NUM_FUNCTIONS+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps) \
: ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)))
#define FUNC_INDEX_P2P 1800
#define FUNC_INDEX(func, redop, ncclType, al, pr) ((((((func)*ncclNumOps + (redop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
#define NCCL_COLL_NAME(coll, op, dtype) \
coll##_##op##_##dtype
#define NCCL_FUNC_NAME(func, algo, proto, redop, type) \
ncclFunction_##func##_##algo##_##proto##_##redop##_##type
#define NCCL_KERN_NAME(coll, op, dtype) \
coll##Kernel_##op##_##dtype
#define NCCL_KERN_NAME(func, algo, proto, redop, type) \
ncclKernel_##func##_##algo##_##proto##_##redop##_##type
#define NCCL_IMPL_NAME(func, algo, proto) \
nccl##func##algo##proto
/* Declare all collective operations */
#define DECL_COLL5(coll, op, dtype) \
extern __device__ __attribute__((noinline)) void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclDevComm* comm); \
#define DECL5(func, algo, proto, redop, type) \
extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, redop, type)(struct ncclWorkElem* args); \
extern __global__ void NCCL_KERN_NAME(func, algo, proto, redop, type)(struct ncclWorkElem first); \
#define DECL_COLL4(coll, op, dtype) \
DECL_COLL5(coll, op, dtype) \
DECL_COLL5(coll##LL, op, dtype) \
DECL_COLL5(coll##LL128, op, dtype)
#define DECL4(func, algo, redop, type) \
DECL5(func, algo, SIMPLE, redop, type) \
DECL5(func, algo, LL, redop, type) \
DECL5(func, algo, LL128, redop, type)
#define DECL_COLL3(coll, op, dtype) \
DECL_COLL4(coll##Ring, op, dtype) \
DECL_COLL4(coll##Tree, op, dtype) \
DECL_COLL4(coll##CollNet, op, dtype)
#define DECL3(func, redop, type) \
DECL4(func, RING, redop, type) \
DECL4(func, TREE, redop, type) \
DECL4(func, COLLNET, redop, type)
#define DECL_COLL2(coll, op) \
DECL_COLL3(coll, op, i8) \
DECL_COLL3(coll, op, u8) \
DECL_COLL3(coll, op, i32) \
DECL_COLL3(coll, op, u32) \
DECL_COLL3(coll, op, i64) \
DECL_COLL3(coll, op, u64) \
DECL_COLL3(coll, op, f16) \
DECL_COLL3(coll, op, f32) \
DECL_COLL3(coll, op, f64) \
DECL_COLL3(coll, op, b16)
#define DECL2(func, redop) \
DECL3(func, redop, int8_t) \
DECL3(func, redop, uint8_t) \
DECL3(func, redop, int32_t) \
DECL3(func, redop, uint32_t) \
DECL3(func, redop, int64_t) \
DECL3(func, redop, uint64_t) \
DECL3(func, redop, half) \
DECL3(func, redop, float) \
DECL3(func, redop, double) \
DECL3(func, redop, rccl_bfloat16)
#define DECL_COLL(coll) \
DECL_COLL2(coll, sum) \
DECL_COLL2(coll, prod) \
DECL_COLL2(coll, min) \
DECL_COLL2(coll, max)
#define DECL(func) \
DECL2(func, Sum) \
DECL2(func, Prod) \
DECL2(func, Min) \
DECL2(func, Max)
#define DECL_ALL_COLLS \
DECL_COLL2(ncclBroadcast, copy) \
DECL_COLL(ncclReduce) \
DECL_COLL2(ncclAllGather, copy) \
DECL_COLL(ncclReduceScatter) \
DECL_COLL(ncclAllReduce) \
DECL_COLL5(ncclGather, copy, i8) \
DECL_COLL5(ncclScatter, copy, i8) \
DECL_COLL5(ncclAllToAll, copy, i8) \
DECL_COLL5(ncclAllToAllv, copy, i8) \
DECL_COLL5(ncclSendRecv, copy, i8) \
#define DECL_ALL \
DECL2(Broadcast, Sum) \
DECL(Reduce) \
DECL2(AllGather, Sum) \
DECL(ReduceScatter) \
DECL(AllReduce) \
DECL5(SendRecv, RING, SIMPLE, Sum, int8_t) \
DECL_ALL_COLLS
DECL_ALL
// CHUNKSIZE must be a multiple of SLICESIZE
//#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
@@ -84,13 +81,4 @@ DECL_ALL_COLLS
#define REDUCE_SLICESTEPS 1
#define REDUCE_CHUNKSTEPS 1
#define SENDRECV_SLICEFACTOR 1
#define GATHER_SLICESTEPS 4
#define GATHER_CHUNKSTEPS 4
#define SCATTER_SLICESTEPS 4
#define SCATTER_CHUNKSTEPS 4
#define ALLTOALL_SLICESTEPS 4
#define ALLTOALL_CHUNKSTEPS 4
#define ALLTOALLV_SLICESTEPS 4
#define ALLTOALLV_CHUNKSTEPS 4
#endif
+17 -4
Bestand weergeven
@@ -52,8 +52,8 @@ struct ncclRecvMem {
struct {
uint64_t tail;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
int sizesFifo[NCCL_STEPS];
void* ptrsFifo[NCCL_STEPS];
};
char pad4[MEM_ALIGN];
};
@@ -67,6 +67,10 @@ struct ncclComm {
struct ncclTopoSystem* topo;
void* bootstrap;
// Bitmasks for ncclTransportP2pSetup
int connect;
uint32_t* connectSend;
uint32_t* connectRecv;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
@@ -131,8 +135,8 @@ struct ncclComm {
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclDevComm* args;
struct ncclDevComm** argsptr;
struct ncclWorkElem args;
void* argsptr;
// Global proxy thread
pthread_t proxyThread;
@@ -140,8 +144,17 @@ struct ncclComm {
// Whether this communicator uses collNet
int collNetSupport;
// Store info of async operations
struct ncclInfo* asyncOps;
int asyncOpCount;
size_t asyncTotalSize;
//list of async p2p operation queued in a group semantics
struct ncclP2Plist p2plist;
struct ncclP2Plist* p2pSends;
struct ncclP2Plist* p2pRecvs;
int p2pSendCount;
int p2pRecvCount;
// RCCL AllToAll/Scatter/Gather API
bool alltoallDisable;
+1
Bestand weergeven
@@ -57,5 +57,6 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
#include "alloc.h"
#include "utils.h"
#include "param.h"
#include "nvtx_stub.h"
#endif // end include guard
+2 -2
Bestand weergeven
@@ -19,7 +19,7 @@ static int hexToInt(char c) {
#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
uint32_t cpumasks[CPU_SET_N_U32];
int m = CPU_SET_N_U32-1;
cpumasks[m] = 0;
@@ -42,7 +42,7 @@ ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
return ncclSuccess;
}
ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
int c = 0;
uint8_t* m8 = (uint8_t*)mask;
for (int o=sizeof(cpu_set_t)-1; o>=0; o--) {
+44 -64
Bestand weergeven
@@ -23,8 +23,8 @@
#endif
#define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollGather, ncclCollScatter, ncclCollAllToAll, ncclCollAllToAllv, ncclCollSendRecv} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+4];
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
#define NCCL_ALGO_TREE 0
@@ -59,6 +59,7 @@ union ncclLLFifoLine {
#define WARP_SIZE 64
#define MAXCHANNELS 32
#define NCCL_MAX_NTHREADS 256
#define NCCL_SIMPLE_MAX_NTHREADS NCCL_MAX_NTHREADS
#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
#define NCCL_LL_LINES_PER_THREAD 8
#ifdef TEST_LL_CLEANUP
@@ -72,7 +73,7 @@ union ncclLLFifoLine {
// Make sure the clean mask will last for at least NCCL_NSTEPS
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
#define NCCL_LL128_LINESIZE 64
#define NCCL_LL128_LINESIZE 128
#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)
@@ -83,15 +84,12 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
// to 3 dests. Use 70% for reduce and 30% for bcast.
#define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32)
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 2
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
#define NCCL_DIRECT_GPU 0x01
#define NCCL_DIRECT_NIC 0x10
#define MAXBARRIERS 2
#define MAXWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
struct ncclConnInfo {
// Regular comm mechanism
char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
@@ -99,9 +97,11 @@ struct ncclConnInfo {
uint64_t *head; // Local for send, remote for recv
int direct; // Direct communication
int shared; // Buffers are shared
void **ptrExchange; // Pointer exchange for direct communication
int *fifo; // Size fifo for proxy
int *sizesFifo; // Sizes fifo from GPU to proxy
void* *ptrsFifo; // Buffer fifo from proxy to GPU
uint64_t step; // Keep where we are
uint64_t llLastCleaning;
@@ -110,7 +110,6 @@ struct ncclConnInfo {
// allows software to explicitly initiate a flush read to HDP memory. See more
// descriptions in primitives.h.
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
};
struct ncclConnector {
@@ -151,68 +150,53 @@ struct ncclDevComm;
#pragma pack(push) /* push current alignment to stack */
#pragma pack(4) /* set alignment to 4 bytes boundary */
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclColl. */
struct CollectiveArgs {
struct ncclDevComm* comm;
uint64_t opCount;
#define NCCL_MAX_WORK_ELEMENTS 2
#define NCCL_MAX_GROUPS (NCCL_MAX_WORK_ELEMENTS*2)
/* ncclWork is to be a power of two, currently 8x64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclWorkElem. */
struct ncclWorkElem {
// Header
struct ncclDevComm* comm;
uint16_t nThreads;
uint16_t funcIndex;
uint16_t index;
uint16_t active;
// local and remote input, output, and buffer
const void * sendbuff;
void * recvbuff;
// Op-specific fields. Make sure the common part stays the
// same on all structs of the union
uint64_t opCount;
// Op-specific fields.
union {
struct {
uint16_t nThreads;
} common;
struct {
uint16_t nThreads;
uint8_t bid;
uint8_t nChannels;
uint32_t root;
size_t count;
size_t lastChunkSize;
} coll;
struct {
uint16_t nThreads;
uint16_t unused;
int32_t delta;
size_t sendCount;
size_t recvCount;
} p2p;
struct {
uint16_t nThreads;
uint32_t root;
uint8_t bid;
uint8_t nChannels;
size_t count;
size_t* extra;
} a2av;
};
};
struct ncclColl {
union {
} coll;
struct {
struct CollectiveArgs args;
uint16_t funcIndex;
uint16_t nextIndex;
uint8_t active;
};
int data[0x10];
size_t sendCount;
size_t recvCount;
int32_t delta;
uint16_t nThreads;
} p2p;
uint64_t align[3];
};
};
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
struct ncclWork {
struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
};
static_assert(sizeof(struct ncclWorkElem) == (0x10*sizeof(int)), "ncclWorkElem must have a pow2 size");
struct ncclChannel {
union {
struct {
struct ncclRing ring;
struct ncclTree treeUp;
struct ncclTree treeDn;
struct ncclTree collTreeUp;
struct ncclTree collTreeDn;
struct ncclTree tree;
struct ncclTree collTree;
int id;
@@ -221,16 +205,10 @@ struct ncclChannel {
struct ncclPeer* devPeers;
// Operation list for aggregation
struct ncclColl* collectives;
size_t* collectivesExtra;
int collStart;
int collCount;
int collFifoHead; // Only used by GPU
int collFifoTail; // Only used by CPU
struct ncclWork* workFifo;
int workCount;
uint64_t workFifoTail; // Only used by CPU
uint32_t* sync;
uint64_t* barrier;
uint64_t* barrier_next;
#ifdef ENABLE_PROFILING
struct timeval tvs;
uint64_t sizes;
@@ -288,9 +266,11 @@ struct ncclProf {
#ifdef ENABLE_COLLTRACE
typedef enum {
ncclCollTraceNotReady,
ncclCollTraceKernelLaunchType,
ncclCollTraceCollEndType,
ncclCollTraceAbortType
ncclCollTraceAbortType,
ncclCollTraceDataType
} ncclCollTraceDataType_t;
struct ncclCollTrace {
@@ -304,7 +284,7 @@ struct ncclCollTrace {
};
static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must have a pow2 size");
#define COLLTRACE_NUM_ITEMS 1024
#define COLLTRACE_NUM_ITEMS 8192
#endif
struct ncclDevComm {
+2
Bestand weergeven
@@ -19,5 +19,7 @@ ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm);
ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm);
ncclResult_t ncclEnqueueEvents(struct ncclComm* comm);
ncclResult_t ncclSaveKernel(struct ncclInfo* info);
ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info);
ncclResult_t ncclSaveCommKernels(struct ncclComm* comm);
#endif // End include guard
+9 -9
Bestand weergeven
@@ -29,7 +29,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
// Query topology
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* net);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
// Set CPU affinity
@@ -45,15 +45,16 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
#define NCCL_TOPO_CPU_TYPE_ZEN 3
#define NCCL_TOPO_CPU_TYPE_ROME 4
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
#define NCCL_TOPO_MAX_NODES 256
// Init search. Needs to be done before calling ncclTopoCompute
ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
#define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Split tree (send/recv from different ranks) flowing in both directions
#define NCCL_TOPO_PATTERN_TREE 3 // Simple tree (send/recv from same rank) flowing in both directions
#define NCCL_TOPO_PATTERN_BALANCED_TREE 1 // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU)
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU)
#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU
#define NCCL_TOPO_PATTERN_RING 4 // Ring
struct ncclTopoGraph {
// Input / output
@@ -84,17 +85,16 @@ struct ncclTopoRanks {
int ringSend[MAXCHANNELS];
int ringPrev[MAXCHANNELS];
int ringNext[MAXCHANNELS];
int treeUpRecv[MAXCHANNELS];
int treeUpSend[MAXCHANNELS];
int treeDnRecv[MAXCHANNELS];
int treeDnSend[MAXCHANNELS];
int treeToParent[MAXCHANNELS];
int treeToChild0[MAXCHANNELS];
int treeToChild1[MAXCHANNELS];
};
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets);
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
+1 -7
Bestand weergeven
@@ -20,8 +20,7 @@ typedef enum {
ncclPatternTreeDown,
ncclPatternTreeUpDown,
ncclPatternCollTreeUp,
ncclPatternCollTreeDown,
ncclPatternAll
ncclPatternCollTreeDown
} ncclPattern_t;
// Used to pass NCCL call information between functions
@@ -40,11 +39,6 @@ struct ncclInfo {
// Algorithm details
int chunkSteps;
int sliceSteps;
// For alltoallv
const size_t *sendcounts;
const size_t *sdispls;
const size_t *recvcounts;
const size_t *rdispls;
// Computed later
int algorithm;
int protocol;
+15 -12
Bestand weergeven
@@ -15,6 +15,9 @@
#define NCCL_PTR_HOST 0x1
#define NCCL_PTR_CUDA 0x2
// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 8
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALL=~0} ncclDebugLogSubSys;
@@ -29,9 +32,9 @@ typedef struct {
int speed; // Port speed in Mbps.
int port; // Port number.
int maxComms; // Maximum number of comms we can create
}ncclNetProperties_v3_t;
}ncclNetProperties_v4_t;
typedef ncclNetProperties_v3_t ncclNetProperties_t;
typedef ncclNetProperties_v4_t ncclNetProperties_t;
typedef struct {
// Name of the network (mainly for logs)
@@ -41,7 +44,7 @@ typedef struct {
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
@@ -62,7 +65,7 @@ typedef struct {
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
@@ -70,11 +73,11 @@ typedef struct {
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v3_t;
} ncclNet_v4_t;
typedef ncclNet_v3_t ncclNet_t;
typedef ncclNet_v4_t ncclNet_t;
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v3
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v4
typedef struct {
// Name of the collective network (mainly for logs)
@@ -85,7 +88,7 @@ typedef struct {
// If ndev returns 0, all other functions might be set to NULL.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create connections.
@@ -105,17 +108,17 @@ typedef struct {
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*flush)(void* collComm, void* data, int size, void* mhandle);
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free collective comm objects
ncclResult_t (*closeColl)(void* collComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v3_t;
} ncclCollNet_v4_t;
typedef ncclCollNet_v3_t ncclCollNet_t;
typedef ncclCollNet_v4_t ncclCollNet_t;
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v3
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v4
#endif // end include guard
+2 -2
Bestand weergeven
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -25,7 +25,7 @@ static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, voi
static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; }
static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; }
static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclNet->flush(recvComm, data, size, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetIflush(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, data, size, mhandle, request)); return ncclSuccess; }
static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
+1 -15
Bestand weergeven
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -45,14 +45,6 @@ static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index)
NVMLCHECK(nvmlDeviceGetIndex(device, index));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetHandleByPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
NVMLCHECK(nvmlDeviceGetPciInfo(device, pci));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
NVMLCHECK(nvmlDeviceGetNvLinkState(device, link, isActive));
return ncclSuccess;
@@ -66,10 +58,6 @@ static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsig
NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor));
return ncclSuccess;
@@ -150,12 +138,10 @@ ncclResult_t wrapNvmlShutdown(void);
ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult);
ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
#endif // NVML_DIRECT
+14
Bestand weergeven
@@ -0,0 +1,14 @@
/*************************************************************************
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_NVTX_H_
#define NCCL_NVTX_H_
#include "nvtx3.hpp"
struct nccl_domain{static constexpr char const* name{"NCCL"};};
#endif
Diff onderdrukt omdat het te groot bestand Laad Diff
Diff onderdrukt omdat het te groot bestand Laad Diff
+141
Bestand weergeven
@@ -0,0 +1,141 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "nvToolsExt.h"
#include "cuda.h"
#ifndef NVTOOLSEXT_CUDA_V3
#define NVTOOLSEXT_CUDA_V3
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* ========================================================================= */
/** \name Functions for CUDA Resource Naming
*/
/** \addtogroup RESOURCE_NAMING
* \section RESOURCE_NAMING_CUDA CUDA Resource Naming
*
* This section covers the API functions that allow to annotate CUDA resources
* with user-provided names.
*
* @{
*/
/* ------------------------------------------------------------------------- */
/* \cond SHOW_HIDDEN
* \brief Used to build a non-colliding value for resource types separated class
* \version \NVTX_VERSION_2
*/
#define NVTX_RESOURCE_CLASS_CUDA 4
/** \endcond */
/* ------------------------------------------------------------------------- */
/** \brief Resource types for CUDA
*/
typedef enum nvtxResourceCUDAType_t
{
NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */
} nvtxResourceCUDAType_t;
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA device.
*
* Allows the user to associate a CUDA device with a user-provided name.
*
* \param device - The handle of the CUDA device to name.
* \param name - The name of the CUDA device.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA context.
*
* Allows the user to associate a CUDA context with a user-provided name.
*
* \param context - The handle of the CUDA context to name.
* \param name - The name of the CUDA context.
*
* \par Example:
* \code
* CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
* if ( CUDA_SUCCESS != status )
* goto Error;
* nvtxNameCuContext(cuContext, "CTX_NAME");
* \endcode
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA stream.
*
* Allows the user to associate a CUDA stream with a user-provided name.
*
* \param stream - The handle of the CUDA stream to name.
* \param name - The name of the CUDA stream.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA event.
*
* Allows the user to associate a CUDA event with a user-provided name.
*
* \param event - The handle of the CUDA event to name.
* \param name - The name of the CUDA event.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
/** @} */
/** @} */ /* END RESOURCE_NAMING */
/* ========================================================================= */
#ifdef UNICODE
#define nvtxNameCuDevice nvtxNameCuDeviceW
#define nvtxNameCuContext nvtxNameCuContextW
#define nvtxNameCuStream nvtxNameCuStreamW
#define nvtxNameCuEvent nvtxNameCuEventW
#else
#define nvtxNameCuDevice nvtxNameCuDeviceA
#define nvtxNameCuContext nvtxNameCuContextA
#define nvtxNameCuStream nvtxNameCuStreamA
#define nvtxNameCuEvent nvtxNameCuEventA
#endif
#ifdef __cplusplus
}
#endif /* __cplusplus */
#ifndef NVTX_NO_IMPL
#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot included directly */
#include "nvtxDetail/nvtxImplCuda_v3.h"
#undef NVTX_IMPL_GUARD_CUDA
#endif /*NVTX_NO_IMPL*/
#endif /* NVTOOLSEXT_CUDA_V3 */
@@ -0,0 +1,117 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "nvToolsExt.h"
#include "cuda.h"
#include "driver_types.h"
#ifndef NVTOOLSEXT_CUDART_V3
#define NVTOOLSEXT_CUDART_V3
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* ========================================================================= */
/** \name Functions for CUDA Resource Naming
*/
/** \addtogroup RESOURCE_NAMING
* \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming
*
* This section covers the API functions that allow to annotate CUDA resources
* with user-provided names.
*
* @{
*/
/* ------------------------------------------------------------------------- */
/* \cond SHOW_HIDDEN
* \brief Used to build a non-colliding value for resource types separated class
* \version \NVTX_VERSION_2
*/
#define NVTX_RESOURCE_CLASS_CUDART 5
/** \endcond */
/* ------------------------------------------------------------------------- */
/** \brief Resource types for CUDART
*/
typedef enum nvtxResourceCUDARTType_t
{
NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */
NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */
NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */
} nvtxResourceCUDARTType_t;
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA device.
*
* Allows the user to associate a CUDA device with a user-provided name.
*
* \param device - The id of the CUDA device to name.
* \param name - The name of the CUDA device.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA stream.
*
* Allows the user to associate a CUDA stream with a user-provided name.
*
* \param stream - The handle of the CUDA stream to name.
* \param name - The name of the CUDA stream.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA event.
*
* Allows the user to associate a CUDA event with a user-provided name.
*
* \param event - The handle of the CUDA event to name.
* \param name - The name of the CUDA event.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
/** @} */
/** @} */ /* END RESOURCE_NAMING */
/* ========================================================================= */
#ifdef UNICODE
#define nvtxNameCudaDevice nvtxNameCudaDeviceW
#define nvtxNameCudaStream nvtxNameCudaStreamW
#define nvtxNameCudaEvent nvtxNameCudaEventW
#else
#define nvtxNameCudaDevice nvtxNameCudaDeviceA
#define nvtxNameCudaStream nvtxNameCudaStreamA
#define nvtxNameCudaEvent nvtxNameCudaEventA
#endif
#ifdef __cplusplus
}
#endif /* __cplusplus */
#ifndef NVTX_NO_IMPL
#define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot included directly */
#include "nvtxDetail/nvtxImplCudaRt_v3.h"
#undef NVTX_IMPL_GUARD_CUDART
#endif /*NVTX_NO_IMPL*/
#endif /* NVTOOLSEXT_CUDART_V3 */
@@ -0,0 +1,191 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "nvToolsExt.h"
#include <CL/cl.h>
#ifndef NVTOOLSEXT_OPENCL_V3
#define NVTOOLSEXT_OPENCL_V3
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* ========================================================================= */
/** \name Functions for OpenCL Resource Naming
*/
/** \addtogroup RESOURCE_NAMING
* \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
*
* This section covers the API functions that allow to annotate OpenCL resources
* with user-provided names.
*
* @{
*/
/* ------------------------------------------------------------------------- */
/* \cond SHOW_HIDDEN
* \brief Used to build a non-colliding value for resource types separated class
* \version \NVTX_VERSION_2
*/
#define NVTX_RESOURCE_CLASS_OPENCL 6
/** \endcond */
/* ------------------------------------------------------------------------- */
/** \brief Resource types for OpenCL
*/
typedef enum nvtxResourceOpenCLType_t
{
NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1),
NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2),
NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3),
NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4),
NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5),
NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6),
NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7),
} nvtxResourceOpenCLType_t;
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL device.
*
* Allows to associate an OpenCL device with a user-provided name.
*
* \param device - The handle of the OpenCL device to name.
* \param name - The name of the OpenCL device.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL context.
*
* Allows to associate an OpenCL context with a user-provided name.
*
* \param context - The handle of the OpenCL context to name.
* \param name - The name of the OpenCL context.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL command queue.
*
* Allows to associate an OpenCL command queue with a user-provided name.
*
* \param command_queue - The handle of the OpenCL command queue to name.
* \param name - The name of the OpenCL command queue.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL memory object.
*
* Allows to associate an OpenCL memory object with a user-provided name.
*
* \param memobj - The handle of the OpenCL memory object to name.
* \param name - The name of the OpenCL memory object.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL sampler.
*
* Allows to associate an OpenCL sampler with a user-provided name.
*
* \param sampler - The handle of the OpenCL sampler to name.
* \param name - The name of the OpenCL sampler.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL program.
*
* Allows to associate an OpenCL program with a user-provided name.
*
* \param program - The handle of the OpenCL program to name.
* \param name - The name of the OpenCL program.
*
* \code
* cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
* (const char **) &cSourceCL, &program_length, &ciErrNum);
* shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
* nvtxNameClProgram(cpProgram, L"PROGRAM_NAME");
* \endcode
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL event.
*
* Allows to associate an OpenCL event with a user-provided name.
*
* \param evnt - The handle of the OpenCL event to name.
* \param name - The name of the OpenCL event.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
/** @} */
/** @} */ /* END RESOURCE_NAMING */
/* ========================================================================= */
#ifdef UNICODE
#define nvtxNameClDevice nvtxNameClDeviceW
#define nvtxNameClContext nvtxNameClContextW
#define nvtxNameClCommandQueue nvtxNameClCommandQueueW
#define nvtxNameClMemObject nvtxNameClMemObjectW
#define nvtxNameClSampler nvtxNameClSamplerW
#define nvtxNameClProgram nvtxNameClProgramW
#define nvtxNameClEvent nvtxNameClEventW
#else
#define nvtxNameClDevice nvtxNameClDeviceA
#define nvtxNameClContext nvtxNameClContextA
#define nvtxNameClCommandQueue nvtxNameClCommandQueueA
#define nvtxNameClMemObject nvtxNameClMemObjectA
#define nvtxNameClSampler nvtxNameClSamplerA
#define nvtxNameClProgram nvtxNameClProgramA
#define nvtxNameClEvent nvtxNameClEventA
#endif
#ifdef __cplusplus
}
#endif /* __cplusplus */
#ifndef NVTX_NO_IMPL
#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot included directly */
#include "nvtxDetail/nvtxImplOpenCL_v3.h"
#undef NVTX_IMPL_GUARD_OPENCL
#endif /*NVTX_NO_IMPL*/
#endif /* NVTOOLSEXT_OPENCL_V3 */
+382
Bestand weergeven
@@ -0,0 +1,382 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "nvToolsExt.h"
#ifndef NVTOOLSEXT_SYNC_V3
#define NVTOOLSEXT_SYNC_V3
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* \cond SHOW_HIDDEN
* \version \NVTX_VERSION_2
*/
#define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxSyncUserAttributes_v0) ) )
/** \endcond */
/**
* \page PAGE_SYNCHRONIZATION Synchronization
*
* This section covers a subset of the API that allow users to track additional
* synchronization details of their application. Naming OS synchronization primitives
* may allow users to better understand the data collected by traced synchronization
* APIs. Additionally, a user defined synchronization object can allow the users to
* to tell the tools when the user is building their own synchronization system
* that do not rely on the OS to provide behaviors and instead use techniques like
* atomic operations and spinlocks.
*
* See module \ref SYNCHRONIZATION for details.
*
* \par Example:
* \code
* class MyMutex
* {
* volatile long bLocked;
* nvtxSyncUser_t hSync;
* public:
* MyMutex(const char* name, nvtxDomainHandle_t d){
* bLocked = 0;
*
* nvtxSyncUserAttributes_t attribs = { 0 };
* attribs.version = NVTX_VERSION;
* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
* attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
* attribs.message.ascii = name;
* hSync = nvtxDomainSyncUserCreate(d, &attribs);
* }
*
* ~MyMutex() {
* nvtxDomainSyncUserDestroy(hSync);
* }
*
* bool Lock() {
* nvtxDomainSyncUserAcquireStart(hSync);
* bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic
* if (acquired) {
* nvtxDomainSyncUserAcquireSuccess(hSync);
* }
* else {
* nvtxDomainSyncUserAcquireFailed(hSync);
* }
* return acquired;
* }
* void Unlock() {
* nvtxDomainSyncUserReleasing(hSync);
* bLocked = false;
* }
* };
* \endcode
*
* \version \NVTX_VERSION_2
*/
/* ------------------------------------------------------------------------- */
/* \cond SHOW_HIDDEN
* \brief Used to build a non-colliding value for resource types separated class
* \version \NVTX_VERSION_2
*/
#define NVTX_RESOURCE_CLASS_SYNC_OS 2 /**< Synchronization objects that are OS specific. */
#define NVTX_RESOURCE_CLASS_SYNC_PTHREAD 3 /**< Synchronization objects that are from the POSIX Threads API (pthread)*/
/** \endcond */
/* ------------------------------------------------------------------------- */
/** \defgroup SYNCHRONIZATION Synchronization
* See page \ref PAGE_SYNCHRONIZATION.
* @{
*/
/** \brief Resource type values for OSs with POSIX Thread API support
*/
typedef enum nvtxResourceSyncPosixThreadType_t
{
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 1), /* pthread_mutex_t */
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_CONDITION = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 2), /* pthread_cond_t */
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_RWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 3), /* pthread_rwlock_t */
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_BARRIER = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 4), /* pthread_barrier_t */
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 5), /* pthread_spinlock_t */
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_ONCE = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 6) /* pthread_once_t */
} nvtxResourceSyncPosixThreadType_t;
/** \brief Resource type values for Windows OSs
*/
typedef enum nvtxResourceSyncWindowsType_t
{
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_EVENT = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_CRITICAL_SECTION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SRWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5)
} nvtxResourceSyncWindowsType_t;
/** \brief Resource type values for Linux and Linux derived OSs such as Android
* \sa
* ::nvtxResourceSyncPosixThreadType_t
*/
typedef enum nvtxResourceSyncLinuxType_t
{
NVTX_RESOURCE_TYPE_SYNC_LINUX_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
NVTX_RESOURCE_TYPE_SYNC_LINUX_FUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
NVTX_RESOURCE_TYPE_SYNC_LINUX_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
NVTX_RESOURCE_TYPE_SYNC_LINUX_COMPLETION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
NVTX_RESOURCE_TYPE_SYNC_LINUX_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5),
NVTX_RESOURCE_TYPE_SYNC_LINUX_SEQLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 6),
NVTX_RESOURCE_TYPE_SYNC_LINUX_RCU = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 7)
} nvtxResourceSyncLinuxType_t;
/** \brief Resource type values for Android come from Linux.
* \sa
* ::nvtxResourceSyncLinuxType_t
* ::nvtxResourceSyncPosixThreadType_t
*/
typedef enum nvtxResourceSyncLinuxType_t nvtxResourceSyncAndroidType_t;
/** \brief User Defined Synchronization Object Handle .
* \anchor SYNCUSER_HANDLE_STRUCTURE
*
* This structure is opaque to the user and is used as a handle to reference
* a user defined syncrhonization object. The tools will return a pointer through the API for the application
* to hold on it's behalf to reference the string in the future.
*
*/
typedef struct nvtxSyncUser* nvtxSyncUser_t;
/** \brief User Defined Synchronization Object Attributes Structure.
* \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE
*
* This structure is used to describe the attributes of a user defined synchronization
* object. The layout of the structure is defined by a specific version of the tools
* extension library and can change between different versions of the Tools Extension
* library.
*
* \par Initializing the Attributes
*
* The caller should always perform the following three tasks when using
* attributes:
* <ul>
* <li>Zero the structure
* <li>Set the version field
* <li>Set the size field
* </ul>
*
* Zeroing the structure sets all the event attributes types and values
* to the default value.
*
* The version and size field are used by the Tools Extension
* implementation to handle multiple versions of the attributes structure.
*
* It is recommended that the caller use one of the following to methods
* to initialize the event attributes structure:
*
* \par Method 1: Initializing nvtxEventAttributes for future compatibility
* \code
* nvtxSyncUserAttributes_t attribs = {0};
* attribs.version = NVTX_VERSION;
* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
* \endcode
*
* \par Method 2: Initializing nvtxSyncUserAttributes_t for a specific version
* \code
* nvtxSyncUserAttributes_t attribs = {0};
* attribs.version = 1;
* attribs.size = (uint16_t)(sizeof(nvtxSyncUserAttributes_t));
* \endcode
*
* If the caller uses Method 1 it is critical that the entire binary
* layout of the structure be configured to 0 so that all fields
* are initialized to the default value.
*
* The caller should either use both NVTX_VERSION and
* NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
* and a versioned type (Method 2). Using a mix of the two methods
* will likely cause either source level incompatibility or binary
* incompatibility in the future.
*
* \par Settings Attribute Types and Values
*
*
* \par Example:
* \code
* // Initialize
* nvtxSyncUserAttributes_t attribs = {0};
* attribs.version = NVTX_VERSION;
* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
*
* // Configure the Attributes
* attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
* attribs.message.ascii = "Example";
* \endcode
*
* \sa
* ::nvtxDomainSyncUserCreate
*/
typedef struct nvtxSyncUserAttributes_v0
{
/**
* \brief Version flag of the structure.
*
* Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
* supported in this header file. This can optionally be overridden to
* another version of the tools extension library.
*/
uint16_t version;
/**
* \brief Size of the structure.
*
* Needs to be set to the size in bytes of the event attribute
* structure used to specify the event.
*/
uint16_t size;
/** \brief Message type specified in this attribute structure.
*
* Defines the message format of the attribute structure's \ref nvtxSyncUserAttributes_v0::message
* "message" field.
*
* Default Value is NVTX_MESSAGE_UNKNOWN
*/
int32_t messageType; /* nvtxMessageType_t */
/** \brief Message assigned to this attribute structure.
*
* The text message that is attached to an event.
*/
nvtxMessageValue_t message;
} nvtxSyncUserAttributes_v0;
typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
/* ------------------------------------------------------------------------- */
/** \brief Create a user defined synchronization object
* This is used to track non-OS synchronization working with spinlocks and atomics
*
* \param domain - Domain to own the resource
* \param attribs - A structure to assign multiple attributes to the object.
*
* \return A handle that represents the newly created user defined synchronization object.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/
NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
/* ------------------------------------------------------------------------- */
/** \brief Destroy a user defined synchronization object
* This is used to track non-OS synchronization working with spinlocks and atomics
*
* \param handle - A handle to the object to operate on.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle);
/* ------------------------------------------------------------------------- */
/** \brief Signal to tools that an attempt to acquire a user defined synchronization object
*
* \param handle - A handle to the object to operate on.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle);
/* ------------------------------------------------------------------------- */
/** \brief Signal to tools of failure in acquiring a user defined synchronization object
* This should be called after \ref nvtxDomainSyncUserAcquireStart
*
* \param handle - A handle to the object to operate on.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle);
/* ------------------------------------------------------------------------- */
/** \brief Signal to tools of success in acquiring a user defined synchronization object
* This should be called after \ref nvtxDomainSyncUserAcquireStart.
*
* \param handle - A handle to the object to operate on.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle);
/* ------------------------------------------------------------------------- */
/** \brief Signal to tools of releasing a reservation on user defined synchronization object
* This should be called after \ref nvtxDomainSyncUserAcquireSuccess.
*
* \param handle - A handle to the object to operate on.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle);
/** @} */ /*END defgroup*/
#ifdef __cplusplus
}
#endif /* __cplusplus */
#ifndef NVTX_NO_IMPL
#define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot included directly */
#include "nvtxDetail/nvtxImplSync_v3.h"
#undef NVTX_IMPL_GUARD_SYNC
#endif /*NVTX_NO_IMPL*/
#endif /* NVTOOLSEXT_SYNC_V3 */
@@ -0,0 +1,438 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
#endif
/* ---- Include required platform headers ---- */
#if defined(_WIN32)
#include <Windows.h>
#else
#include <unistd.h>
#if defined(__ANDROID__)
#include <android/api-level.h>
#endif
#if defined(__linux__) || defined(__CYGWIN__)
#include <sched.h>
#endif
#include <limits.h>
#include <dlfcn.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include <pthread.h>
#include <stdlib.h>
#include <wchar.h>
#endif
/* ---- Define macros used in this file ---- */
#define NVTX_INIT_STATE_FRESH 0
#define NVTX_INIT_STATE_STARTED 1
#define NVTX_INIT_STATE_COMPLETE 2
#ifdef NVTX_DEBUG_PRINT
#ifdef __ANDROID__
#include <android/log.h>
#define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__);
#define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__);
#else
#include <stdio.h>
#define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
#define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
#endif
#else /* !defined(NVTX_DEBUG_PRINT) */
#define NVTX_ERR(...)
#define NVTX_INFO(...)
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#ifdef __GNUC__
#pragma GCC visibility push(hidden)
#endif
/* ---- Forward declare all functions referenced in globals ---- */
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
NvtxCallbackModule module,
NvtxFunctionTable* out_table,
unsigned int* out_size);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(
uint32_t version);
NVTX_LINKONCE_FWDDECL_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(
uint32_t exportTableId);
#include "nvtxInitDecls.h"
/* ---- Define all globals ---- */
typedef struct nvtxGlobals_t
{
volatile unsigned int initState;
NvtxExportTableCallbacks etblCallbacks;
NvtxExportTableVersionInfo etblVersionInfo;
/* Implementation function pointers */
nvtxMarkEx_impl_fntype nvtxMarkEx_impl_fnptr;
nvtxMarkA_impl_fntype nvtxMarkA_impl_fnptr;
nvtxMarkW_impl_fntype nvtxMarkW_impl_fnptr;
nvtxRangeStartEx_impl_fntype nvtxRangeStartEx_impl_fnptr;
nvtxRangeStartA_impl_fntype nvtxRangeStartA_impl_fnptr;
nvtxRangeStartW_impl_fntype nvtxRangeStartW_impl_fnptr;
nvtxRangeEnd_impl_fntype nvtxRangeEnd_impl_fnptr;
nvtxRangePushEx_impl_fntype nvtxRangePushEx_impl_fnptr;
nvtxRangePushA_impl_fntype nvtxRangePushA_impl_fnptr;
nvtxRangePushW_impl_fntype nvtxRangePushW_impl_fnptr;
nvtxRangePop_impl_fntype nvtxRangePop_impl_fnptr;
nvtxNameCategoryA_impl_fntype nvtxNameCategoryA_impl_fnptr;
nvtxNameCategoryW_impl_fntype nvtxNameCategoryW_impl_fnptr;
nvtxNameOsThreadA_impl_fntype nvtxNameOsThreadA_impl_fnptr;
nvtxNameOsThreadW_impl_fntype nvtxNameOsThreadW_impl_fnptr;
nvtxNameCuDeviceA_fakeimpl_fntype nvtxNameCuDeviceA_impl_fnptr;
nvtxNameCuDeviceW_fakeimpl_fntype nvtxNameCuDeviceW_impl_fnptr;
nvtxNameCuContextA_fakeimpl_fntype nvtxNameCuContextA_impl_fnptr;
nvtxNameCuContextW_fakeimpl_fntype nvtxNameCuContextW_impl_fnptr;
nvtxNameCuStreamA_fakeimpl_fntype nvtxNameCuStreamA_impl_fnptr;
nvtxNameCuStreamW_fakeimpl_fntype nvtxNameCuStreamW_impl_fnptr;
nvtxNameCuEventA_fakeimpl_fntype nvtxNameCuEventA_impl_fnptr;
nvtxNameCuEventW_fakeimpl_fntype nvtxNameCuEventW_impl_fnptr;
nvtxNameClDeviceA_fakeimpl_fntype nvtxNameClDeviceA_impl_fnptr;
nvtxNameClDeviceW_fakeimpl_fntype nvtxNameClDeviceW_impl_fnptr;
nvtxNameClContextA_fakeimpl_fntype nvtxNameClContextA_impl_fnptr;
nvtxNameClContextW_fakeimpl_fntype nvtxNameClContextW_impl_fnptr;
nvtxNameClCommandQueueA_fakeimpl_fntype nvtxNameClCommandQueueA_impl_fnptr;
nvtxNameClCommandQueueW_fakeimpl_fntype nvtxNameClCommandQueueW_impl_fnptr;
nvtxNameClMemObjectA_fakeimpl_fntype nvtxNameClMemObjectA_impl_fnptr;
nvtxNameClMemObjectW_fakeimpl_fntype nvtxNameClMemObjectW_impl_fnptr;
nvtxNameClSamplerA_fakeimpl_fntype nvtxNameClSamplerA_impl_fnptr;
nvtxNameClSamplerW_fakeimpl_fntype nvtxNameClSamplerW_impl_fnptr;
nvtxNameClProgramA_fakeimpl_fntype nvtxNameClProgramA_impl_fnptr;
nvtxNameClProgramW_fakeimpl_fntype nvtxNameClProgramW_impl_fnptr;
nvtxNameClEventA_fakeimpl_fntype nvtxNameClEventA_impl_fnptr;
nvtxNameClEventW_fakeimpl_fntype nvtxNameClEventW_impl_fnptr;
nvtxNameCudaDeviceA_impl_fntype nvtxNameCudaDeviceA_impl_fnptr;
nvtxNameCudaDeviceW_impl_fntype nvtxNameCudaDeviceW_impl_fnptr;
nvtxNameCudaStreamA_fakeimpl_fntype nvtxNameCudaStreamA_impl_fnptr;
nvtxNameCudaStreamW_fakeimpl_fntype nvtxNameCudaStreamW_impl_fnptr;
nvtxNameCudaEventA_fakeimpl_fntype nvtxNameCudaEventA_impl_fnptr;
nvtxNameCudaEventW_fakeimpl_fntype nvtxNameCudaEventW_impl_fnptr;
nvtxDomainMarkEx_impl_fntype nvtxDomainMarkEx_impl_fnptr;
nvtxDomainRangeStartEx_impl_fntype nvtxDomainRangeStartEx_impl_fnptr;
nvtxDomainRangeEnd_impl_fntype nvtxDomainRangeEnd_impl_fnptr;
nvtxDomainRangePushEx_impl_fntype nvtxDomainRangePushEx_impl_fnptr;
nvtxDomainRangePop_impl_fntype nvtxDomainRangePop_impl_fnptr;
nvtxDomainResourceCreate_impl_fntype nvtxDomainResourceCreate_impl_fnptr;
nvtxDomainResourceDestroy_impl_fntype nvtxDomainResourceDestroy_impl_fnptr;
nvtxDomainNameCategoryA_impl_fntype nvtxDomainNameCategoryA_impl_fnptr;
nvtxDomainNameCategoryW_impl_fntype nvtxDomainNameCategoryW_impl_fnptr;
nvtxDomainRegisterStringA_impl_fntype nvtxDomainRegisterStringA_impl_fnptr;
nvtxDomainRegisterStringW_impl_fntype nvtxDomainRegisterStringW_impl_fnptr;
nvtxDomainCreateA_impl_fntype nvtxDomainCreateA_impl_fnptr;
nvtxDomainCreateW_impl_fntype nvtxDomainCreateW_impl_fnptr;
nvtxDomainDestroy_impl_fntype nvtxDomainDestroy_impl_fnptr;
nvtxInitialize_impl_fntype nvtxInitialize_impl_fnptr;
nvtxDomainSyncUserCreate_impl_fntype nvtxDomainSyncUserCreate_impl_fnptr;
nvtxDomainSyncUserDestroy_impl_fntype nvtxDomainSyncUserDestroy_impl_fnptr;
nvtxDomainSyncUserAcquireStart_impl_fntype nvtxDomainSyncUserAcquireStart_impl_fnptr;
nvtxDomainSyncUserAcquireFailed_impl_fntype nvtxDomainSyncUserAcquireFailed_impl_fnptr;
nvtxDomainSyncUserAcquireSuccess_impl_fntype nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
nvtxDomainSyncUserReleasing_impl_fntype nvtxDomainSyncUserReleasing_impl_fnptr;
/* Tables of function pointers -- Extra null added to the end to ensure
* a crash instead of silent corruption if a tool reads off the end. */
NvtxFunctionPointer* functionTable_CORE [NVTX_CBID_CORE_SIZE + 1];
NvtxFunctionPointer* functionTable_CUDA [NVTX_CBID_CUDA_SIZE + 1];
NvtxFunctionPointer* functionTable_OPENCL[NVTX_CBID_OPENCL_SIZE + 1];
NvtxFunctionPointer* functionTable_CUDART[NVTX_CBID_CUDART_SIZE + 1];
NvtxFunctionPointer* functionTable_CORE2 [NVTX_CBID_CORE2_SIZE + 1];
NvtxFunctionPointer* functionTable_SYNC [NVTX_CBID_SYNC_SIZE + 1];
} nvtxGlobals_t;
NVTX_LINKONCE_DEFINE_GLOBAL nvtxGlobals_t NVTX_VERSIONED_IDENTIFIER(nvtxGlobals) =
{
NVTX_INIT_STATE_FRESH,
{
sizeof(NvtxExportTableCallbacks),
NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)
},
{
sizeof(NvtxExportTableVersionInfo),
NVTX_VERSION,
0,
NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)
},
/* Implementation function pointers */
NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init),
/* Tables of function pointers */
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr,
0
},
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr,
0
},
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr,
0
},
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr,
0
},
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr,
0
},
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr,
0
}
};
/* ---- Define static inline implementations of core API functions ---- */
#include "nvtxImplCore.h"
/* ---- Define implementations of export table functions ---- */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
NvtxCallbackModule module,
NvtxFunctionTable* out_table,
unsigned int* out_size)
{
unsigned int bytes = 0;
NvtxFunctionTable table = (NvtxFunctionTable)0;
switch (module)
{
case NVTX_CB_MODULE_CORE:
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE;
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE);
break;
case NVTX_CB_MODULE_CUDA:
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA;
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA);
break;
case NVTX_CB_MODULE_OPENCL:
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL;
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL);
break;
case NVTX_CB_MODULE_CUDART:
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART;
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART);
break;
case NVTX_CB_MODULE_CORE2:
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2;
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2);
break;
case NVTX_CB_MODULE_SYNC:
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC;
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC);
break;
default: return 0;
}
if (out_size)
*out_size = (bytes / (unsigned int)sizeof(NvtxFunctionPointer*)) - 1;
if (out_table)
*out_table = table;
return 1;
}
NVTX_LINKONCE_DEFINE_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(uint32_t exportTableId)
{
switch (exportTableId)
{
case NVTX_ETID_CALLBACKS: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblCallbacks;
case NVTX_ETID_VERSIONINFO: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblVersionInfo;
default: return 0;
}
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(uint32_t version)
{
/* Reserved for custom implementations to resolve problems with tools */
(void)version;
}
/* ---- Define implementations of init versions of all API functions ---- */
#include "nvtxInitDefs.h"
/* ---- Define implementations of initialization functions ---- */
#include "nvtxInit.h"
#ifdef __GNUC__
#pragma GCC visibility pop
#endif
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
@@ -0,0 +1,307 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
NVTX_DECLSPEC void NVTX_API nvtxMarkEx(const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
nvtxMarkEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr;
if(local!=0)
(*local)(eventAttrib);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxMarkA(const char* message)
{
#ifndef NVTX_DISABLE
nvtxMarkA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr;
if(local!=0)
(*local)(message);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxMarkW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
nvtxMarkW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr;
if(local!=0)
(*local)(message);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartEx(const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
nvtxRangeStartEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr;
if(local!=0)
return (*local)(eventAttrib);
else
#endif /*NVTX_DISABLE*/
return (nvtxRangeId_t)0;
}
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartA(const char* message)
{
#ifndef NVTX_DISABLE
nvtxRangeStartA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr;
if(local!=0)
return (*local)(message);
else
#endif /*NVTX_DISABLE*/
return (nvtxRangeId_t)0;
}
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
nvtxRangeStartW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr;
if(local!=0)
return (*local)(message);
else
#endif /*NVTX_DISABLE*/
return (nvtxRangeId_t)0;
}
NVTX_DECLSPEC void NVTX_API nvtxRangeEnd(nvtxRangeId_t id)
{
#ifndef NVTX_DISABLE
nvtxRangeEnd_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr;
if(local!=0)
(*local)(id);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC int NVTX_API nvtxRangePushEx(const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
nvtxRangePushEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr;
if(local!=0)
return (*local)(eventAttrib);
else
#endif /*NVTX_DISABLE*/
return (int)NVTX_NO_PUSH_POP_TRACKING;
}
NVTX_DECLSPEC int NVTX_API nvtxRangePushA(const char* message)
{
#ifndef NVTX_DISABLE
nvtxRangePushA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr;
if(local!=0)
return (*local)(message);
else
#endif /*NVTX_DISABLE*/
return (int)NVTX_NO_PUSH_POP_TRACKING;
}
NVTX_DECLSPEC int NVTX_API nvtxRangePushW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
nvtxRangePushW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr;
if(local!=0)
return (*local)(message);
else
#endif /*NVTX_DISABLE*/
return (int)NVTX_NO_PUSH_POP_TRACKING;
}
NVTX_DECLSPEC int NVTX_API nvtxRangePop(void)
{
#ifndef NVTX_DISABLE
nvtxRangePop_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr;
if(local!=0)
return (*local)();
else
#endif /*NVTX_DISABLE*/
return (int)NVTX_NO_PUSH_POP_TRACKING;
}
NVTX_DECLSPEC void NVTX_API nvtxNameCategoryA(uint32_t category, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCategoryA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr;
if(local!=0)
(*local)(category, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCategoryW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr;
if(local!=0)
(*local)(category, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadA(uint32_t threadId, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameOsThreadA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr;
if(local!=0)
(*local)(threadId, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadW(uint32_t threadId, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameOsThreadW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr;
if(local!=0)
(*local)(threadId, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
nvtxDomainMarkEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr;
if(local!=0)
(*local)(domain, eventAttrib);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxDomainRangeStartEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
nvtxDomainRangeStartEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr;
if(local!=0)
return (*local)(domain, eventAttrib);
else
#endif /*NVTX_DISABLE*/
return (nvtxRangeId_t)0;
}
NVTX_DECLSPEC void NVTX_API nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id)
{
#ifndef NVTX_DISABLE
nvtxDomainRangeEnd_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr;
if(local!=0)
(*local)(domain, id);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC int NVTX_API nvtxDomainRangePushEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
nvtxDomainRangePushEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr;
if(local!=0)
return (*local)(domain, eventAttrib);
else
#endif /*NVTX_DISABLE*/
return (int)NVTX_NO_PUSH_POP_TRACKING;
}
NVTX_DECLSPEC int NVTX_API nvtxDomainRangePop(nvtxDomainHandle_t domain)
{
#ifndef NVTX_DISABLE
nvtxDomainRangePop_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr;
if(local!=0)
return (*local)(domain);
else
#endif /*NVTX_DISABLE*/
return (int)NVTX_NO_PUSH_POP_TRACKING;
}
NVTX_DECLSPEC nvtxResourceHandle_t NVTX_API nvtxDomainResourceCreate(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs)
{
#ifndef NVTX_DISABLE
nvtxDomainResourceCreate_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr;
if(local!=0)
return (*local)(domain, attribs);
else
#endif /*NVTX_DISABLE*/
return (nvtxResourceHandle_t)0;
}
NVTX_DECLSPEC void NVTX_API nvtxDomainResourceDestroy(nvtxResourceHandle_t resource)
{
#ifndef NVTX_DISABLE
nvtxDomainResourceDestroy_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr;
if(local!=0)
(*local)(resource);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category, const char* name)
{
#ifndef NVTX_DISABLE
nvtxDomainNameCategoryA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr;
if(local!=0)
(*local)(domain, category, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxDomainNameCategoryW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr;
if(local!=0)
(*local)(domain, category, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringA(nvtxDomainHandle_t domain, const char* string)
{
#ifndef NVTX_DISABLE
nvtxDomainRegisterStringA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr;
if(local!=0)
return (*local)(domain, string);
else
#endif /*NVTX_DISABLE*/
return (nvtxStringHandle_t)0;
}
NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringW(nvtxDomainHandle_t domain, const wchar_t* string)
{
#ifndef NVTX_DISABLE
nvtxDomainRegisterStringW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr;
if(local!=0)
return (*local)(domain, string);
else
#endif /*NVTX_DISABLE*/
return (nvtxStringHandle_t)0;
}
NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateA(const char* message)
{
#ifndef NVTX_DISABLE
nvtxDomainCreateA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr;
if(local!=0)
return (*local)(message);
else
#endif /*NVTX_DISABLE*/
return (nvtxDomainHandle_t)0;
}
NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
nvtxDomainCreateW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr;
if(local!=0)
return (*local)(message);
else
#endif /*NVTX_DISABLE*/
return (nvtxDomainHandle_t)0;
}
NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain)
{
#ifndef NVTX_DISABLE
nvtxDomainDestroy_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr;
if(local!=0)
(*local)(domain);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved)
{
#ifndef NVTX_DISABLE
nvtxInitialize_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr;
if(local!=0)
(*local)(reserved);
#endif /*NVTX_DISABLE*/
}
@@ -0,0 +1,81 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD_CUDART
#error Never include this file directly -- it is automatically included by nvToolsExtCudaRt.h (except when NVTX_NO_IMPL is defined).
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name);
typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name);
typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaDeviceA_impl_fntype local = (nvtxNameCudaDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr;
if(local!=0)
(*local)(device, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaDeviceW_impl_fntype local = (nvtxNameCudaDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr;
if(local!=0)
(*local)(device, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaStreamA_impl_fntype local = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
if(local!=0)
(*local)(stream, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaStreamW_impl_fntype local = (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
if(local!=0)
(*local)(stream, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaEventA_impl_fntype local = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
if(local!=0)
(*local)(event, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaEventW_impl_fntype local = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
if(local!=0)
(*local)(event, name);
#endif /*NVTX_DISABLE*/
}
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
@@ -0,0 +1,102 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD_CUDA
#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name);
typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name);
typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name);
typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name);
typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuDeviceA_impl_fntype local = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
if(local!=0)
(*local)(device, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuDeviceW_impl_fntype local = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
if(local!=0)
(*local)(device, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuContextA_impl_fntype local = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
if(local!=0)
(*local)(context, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuContextW_impl_fntype local = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
if(local!=0)
(*local)(context, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuStreamA_impl_fntype local = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
if(local!=0)
(*local)(stream, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuStreamW_impl_fntype local = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
if(local!=0)
(*local)(stream, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuEventA_impl_fntype local = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
if(local!=0)
(*local)(event, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuEventW_impl_fntype local = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
if(local!=0)
(*local)(event, name);
#endif /*NVTX_DISABLE*/
}
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
@@ -0,0 +1,161 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD_OPENCL
#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
typedef void (NVTX_API * nvtxNameClDeviceA_impl_fntype)(cl_device_id device, const char* name);
typedef void (NVTX_API * nvtxNameClDeviceW_impl_fntype)(cl_device_id device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClContextA_impl_fntype)(cl_context context, const char* name);
typedef void (NVTX_API * nvtxNameClContextW_impl_fntype)(cl_context context, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClCommandQueueA_impl_fntype)(cl_command_queue command_queue, const char* name);
typedef void (NVTX_API * nvtxNameClCommandQueueW_impl_fntype)(cl_command_queue command_queue, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClMemObjectA_impl_fntype)(cl_mem memobj, const char* name);
typedef void (NVTX_API * nvtxNameClMemObjectW_impl_fntype)(cl_mem memobj, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClSamplerA_impl_fntype)(cl_sampler sampler, const char* name);
typedef void (NVTX_API * nvtxNameClSamplerW_impl_fntype)(cl_sampler sampler, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClProgramA_impl_fntype)(cl_program program, const char* name);
typedef void (NVTX_API * nvtxNameClProgramW_impl_fntype)(cl_program program, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClEventA_impl_fntype)(cl_event evnt, const char* name);
typedef void (NVTX_API * nvtxNameClEventW_impl_fntype)(cl_event evnt, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameClDeviceA_impl_fntype local = (nvtxNameClDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr;
if(local!=0)
(*local)(device, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameClDeviceW_impl_fntype local = (nvtxNameClDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr;
if(local!=0)
(*local)(device, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameClContextA_impl_fntype local = (nvtxNameClContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr;
if(local!=0)
(*local)(context, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameClContextW_impl_fntype local = (nvtxNameClContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr;
if(local!=0)
(*local)(context, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameClCommandQueueA_impl_fntype local = (nvtxNameClCommandQueueA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr;
if(local!=0)
(*local)(command_queue, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameClCommandQueueW_impl_fntype local = (nvtxNameClCommandQueueW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr;
if(local!=0)
(*local)(command_queue, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameClMemObjectA_impl_fntype local = (nvtxNameClMemObjectA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr;
if(local!=0)
(*local)(memobj, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameClMemObjectW_impl_fntype local = (nvtxNameClMemObjectW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr;
if(local!=0)
(*local)(memobj, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameClSamplerA_impl_fntype local = (nvtxNameClSamplerA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr;
if(local!=0)
(*local)(sampler, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameClSamplerW_impl_fntype local = (nvtxNameClSamplerW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr;
if(local!=0)
(*local)(sampler, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameClProgramA_impl_fntype local = (nvtxNameClProgramA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr;
if(local!=0)
(*local)(program, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameClProgramW_impl_fntype local = (nvtxNameClProgramW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr;
if(local!=0)
(*local)(program, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameClEventA_impl_fntype local = (nvtxNameClEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr;
if(local!=0)
(*local)(evnt, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameClEventW_impl_fntype local = (nvtxNameClEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr;
if(local!=0)
(*local)(evnt, name);
#endif /*NVTX_DISABLE*/
}
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
@@ -0,0 +1,83 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD_SYNC
#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle);
NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs)
{
#ifndef NVTX_DISABLE
nvtxDomainSyncUserCreate_impl_fntype local = (nvtxDomainSyncUserCreate_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
if(local!=0)
return (*local)(domain, attribs);
else
#endif /*NVTX_DISABLE*/
return (nvtxSyncUser_t)0;
}
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
nvtxDomainSyncUserDestroy_impl_fntype local = (nvtxDomainSyncUserDestroy_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
if(local!=0)
(*local)(handle);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
nvtxDomainSyncUserAcquireStart_impl_fntype local = (nvtxDomainSyncUserAcquireStart_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;
if(local!=0)
(*local)(handle);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
nvtxDomainSyncUserAcquireFailed_impl_fntype local = (nvtxDomainSyncUserAcquireFailed_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;
if(local!=0)
(*local)(handle);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
nvtxDomainSyncUserAcquireSuccess_impl_fntype local = (nvtxDomainSyncUserAcquireSuccess_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
if(local!=0)
(*local)(handle);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
nvtxDomainSyncUserReleasing_impl_fntype local = (nvtxDomainSyncUserReleasing_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;
if(local!=0)
(*local)(handle);
#endif /*NVTX_DISABLE*/
}
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
@@ -0,0 +1,312 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
#endif
/* ---- Platform-independent helper definitions and functions ---- */
/* Prefer macros over inline functions to reduce symbol resolution at link time */
#if defined(_WIN32)
#define NVTX_PATHCHAR wchar_t
#define NVTX_STR(x) L##x
#define NVTX_GETENV _wgetenv
#define NVTX_BUFSIZE MAX_PATH
#define NVTX_DLLHANDLE HMODULE
#define NVTX_DLLOPEN(x) LoadLibraryW(x)
#define NVTX_DLLFUNC GetProcAddress
#define NVTX_DLLCLOSE FreeLibrary
#define NVTX_YIELD() SwitchToThread()
#define NVTX_MEMBAR() MemoryBarrier()
#define NVTX_ATOMIC_WRITE_32(address, value) InterlockedExchange((volatile LONG*)address, value)
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) old = InterlockedCompareExchange((volatile LONG*)address, exchange, comparand)
#elif defined(__GNUC__)
#define NVTX_PATHCHAR char
#define NVTX_STR(x) x
#define NVTX_GETENV getenv
#define NVTX_BUFSIZE PATH_MAX
#define NVTX_DLLHANDLE void*
#define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
#define NVTX_DLLFUNC dlsym
#define NVTX_DLLCLOSE dlclose
#define NVTX_YIELD() sched_yield()
#define NVTX_MEMBAR() __sync_synchronize()
/* Ensure full memory barrier for atomics, to match Windows functions */
#define NVTX_ATOMIC_WRITE_32(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value)
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand)
#else
#error The library does not support your configuration!
#endif
/* Define this to 1 for platforms that where pre-injected libraries can be discovered. */
#if defined(_WIN32)
/* TODO */
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
#else
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
#endif
/* Define this to 1 for platforms that support environment variables */
/* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */
/* Try: #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */
#define NVTX_SUPPORT_ENV_VARS 1
/* Define this to 1 for platforms that support dynamic/shared libraries */
#define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1
/* Injection libraries implementing InitializeInjectionNvtx2 may be statically linked,
* and this will override any dynamic injection. Useful for platforms where dynamic
* injection is not available. Since weak symbols not explicitly marked extern are
* guaranteed to be initialized to zero if no definitions are found by the linker, the
* dynamic injection process proceeds normally if pfnInitializeInjectionNvtx2 is 0. */
#if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__)
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1
/* To statically inject an NVTX library, define InitializeInjectionNvtx2_fnptr as a normal
* symbol (not weak) pointing to the implementation of InitializeInjectionNvtx2 (which
* does not need to be named "InitializeInjectionNvtx2" as is necessary in a dynamic
* injection library. */
__attribute__((weak)) NvtxInitializeInjectionNvtxFunc_t InitializeInjectionNvtx2_fnptr;
#else
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0
#endif
/* This function tries to find or load an NVTX injection library and get the
* address of its InitializeInjection2 function. If such a function pointer
* is found, it is called, and passed the address of this NVTX instance's
* nvtxGetExportTable function, so the injection can attach to this instance.
* If the initialization fails for any reason, any dynamic library loaded will
* be freed, and all NVTX implementation functions will be set to no-ops. If
* initialization succeeds, NVTX functions not attached to the tool will be set
* to no-ops. This is implemented as one function instead of several small
* functions to minimize the number of weak symbols the linker must resolve.
* Order of search is:
* - Pre-injected library exporting InitializeInjectionNvtx2
* - Loadable library exporting InitializeInjectionNvtx2
* - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64)
* - On Android, libNvtxInjection??.so within the package (?? is 32 or 64)
* - Statically-linked injection library defining InitializeInjectionNvtx2_fnptr
*/
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void);
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void)
{
const char* const initFuncName = "InitializeInjectionNvtx2";
NvtxInitializeInjectionNvtxFunc_t init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)0;
NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0;
int entryPointStatus = 0;
#if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY
/* Use POSIX global symbol chain to query for init function from any module */
init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(0, initFuncName);
#endif
#if NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY
/* Try discovering dynamic injection library to load */
if (!init_fnptr)
{
#if NVTX_SUPPORT_ENV_VARS
/* If env var NVTX_INJECTION64_PATH is set, it should contain the path
* to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */
const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4)
? NVTX_STR("NVTX_INJECTION32_PATH")
: NVTX_STR("NVTX_INJECTION64_PATH");
#endif /* NVTX_SUPPORT_ENV_VARS */
NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE];
const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0;
/* Refer to this variable explicitly in case all references to it are #if'ed out */
(void)injectionLibraryPathBuf;
#if NVTX_SUPPORT_ENV_VARS
/* Disable the warning for getenv & _wgetenv -- this usage is safe because
* these functions are not called again before using the returned value. */
#if defined(_MSC_VER)
#pragma warning( push )
#pragma warning( disable : 4996 )
#endif
injectionLibraryPath = NVTX_GETENV(nvtxEnvVarName);
#if defined(_MSC_VER)
#pragma warning( pop )
#endif
#endif
#if defined(__ANDROID__)
if (!injectionLibraryPath)
{
const char *bits = (sizeof(void*) == 4) ? "32" : "64";
char cmdlineBuf[32];
char pkgName[PATH_MAX];
int count;
int pid;
FILE *fp;
size_t bytesRead;
size_t pos;
pid = (int)getpid();
count = snprintf(cmdlineBuf, sizeof(cmdlineBuf), "/proc/%d/cmdline", pid);
if (count <= 0 || count >= (int)sizeof(cmdlineBuf))
{
NVTX_ERR("Path buffer too small for: /proc/%d/cmdline\n", pid);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
fp = fopen(cmdlineBuf, "r");
if (!fp)
{
NVTX_ERR("File couldn't be opened: %s\n", cmdlineBuf);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
bytesRead = fread(pkgName, 1, sizeof(pkgName) - 1, fp);
fclose(fp);
if (bytesRead == 0)
{
NVTX_ERR("Package name couldn't be read from file: %s\n", cmdlineBuf);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
pkgName[bytesRead] = 0;
/* String can contain colon as a process separator. In this case the package name is before the colon. */
pos = 0;
while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0')
{
++pos;
}
pkgName[pos] = 0;
count = snprintf(injectionLibraryPathBuf, NVTX_BUFSIZE, "/data/data/%s/files/libNvtxInjection%s.so", pkgName, bits);
if (count <= 0 || count >= NVTX_BUFSIZE)
{
NVTX_ERR("Path buffer too small for: /data/data/%s/files/libNvtxInjection%s.so\n", pkgName, bits);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
/* On Android, verify path is accessible due to aggressive file access restrictions. */
/* For dlopen, if the filename contains a leading slash, then it is interpreted as a */
/* relative or absolute pathname; otherwise it will follow the rules in ld.so. */
if (injectionLibraryPathBuf[0] == '/')
{
#if (__ANDROID_API__ < 21)
int access_err = access(injectionLibraryPathBuf, F_OK | R_OK);
#else
int access_err = faccessat(AT_FDCWD, injectionLibraryPathBuf, F_OK | R_OK, 0);
#endif
if (access_err != 0)
{
NVTX_ERR("Injection library path wasn't accessible [code=%s] [path=%s]\n", strerror(errno), injectionLibraryPathBuf);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
}
injectionLibraryPath = injectionLibraryPathBuf;
}
#endif
/* At this point, injectionLibraryPath is specified if a dynamic
* injection library was specified by a tool. */
if (injectionLibraryPath)
{
/* Load the injection library */
injectionLibraryHandle = NVTX_DLLOPEN(injectionLibraryPath);
if (!injectionLibraryHandle)
{
NVTX_ERR("Failed to load injection library\n");
return NVTX_ERR_INIT_LOAD_LIBRARY;
}
else
{
/* Attempt to get the injection library's entry-point */
init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName);
if (!init_fnptr)
{
NVTX_DLLCLOSE(injectionLibraryHandle);
NVTX_ERR("Failed to get address of function InitializeInjectionNvtx2 from injection library\n");
return NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT;
}
}
}
}
#endif
#if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY
if (!init_fnptr)
{
/* Check weakly-defined function pointer. A statically-linked injection can define this as
* a normal symbol and it will take precedence over a dynamic injection. */
if (InitializeInjectionNvtx2_fnptr)
{
init_fnptr = InitializeInjectionNvtx2_fnptr;
}
}
#endif
/* At this point, if init_fnptr is not set, then no tool has specified
* an NVTX injection library -- return non-success result so all NVTX
* API functions will be set to no-ops. */
if (!init_fnptr)
{
return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE;
}
/* Invoke injection library's initialization function. If it returns
* 0 (failure) and a dynamic injection was loaded, unload it. */
entryPointStatus = init_fnptr(NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable));
if (entryPointStatus == 0)
{
NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n");
if (injectionLibraryHandle)
{
NVTX_DLLCLOSE(injectionLibraryHandle);
}
return NVTX_ERR_INIT_FAILED_LIBRARY_ENTRY_POINT;
}
return NVTX_SUCCESS;
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void)
{
unsigned int old;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState == NVTX_INIT_STATE_COMPLETE)
{
return;
}
NVTX_ATOMIC_CAS_32(
old,
&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState,
NVTX_INIT_STATE_STARTED,
NVTX_INIT_STATE_FRESH);
if (old == NVTX_INIT_STATE_FRESH)
{
int result;
int forceAllToNoops;
/* Load & initialize injection library -- it will assign the function pointers */
result = NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)();
/* Set all pointers not assigned by the injection to null */
forceAllToNoops = result != NVTX_SUCCESS; /* Set all to null if injection init failed */
NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(forceAllToNoops);
/* Signal that initialization has finished, so now the assigned function pointers will be used */
NVTX_ATOMIC_WRITE_32(
&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState,
NVTX_INIT_STATE_COMPLETE);
}
else /* Spin-wait until initialization has finished */
{
NVTX_MEMBAR();
while (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState != NVTX_INIT_STATE_COMPLETE)
{
NVTX_YIELD();
NVTX_MEMBAR();
}
}
}
@@ -0,0 +1,81 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
#endif
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxResourceHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxSyncUser_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle);
@@ -0,0 +1,573 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
#endif
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxMarkEx(eventAttrib);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxMarkA(message);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxMarkW(message);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxRangeStartEx(eventAttrib);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxRangeStartA(message);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxRangeStartW(message);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxRangeEnd(id);
}
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxRangePushEx(eventAttrib);
}
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxRangePushA(message);
}
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxRangePushW(message);
}
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxRangePop();
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxNameCategoryA(category, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxNameCategoryW(category, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxNameOsThreadA(threadId, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxNameOsThreadW(threadId, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxDomainMarkEx(domain, eventAttrib);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxDomainRangeStartEx(domain, eventAttrib);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxDomainRangeEnd(domain, id);
}
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxDomainRangePushEx(domain, eventAttrib);
}
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxDomainRangePop(domain);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxResourceHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxDomainResourceCreate(domain, attribs);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxDomainResourceDestroy(resource);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxDomainNameCategoryA(domain, category, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxDomainNameCategoryW(domain, category, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxDomainRegisterStringA(domain, string);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxDomainRegisterStringW(domain, string);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxDomainCreateA(message);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxDomainCreateW(message);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxDomainDestroy(domain);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxInitialize(reserved);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name){
nvtxNameCuDeviceA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
if (local)
local(device, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name){
nvtxNameCuDeviceW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
if (local)
local(device, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name){
nvtxNameCuContextA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
if (local)
local(context, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name){
nvtxNameCuContextW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
if (local)
local(context, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name){
nvtxNameCuStreamA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
if (local)
local(stream, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name){
nvtxNameCuStreamW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
if (local)
local(stream, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name){
nvtxNameCuEventA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
if (local)
local(event, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name){
nvtxNameCuEventW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
if (local)
local(event, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name){
nvtxNameCudaDeviceA_impl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr;
if (local)
local(device, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name){
nvtxNameCudaDeviceW_impl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr;
if (local)
local(device, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name){
nvtxNameCudaStreamA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
if (local)
local(stream, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name){
nvtxNameCudaStreamW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
if (local)
local(stream, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name){
nvtxNameCudaEventA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
if (local)
local(event, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name){
nvtxNameCudaEventW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
if (local)
local(event, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name){
nvtxNameClDeviceA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr;
if (local)
local(device, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name){
nvtxNameClDeviceW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr;
if (local)
local(device, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name){
nvtxNameClContextA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr;
if (local)
local(context, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name){
nvtxNameClContextW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr;
if (local)
local(context, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name){
nvtxNameClCommandQueueA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr;
if (local)
local(command_queue, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name){
nvtxNameClCommandQueueW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr;
if (local)
local(command_queue, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name){
nvtxNameClMemObjectA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr;
if (local)
local(memobj, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name){
nvtxNameClMemObjectW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr;
if (local)
local(memobj, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name){
nvtxNameClSamplerA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr;
if (local)
local(sampler, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name){
nvtxNameClSamplerW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr;
if (local)
local(sampler, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name){
nvtxNameClProgramA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr;
if (local)
local(program, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name){
nvtxNameClProgramW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr;
if (local)
local(program, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name){
nvtxNameClEventA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr;
if (local)
local(evnt, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name){
nvtxNameClEventW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr;
if (local)
local(evnt, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxSyncUser_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs){
nvtxDomainSyncUserCreate_impl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
if (local) {
return local(domain, attribs);
}
return (nvtxSyncUser_t)0;
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle){
nvtxDomainSyncUserDestroy_impl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
if (local)
local(handle);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle){
nvtxDomainSyncUserAcquireStart_impl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;
if (local)
local(handle);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle){
nvtxDomainSyncUserAcquireFailed_impl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;
if (local)
local(handle);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle){
nvtxDomainSyncUserAcquireSuccess_impl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
if (local)
local(handle);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle){
nvtxDomainSyncUserReleasing_impl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;
if (local)
local(handle);
}
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops);
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops)
{
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr = NULL;
}
@@ -0,0 +1,83 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef __NVTX_LINKONCE_H__
#define __NVTX_LINKONCE_H__
/* This header defines macros to permit making definitions of global variables
* and functions in C/C++ header files which may be included multiple times in
* a translation unit or linkage unit. It allows authoring header-only libraries
* which can be used by multiple other header-only libraries (either as the same
* copy or multiple copies), and does not require any build changes, such as
* adding another .c file, linking a static library, or deploying a dynamic
* library. Globals defined with these macros have the property that they have
* the same address, pointing to a single instance, for the entire linkage unit.
* It is expected but not guaranteed that each linkage unit will have a separate
* instance.
*
* In some situations it is desirable to declare a variable without initializing
* it, refer to it in code or other variables' initializers, and then initialize
* it later. Similarly, functions can be prototyped, have their address taken,
* and then have their body defined later. In such cases, use the FWDDECL macros
* when forward-declaring LINKONCE global variables without initializers and
* function prototypes, and then use the DEFINE macros when later defining them.
* Although in many cases the FWDDECL macro is equivalent to the DEFINE macro,
* following this pattern makes code maximally portable.
*/
#if defined(__MINGW32__) /* MinGW */
#define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0.")))
#if defined(__cplusplus)
#define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany)
#define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline NVTX_LINKONCE_WEAK
#else
#define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany)
#define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
#endif
#elif defined(_MSC_VER) /* MSVC */
#if defined(__cplusplus)
#define NVTX_LINKONCE_DEFINE_GLOBAL extern "C" __declspec(selectany)
#define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline
#else
#define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany)
#define NVTX_LINKONCE_DEFINE_FUNCTION __inline
#endif
#elif defined(__CYGWIN__) && defined(__clang__) /* Clang on Cygwin */
#define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0.")))
#if defined(__cplusplus)
#define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK
#define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_WEAK
#else
#define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK
#define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
#endif
#elif defined(__CYGWIN__) /* Assume GCC or compatible */
#define NVTX_LINKONCE_WEAK __attribute__((weak))
#if defined(__cplusplus)
#define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany)
#define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline
#else
#define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK
#define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK
#endif
#else /* All others: Assume GCC, clang, or compatible */
#define NVTX_LINKONCE_WEAK __attribute__((weak))
#define NVTX_LINKONCE_HIDDEN __attribute__((visibility("hidden")))
#if defined(__cplusplus)
#define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
#define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_HIDDEN inline
#else
#define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
#define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK
#endif
#endif
#define NVTX_LINKONCE_FWDDECL_GLOBAL NVTX_LINKONCE_DEFINE_GLOBAL extern
#define NVTX_LINKONCE_FWDDECL_FUNCTION NVTX_LINKONCE_DEFINE_FUNCTION
#endif /* __NVTX_LINKONCE_H__ */
@@ -0,0 +1,304 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
/* This header defines types which are used by the internal implementation
* of NVTX and callback subscribers. API clients do not use these types,
* so they are defined here instead of in nvToolsExt.h to clarify they are
* not part of the NVTX client API. */
#ifndef NVTX_IMPL_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExt.h.
#endif
/* ------ Dependency-free types binary-compatible with real types ------- */
/* In order to avoid having the NVTX core API headers depend on non-NVTX
* headers like cuda.h, NVTX defines binary-compatible types to use for
* safely making the initialization versions of all NVTX functions without
* needing to have definitions for the real types. */
typedef int nvtx_CUdevice;
typedef void* nvtx_CUcontext;
typedef void* nvtx_CUstream;
typedef void* nvtx_CUevent;
typedef void* nvtx_cudaStream_t;
typedef void* nvtx_cudaEvent_t;
typedef void* nvtx_cl_platform_id;
typedef void* nvtx_cl_device_id;
typedef void* nvtx_cl_context;
typedef void* nvtx_cl_command_queue;
typedef void* nvtx_cl_mem;
typedef void* nvtx_cl_program;
typedef void* nvtx_cl_kernel;
typedef void* nvtx_cl_event;
typedef void* nvtx_cl_sampler;
typedef struct nvtxSyncUser* nvtxSyncUser_t;
struct nvtxSyncUserAttributes_v0;
typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
/* --------- Types for function pointers (with fake API types) ---------- */
typedef void (NVTX_API * nvtxMarkEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib);
typedef void (NVTX_API * nvtxMarkA_impl_fntype)(const char* message);
typedef void (NVTX_API * nvtxMarkW_impl_fntype)(const wchar_t* message);
typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib);
typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartA_impl_fntype)(const char* message);
typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartW_impl_fntype)(const wchar_t* message);
typedef void (NVTX_API * nvtxRangeEnd_impl_fntype)(nvtxRangeId_t id);
typedef int (NVTX_API * nvtxRangePushEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib);
typedef int (NVTX_API * nvtxRangePushA_impl_fntype)(const char* message);
typedef int (NVTX_API * nvtxRangePushW_impl_fntype)(const wchar_t* message);
typedef int (NVTX_API * nvtxRangePop_impl_fntype)(void);
typedef void (NVTX_API * nvtxNameCategoryA_impl_fntype)(uint32_t category, const char* name);
typedef void (NVTX_API * nvtxNameCategoryW_impl_fntype)(uint32_t category, const wchar_t* name);
typedef void (NVTX_API * nvtxNameOsThreadA_impl_fntype)(uint32_t threadId, const char* name);
typedef void (NVTX_API * nvtxNameOsThreadW_impl_fntype)(uint32_t threadId, const wchar_t* name);
/* Real impl types are defined in nvtxImplCuda_v3.h, where CUDA headers are included */
typedef void (NVTX_API * nvtxNameCuDeviceA_fakeimpl_fntype)(nvtx_CUdevice device, const char* name);
typedef void (NVTX_API * nvtxNameCuDeviceW_fakeimpl_fntype)(nvtx_CUdevice device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuContextA_fakeimpl_fntype)(nvtx_CUcontext context, const char* name);
typedef void (NVTX_API * nvtxNameCuContextW_fakeimpl_fntype)(nvtx_CUcontext context, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuStreamA_fakeimpl_fntype)(nvtx_CUstream stream, const char* name);
typedef void (NVTX_API * nvtxNameCuStreamW_fakeimpl_fntype)(nvtx_CUstream stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuEventA_fakeimpl_fntype)(nvtx_CUevent event, const char* name);
typedef void (NVTX_API * nvtxNameCuEventW_fakeimpl_fntype)(nvtx_CUevent event, const wchar_t* name);
/* Real impl types are defined in nvtxImplOpenCL_v3.h, where OPENCL headers are included */
typedef void (NVTX_API * nvtxNameClDeviceA_fakeimpl_fntype)(nvtx_cl_device_id device, const char* name);
typedef void (NVTX_API * nvtxNameClDeviceW_fakeimpl_fntype)(nvtx_cl_device_id device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClContextA_fakeimpl_fntype)(nvtx_cl_context context, const char* name);
typedef void (NVTX_API * nvtxNameClContextW_fakeimpl_fntype)(nvtx_cl_context context, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClCommandQueueA_fakeimpl_fntype)(nvtx_cl_command_queue command_queue, const char* name);
typedef void (NVTX_API * nvtxNameClCommandQueueW_fakeimpl_fntype)(nvtx_cl_command_queue command_queue, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClMemObjectA_fakeimpl_fntype)(nvtx_cl_mem memobj, const char* name);
typedef void (NVTX_API * nvtxNameClMemObjectW_fakeimpl_fntype)(nvtx_cl_mem memobj, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClSamplerA_fakeimpl_fntype)(nvtx_cl_sampler sampler, const char* name);
typedef void (NVTX_API * nvtxNameClSamplerW_fakeimpl_fntype)(nvtx_cl_sampler sampler, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClProgramA_fakeimpl_fntype)(nvtx_cl_program program, const char* name);
typedef void (NVTX_API * nvtxNameClProgramW_fakeimpl_fntype)(nvtx_cl_program program, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClEventA_fakeimpl_fntype)(nvtx_cl_event evnt, const char* name);
typedef void (NVTX_API * nvtxNameClEventW_fakeimpl_fntype)(nvtx_cl_event evnt, const wchar_t* name);
/* Real impl types are defined in nvtxImplCudaRt_v3.h, where CUDART headers are included */
typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaStreamA_fakeimpl_fntype)(nvtx_cudaStream_t stream, const char* name);
typedef void (NVTX_API * nvtxNameCudaStreamW_fakeimpl_fntype)(nvtx_cudaStream_t stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaEventA_fakeimpl_fntype)(nvtx_cudaEvent_t event, const char* name);
typedef void (NVTX_API * nvtxNameCudaEventW_fakeimpl_fntype)(nvtx_cudaEvent_t event, const wchar_t* name);
typedef void (NVTX_API * nvtxDomainMarkEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
typedef nvtxRangeId_t (NVTX_API * nvtxDomainRangeStartEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
typedef void (NVTX_API * nvtxDomainRangeEnd_impl_fntype)(nvtxDomainHandle_t domain, nvtxRangeId_t id);
typedef int (NVTX_API * nvtxDomainRangePushEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
typedef int (NVTX_API * nvtxDomainRangePop_impl_fntype)(nvtxDomainHandle_t domain);
typedef nvtxResourceHandle_t (NVTX_API * nvtxDomainResourceCreate_impl_fntype)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs);
typedef void (NVTX_API * nvtxDomainResourceDestroy_impl_fntype)(nvtxResourceHandle_t resource);
typedef void (NVTX_API * nvtxDomainNameCategoryA_impl_fntype)(nvtxDomainHandle_t domain, uint32_t category, const char* name);
typedef void (NVTX_API * nvtxDomainNameCategoryW_impl_fntype)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name);
typedef nvtxStringHandle_t (NVTX_API * nvtxDomainRegisterStringA_impl_fntype)(nvtxDomainHandle_t domain, const char* string);
typedef nvtxStringHandle_t (NVTX_API * nvtxDomainRegisterStringW_impl_fntype)(nvtxDomainHandle_t domain, const wchar_t* string);
typedef nvtxDomainHandle_t (NVTX_API * nvtxDomainCreateA_impl_fntype)(const char* message);
typedef nvtxDomainHandle_t (NVTX_API * nvtxDomainCreateW_impl_fntype)(const wchar_t* message);
typedef void (NVTX_API * nvtxDomainDestroy_impl_fntype)(nvtxDomainHandle_t domain);
typedef void (NVTX_API * nvtxInitialize_impl_fntype)(const void* reserved);
typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle);
/* ---------------- Types for callback subscription --------------------- */
typedef const void *(NVTX_API * NvtxGetExportTableFunc_t)(uint32_t exportTableId);
typedef int (NVTX_API * NvtxInitializeInjectionNvtxFunc_t)(NvtxGetExportTableFunc_t exportTable);
typedef enum NvtxCallbackModule
{
NVTX_CB_MODULE_INVALID = 0,
NVTX_CB_MODULE_CORE = 1,
NVTX_CB_MODULE_CUDA = 2,
NVTX_CB_MODULE_OPENCL = 3,
NVTX_CB_MODULE_CUDART = 4,
NVTX_CB_MODULE_CORE2 = 5,
NVTX_CB_MODULE_SYNC = 6,
/* --- New constants must only be added directly above this line --- */
NVTX_CB_MODULE_SIZE,
NVTX_CB_MODULE_FORCE_INT = 0x7fffffff
} NvtxCallbackModule;
typedef enum NvtxCallbackIdCore
{
NVTX_CBID_CORE_INVALID = 0,
NVTX_CBID_CORE_MarkEx = 1,
NVTX_CBID_CORE_MarkA = 2,
NVTX_CBID_CORE_MarkW = 3,
NVTX_CBID_CORE_RangeStartEx = 4,
NVTX_CBID_CORE_RangeStartA = 5,
NVTX_CBID_CORE_RangeStartW = 6,
NVTX_CBID_CORE_RangeEnd = 7,
NVTX_CBID_CORE_RangePushEx = 8,
NVTX_CBID_CORE_RangePushA = 9,
NVTX_CBID_CORE_RangePushW = 10,
NVTX_CBID_CORE_RangePop = 11,
NVTX_CBID_CORE_NameCategoryA = 12,
NVTX_CBID_CORE_NameCategoryW = 13,
NVTX_CBID_CORE_NameOsThreadA = 14,
NVTX_CBID_CORE_NameOsThreadW = 15,
/* --- New constants must only be added directly above this line --- */
NVTX_CBID_CORE_SIZE,
NVTX_CBID_CORE_FORCE_INT = 0x7fffffff
} NvtxCallbackIdCore;
typedef enum NvtxCallbackIdCore2
{
NVTX_CBID_CORE2_INVALID = 0,
NVTX_CBID_CORE2_DomainMarkEx = 1,
NVTX_CBID_CORE2_DomainRangeStartEx = 2,
NVTX_CBID_CORE2_DomainRangeEnd = 3,
NVTX_CBID_CORE2_DomainRangePushEx = 4,
NVTX_CBID_CORE2_DomainRangePop = 5,
NVTX_CBID_CORE2_DomainResourceCreate = 6,
NVTX_CBID_CORE2_DomainResourceDestroy = 7,
NVTX_CBID_CORE2_DomainNameCategoryA = 8,
NVTX_CBID_CORE2_DomainNameCategoryW = 9,
NVTX_CBID_CORE2_DomainRegisterStringA = 10,
NVTX_CBID_CORE2_DomainRegisterStringW = 11,
NVTX_CBID_CORE2_DomainCreateA = 12,
NVTX_CBID_CORE2_DomainCreateW = 13,
NVTX_CBID_CORE2_DomainDestroy = 14,
NVTX_CBID_CORE2_Initialize = 15,
/* --- New constants must only be added directly above this line --- */
NVTX_CBID_CORE2_SIZE,
NVTX_CBID_CORE2_FORCE_INT = 0x7fffffff
} NvtxCallbackIdCore2;
typedef enum NvtxCallbackIdCuda
{
NVTX_CBID_CUDA_INVALID = 0,
NVTX_CBID_CUDA_NameCuDeviceA = 1,
NVTX_CBID_CUDA_NameCuDeviceW = 2,
NVTX_CBID_CUDA_NameCuContextA = 3,
NVTX_CBID_CUDA_NameCuContextW = 4,
NVTX_CBID_CUDA_NameCuStreamA = 5,
NVTX_CBID_CUDA_NameCuStreamW = 6,
NVTX_CBID_CUDA_NameCuEventA = 7,
NVTX_CBID_CUDA_NameCuEventW = 8,
/* --- New constants must only be added directly above this line --- */
NVTX_CBID_CUDA_SIZE,
NVTX_CBID_CUDA_FORCE_INT = 0x7fffffff
} NvtxCallbackIdCuda;
typedef enum NvtxCallbackIdCudaRt
{
NVTX_CBID_CUDART_INVALID = 0,
NVTX_CBID_CUDART_NameCudaDeviceA = 1,
NVTX_CBID_CUDART_NameCudaDeviceW = 2,
NVTX_CBID_CUDART_NameCudaStreamA = 3,
NVTX_CBID_CUDART_NameCudaStreamW = 4,
NVTX_CBID_CUDART_NameCudaEventA = 5,
NVTX_CBID_CUDART_NameCudaEventW = 6,
/* --- New constants must only be added directly above this line --- */
NVTX_CBID_CUDART_SIZE,
NVTX_CBID_CUDART_FORCE_INT = 0x7fffffff
} NvtxCallbackIdCudaRt;
typedef enum NvtxCallbackIdOpenCL
{
NVTX_CBID_OPENCL_INVALID = 0,
NVTX_CBID_OPENCL_NameClDeviceA = 1,
NVTX_CBID_OPENCL_NameClDeviceW = 2,
NVTX_CBID_OPENCL_NameClContextA = 3,
NVTX_CBID_OPENCL_NameClContextW = 4,
NVTX_CBID_OPENCL_NameClCommandQueueA = 5,
NVTX_CBID_OPENCL_NameClCommandQueueW = 6,
NVTX_CBID_OPENCL_NameClMemObjectA = 7,
NVTX_CBID_OPENCL_NameClMemObjectW = 8,
NVTX_CBID_OPENCL_NameClSamplerA = 9,
NVTX_CBID_OPENCL_NameClSamplerW = 10,
NVTX_CBID_OPENCL_NameClProgramA = 11,
NVTX_CBID_OPENCL_NameClProgramW = 12,
NVTX_CBID_OPENCL_NameClEventA = 13,
NVTX_CBID_OPENCL_NameClEventW = 14,
/* --- New constants must only be added directly above this line --- */
NVTX_CBID_OPENCL_SIZE,
NVTX_CBID_OPENCL_FORCE_INT = 0x7fffffff
} NvtxCallbackIdOpenCL;
typedef enum NvtxCallbackIdSync
{
NVTX_CBID_SYNC_INVALID = 0,
NVTX_CBID_SYNC_DomainSyncUserCreate = 1,
NVTX_CBID_SYNC_DomainSyncUserDestroy = 2,
NVTX_CBID_SYNC_DomainSyncUserAcquireStart = 3,
NVTX_CBID_SYNC_DomainSyncUserAcquireFailed = 4,
NVTX_CBID_SYNC_DomainSyncUserAcquireSuccess = 5,
NVTX_CBID_SYNC_DomainSyncUserReleasing = 6,
/* --- New constants must only be added directly above this line --- */
NVTX_CBID_SYNC_SIZE,
NVTX_CBID_SYNC_FORCE_INT = 0x7fffffff
} NvtxCallbackIdSync;
/* IDs for NVTX Export Tables */
typedef enum NvtxExportTableID
{
NVTX_ETID_INVALID = 0,
NVTX_ETID_CALLBACKS = 1,
NVTX_ETID_RESERVED0 = 2,
NVTX_ETID_VERSIONINFO = 3,
/* --- New constants must only be added directly above this line --- */
NVTX_ETID_SIZE,
NVTX_ETID_FORCE_INT = 0x7fffffff
} NvtxExportTableID;
typedef void (* NvtxFunctionPointer)(void); /* generic uncallable function pointer, must be casted to appropriate function type */
typedef NvtxFunctionPointer** NvtxFunctionTable; /* double pointer because array(1) of pointers(2) to function pointers */
typedef struct NvtxExportTableCallbacks
{
size_t struct_size;
/* returns an array of pointer to function pointers*/
int (NVTX_API *GetModuleFunctionTable)(
NvtxCallbackModule module,
NvtxFunctionTable* out_table,
unsigned int* out_size);
} NvtxExportTableCallbacks;
typedef struct NvtxExportTableVersionInfo
{
/* sizeof(NvtxExportTableVersionInfo) */
size_t struct_size;
/* The API version comes from the NVTX library linked to the app. The
* injection library is can use this info to make some assumptions */
uint32_t version;
/* Reserved for alignment, do not use */
uint32_t reserved0;
/* This must be set by tools when attaching to provide applications
* the ability to, in emergency situations, detect problematic tools
* versions and modify the NVTX source to prevent attaching anything
* that causes trouble in the app. Currently, this value is ignored. */
void (NVTX_API *SetInjectionNvtxVersion)(
uint32_t version);
} NvtxExportTableVersionInfo;
+15
Bestand weergeven
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_NVTX_STUB_H_
#define NCCL_NVTX_STUB_H_
struct nccl_domain{static constexpr char const* name{"NCCL"};};
#define NVTX3_FUNC_RANGE_IN(domain)
#define nvtxNameOsThreadA(syscall, thread)
#endif
+25 -14
Bestand weergeven
@@ -10,23 +10,34 @@
#define NCCL_P2P_H_
struct ncclP2Pinfo {
const void* sendbuff;
void* recvbuff;
ssize_t sendbytes;
ssize_t recvbytes;
};
struct ncclP2PConnect {
int nrecv[MAXCHANNELS];
int nsend[MAXCHANNELS];
int* recv;
int* send;
void* buff;
ssize_t nbytes;
struct ncclP2Pinfo* next;
};
struct ncclP2Plist {
struct ncclP2Pinfo *peerlist;
int count;
struct ncclP2PConnect connect;
struct ncclP2Pinfo *head;
struct ncclP2Pinfo *tail;
};
static ncclResult_t enqueueP2pInfo(ncclP2Plist* p2p, void* buff, ssize_t nBytes) {
if (p2p == NULL) return ncclInternalError;
struct ncclP2Pinfo* next;
NCCLCHECK(ncclCalloc(&next, 1));
next->buff = buff;
next->nbytes = nBytes;
if (p2p->tail != NULL) p2p->tail->next = next;
p2p->tail = next;
if (p2p->head == NULL) p2p->head = next;
return ncclSuccess;
}
static ncclResult_t dequeueP2pInfo(ncclP2Plist* p2p) {
if (p2p == NULL) return ncclInternalError;
struct ncclP2Pinfo* temp = p2p->head;
p2p->head = p2p->head->next;
if (p2p->tail == temp) p2p->tail = NULL;
free(temp);
return ncclSuccess;
}
#endif
+3 -2
Bestand weergeven
@@ -31,10 +31,11 @@ static void setEnvFile(const char* fileName) {
int s=0; // Env Var Size
while (line[s] != '\0' && line[s] != '=') s++;
if (line[s] == '\0') continue;
strncpy(envVar, line, std::min(1024,s));
strncpy(envVar, line, std::min(1023,s));
envVar[s] = '\0';
s++;
strncpy(envValue, line+s, 1024);
strncpy(envValue, line+s, 1023);
envValue[1023]='\0';
setenv(envVar, envValue, 0);
}
if (line) free(line);
+30 -5
Bestand weergeven
@@ -18,18 +18,23 @@ struct ncclProxyArgs {
proxyProgressFunc_t progress;
struct ncclChannel* channel;
struct ncclConnector* connector;
size_t sendbytes;
size_t recvbytes;
int sliceSteps;
int chunkSteps;
int nsteps;
uint64_t opCount;
int protocol;
int segment; // Only for profiling
ncclDataType_t dtype;
ncclRedOp_t redOp;
int state; // add component before this line -- it is left out during initialization
// Internal state
uint64_t head;
uint64_t tail;
uint64_t posted;
uint64_t received; // Only used by recv proxy to wait for flush.
uint64_t transmitted;
uint64_t done;
uint64_t end;
void* requests[NCCL_STEPS];
int idle;
@@ -38,14 +43,30 @@ struct ncclProxyArgs {
pthread_mutex_t mutex;
struct ncclProxyArgs* next;
struct ncclProxyArgs* nextPeer;
struct ncclProxyArgs* nextGroup;
struct ncclProxyArgs** proxyAppendPtr;
};
struct ncclProxySharedBuffers {
int nslots;
int slotSize;
char* cudaBuff[2*MAXCHANNELS];
int* cudaUsed[2*MAXCHANNELS];
char* hostBuff[2*MAXCHANNELS];
int* hostUsed[2*MAXCHANNELS];
struct ncclProxyArgs* proxyAppend[2*MAXCHANNELS]; // Separate send and recv
};
struct ncclProxyPool;
struct ncclProxyState {
pthread_cond_t cond;
pthread_mutex_t mutex;
pthread_mutex_t opsMutex;
pthread_mutex_t poolMutex;
bool stop;
struct ncclProxySharedBuffers* sharedBuffs;
struct ncclProxyArgs* ops;
struct ncclProxyArgs* nextOps;
struct ncclProxyArgs* nextOpsEnd;
struct ncclProxyArgs* pool;
struct ncclProxyPool* pools;
};
@@ -59,12 +80,16 @@ enum proxyMode {
};
ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks);
ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel);
ncclResult_t ncclProxySaveA2a(struct ncclProxyArgs* args, struct ncclInfo* info);
ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment);
ncclResult_t ncclProxyStart(struct ncclComm* comm);
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr);
ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr);
ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr);
ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm);
#include <unistd.h>
// Spin wait until func evaluates to true
+12 -9
Bestand weergeven
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -21,6 +21,7 @@
#define SLEEP_INT 1000 // connection retry sleep interval in usec
#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
/* Common socket address storage structure for IPv4/IPv6 */
union socketAddress {
@@ -64,7 +65,7 @@ static inline int envSocketFamily(void) {
static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
#ifdef ENABLE_TRACE
char line[1024];
char line[SOCKET_NAME_MAXLEN+1];
#endif
struct netIf userIfs[MAX_IFS];
bool searchNot = prefixList && prefixList[0] == '^';
@@ -167,9 +168,9 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress* remote) {
static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
#ifdef ENABLE_TRACE
char line[1024];
char line[SOCKET_NAME_MAXLEN+1];
#endif
char line_a[1024];
char line_a[SOCKET_NAME_MAXLEN+1];
int found = 0;
struct ifaddrs *interfaces, *interface;
getifaddrs(&interfaces);
@@ -355,7 +356,7 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
SYSCHECK(getsockname(sockfd, &localAddr->sa, &size), "getsockname");
#ifdef ENABLE_TRACE
char line[1024];
char line[SOCKET_NAME_MAXLEN+1];
TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
#endif
@@ -370,6 +371,10 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
/* IPv4/IPv6 support */
int family = remoteAddr->sa.sa_family;
if (family != AF_INET && family != AF_INET6) {
WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)\n", family, AF_INET, AF_INET6);
return ncclInternalError;
}
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
/* Connect to a hostname / port */
@@ -386,10 +391,8 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/
char line[1024];
#ifdef ENABLE_TRACE
char line[SOCKET_NAME_MAXLEN+1];
TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
#endif
int ret;
int timedout_retries = 0;
@@ -450,7 +453,7 @@ static ncclResult_t socketSend(int fd, void* ptr, int size) {
return ncclSuccess;
}
static ncclResult_t socketReceive(int fd, void* ptr, int size) {
static ncclResult_t socketRecv(int fd, void* ptr, int size) {
int offset = 0;
NCCLCHECK(socketWait(NCCL_SOCKET_RECV, fd, ptr, size, &offset));
return ncclSuccess;
+4 -3
Bestand weergeven
@@ -41,8 +41,8 @@ struct ncclConnect {
};
struct ncclTransportComm {
ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId);
ncclResult_t (*connect)(struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId);
ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
ncclResult_t (*free)(void*);
ncclResult_t (*proxy)(struct ncclProxyArgs*);
};
@@ -54,6 +54,7 @@ struct ncclTransport {
struct ncclTransportComm recv;
};
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph);
#endif
+3 -3
Bestand weergeven
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,7 +7,7 @@
#ifndef NCCL_TREES_H_
#define NCCL_TREES_H_
ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0);
ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* u1, int* d1_0, int* d1_1);
ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType);
ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1);
#endif