Merge pull request #319 from wenkaidu/2.8.3

Update to 2.8.4
Cette révision appartient à :
Wenkai Du
2021-02-22 10:17:38 -08:00
révisé par GitHub
révision a058d5639e
164 fichiers modifiés avec 13249 ajouts et 5744 suppressions
+2 -1
Voir le fichier
@@ -21,7 +21,8 @@ def runTestCommand (platform, project)
def command = """#!/usr/bin/env bash
set -x
cd ${project.paths.project_build_prefix}/build/release/test
${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_output=xml --gtest_color=yes
${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter="BroadcastCorrectnessSweep*:*float32*" --gtest_output=xml --gtest_color=yes
${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTestsMultiProcess --gtest_filter="BroadcastMultiProcessCorrectnessSweep*:*float32*" --gtest_output=xml --gtest_color=yes
"""
platform.runCommand(this, command)
+68
Voir le fichier
@@ -0,0 +1,68 @@
#!/usr/bin/env groovy
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
@Library('rocJenkins@pong') _
// This is file for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
import com.amd.project.*
import com.amd.docker.*
import java.nio.file.Path
def runCompileCommand(platform, project, jobName, boolean debug=false)
{
project.paths.construct_build_prefix()
def command = """#!/usr/bin/env bash
set -x
${project.paths.project_build_prefix}/docs/run_doc.sh
"""
try
{
platform.runCommand(this, command)
}
catch(e)
{
throw e
}
publishHTML([allowMissing: false,
alwaysLinkToLastBuild: false,
keepAll: false,
reportDir: "${project.paths.project_build_prefix}/docs/source/_build/html",
reportFiles: "index.html",
reportName: "Documentation",
reportTitles: "Documentation"])
}
def runCI =
{
nodeDetails, jobName->
def prj = new rocProject('rccl-internal', 'StaticAnalysis')
// Define test architectures, optional rocm version argument is available
def nodes = new dockerNodes(nodeDetails, jobName, prj)
boolean formatCheck = false
boolean staticAnalysis = true
def compileCommand =
{
platform, project->
runCompileCommand(platform, project, jobName, false)
}
buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis)
}
ci: {
String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 6')])]))
stage(urlJobName) {
runCI([ubuntu18:['any']], urlJobName)
}
}
-2
Voir le fichier
@@ -103,8 +103,6 @@ set(CU_SOURCES
src/collectives/device/broadcast.cu
src/collectives/device/reduce_scatter.cu
src/collectives/device/sendrecv.cu
src/collectives/device/gather.cu
src/collectives/device/scatter.cu
src/collectives/device/all_to_all.cu
src/collectives/device/all_to_allv.cu
src/collectives/device/functions.cu)
+9 -1
Voir le fichier
@@ -1,5 +1,5 @@
Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -30,3 +30,11 @@
The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
This code also includes files from the NVIDIA Tools Extension SDK project.
See:
https://github.com/NVIDIA/NVTX
for more information and license details.
+5 -4
Voir le fichier
@@ -67,7 +67,8 @@ There are unit tests implemented with the Googletest framework in RCCL, which ar
To invoke the unit tests, go to the build folder, then the test subfolder, and execute the appropriate unit test executable(s).
Unit test names are now of the format:
[CollectiveCall]CorrectnessSweep/[CollectiveCall]CorrectnessTest.[Type of test]/[ncclRedOp_t]_[datatype]_[number of elements]_[number of devices]_[in place/out of place]_[environment variables]
[CollectiveCall]CorrectnessSweep/[CollectiveCall]CorrectnessTest.[Type of test]/[ncclRedOp_t]_[datatype]_[number of elements]_[number of devices]_[in place/out of place]_[environment variables]
This allows filtering of unit tests being run by their parameter values by passing the --gtest_filter command line flag, for example:
@@ -82,10 +83,10 @@ See the rccl-tests README for more information on how to build and run those tes
## Library and API Documentation
Please refer to the [Library documentation](http://rccl.readthedocs.io/) for current documentation.
Please refer to the [Library documentation](https://rccl.readthedocs.io/) for current documentation.
## Copyright
All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
All source code and accompanying documentation is copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
All modifications are copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
All modifications are copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+8 -10
Voir le fichier
@@ -1,17 +1,15 @@
#!/bin/bash
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
if [ -d docBin ]; then
rm -rf docBin
fi
set -eu
sed -e 's/ROCFFT_EXPORT //g' ../library/include/rocfft.h > rocfft.h
doxygen Doxyfile
# Make this directory the PWD
cd "$(dirname "${BASH_SOURCE[0]}")"
# Build doxygen info
./run_doxygen.sh
# Build sphinx docs
cd source
make clean
make html
cd ..
rm rocfft.h
+11 -7
Voir le fichier
@@ -1,13 +1,17 @@
#!/bin/bash
# # Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# # Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
if [ -d docBin ]; then
rm -rf docBin
fi
set -eu
rm nccl.h
# Make this directory the PWD
cd "$(dirname "${BASH_SOURCE[0]}")"
sed -e 's/ROCFFT_EXPORT //g' ../src/nccl.h.in > nccl.h
# Rename our input file
cp ../src/nccl.h.in nccl.h
# Build the doxygen info
rm -rf docBin
doxygen Doxyfile
#rm nccl.h
# Cleanup
rm nccl.h
+3 -3
Voir le fichier
@@ -56,8 +56,8 @@ master_doc = 'index'
# General information about the project.
project = u'RCCL'
copyright = u'2015-2018, NVIDIA CORPORATION; Modifications Copyright 2019-2020 Advanced Mirco Devices'
author = u'Advanced Mirco Devices'
copyright = u'2015-2018, NVIDIA CORPORATION; Modifications Copyright 2019-2021 Advanced Micro Devices'
author = u'Advanced Micro Devices'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
@@ -156,7 +156,7 @@ latex_elements = {
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'RCCL.tex', u'RCCL Documentation',
u'Advanced Mirco Devices', 'manual'),
u'Advanced Micro Devices', 'manual'),
]
+2 -2
Voir le fichier
@@ -199,10 +199,10 @@ if ($run_tests); then
if (test -f "./test/UnitTests"); then
if ($run_tests_all); then
./test/UnitTests
NCCL_COMM_ID=$HOSTNAME:55512 ./test/UnitTestsMultiProcess
./test/UnitTestsMultiProcess
else
./test/UnitTests --gtest_filter="BroadcastCorrectnessSweep*:*float32*"
NCCL_COMM_ID=$HOSTNAME:55512 ./test/UnitTestsMultiProcess --gtest_filter="BroadcastMultiProcessCorrectnessSweep*:*float32*"
./test/UnitTestsMultiProcess --gtest_filter="BroadcastMultiProcessCorrectnessSweep*:*float32*"
fi
else
echo "Unit tests have not been built yet; please re-run script with -t to build unit tests."
+5
Voir le fichier
@@ -11,6 +11,7 @@ KEEP ?= 0
DEBUG ?= 0
TRACE ?= 0
PROFAPI ?= 0
NVTX ?= 1
NVCC = $(CUDA_HOME)/bin/nvcc
@@ -87,6 +88,10 @@ ifneq ($(TRACE), 0)
CXXFLAGS += -DENABLE_TRACE
endif
ifeq ($(NVTX), 0)
CXXFLAGS += -DNVTX_DISABLE
endif
ifneq ($(KEEP), 0)
NVCUFLAGS += -keep
endif
+2 -2
Voir le fichier
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 7
NCCL_PATCH := 8
NCCL_MINOR := 8
NCCL_PATCH := 4
NCCL_SUFFIX :=
PKG_REVISION := 1
+2 -2
Voir le fichier
@@ -9,7 +9,7 @@ Package: libnccl${nccl:Major}
Section: libs
Architecture: ${pkg:Arch}
Depends: ${misc:Depends}, ${shlibs:Depends}
Description: NVIDIA Collectives Communication Library (NCCL) Runtime
Description: NVIDIA Collective Communication Library (NCCL) Runtime
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
broadcast, and reduce-scatter.
@@ -21,7 +21,7 @@ Package: libnccl-dev
Section: libdevel
Architecture: ${pkg:Arch}
Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
Description: NVIDIA Collectives Communication Library (NCCL) Development Files
Description: NVIDIA Collective Communication Library (NCCL) Development Files
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
broadcast, and reduce-scatter.
+3 -3
Voir le fichier
@@ -1,7 +1,7 @@
Name: libnccl
Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}
Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}
Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
Group: Development/Libraries
License: BSD
@@ -18,13 +18,13 @@ NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
sockets.
%package devel
Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
Group: Development/Libraries
%description devel
NCCL development files
%package static
Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
Group: Development/Libraries
%description static
NCCL static library
+272 -207
Voir le fichier
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -18,144 +19,77 @@
#include "clique/Hash.h"
// [/RCCL]
struct bootstrapNetComm {
int fd;
};
/* Init functions */
static char bootstrapNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
static union socketAddress bootstrapNetIfAddrs[MAX_IFS];
static int bootstrapNetIfs = -1;
static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
static union socketAddress bootstrapNetIfAddr;
static int bootstrapNetInitDone = 0;
pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
ncclResult_t bootstrapNetInit() {
if (bootstrapNetIfs == -1) {
if (bootstrapNetInitDone == 0) {
pthread_mutex_lock(&bootstrapNetLock);
if (bootstrapNetIfs == -1) {
bootstrapNetIfs = findInterfaces(bootstrapNetIfNames, bootstrapNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
if (bootstrapNetIfs <= 0) {
WARN("Bootstrap : no socket interface found");
return ncclInternalError;
} else {
char line[1024];
char addrline[1024];
line[0] = '\0';
for (int i=0; i<bootstrapNetIfs; i++) {
snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, bootstrapNetIfNames+i*MAX_IF_NAME_SIZE,
socketToString(&bootstrapNetIfAddrs[i].sa, addrline));
if (bootstrapNetInitDone == 0) {
char* env = getenv("NCCL_COMM_ID");
if (env) {
union socketAddress remoteAddr;
if (GetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
if (findInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
WARN("NET/Socket : No usable listening interface found");
return ncclSystemError;
}
} else {
int nIfs = findInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
if (nIfs <= 0) {
WARN("Bootstrap : no socket interface found");
return ncclInternalError;
}
line[1023] = '\0';
INFO(NCCL_INIT, "Bootstrap : Using%s", line);
}
char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2];
sprintf(line, " %s:", bootstrapNetIfName);
socketToString(&bootstrapNetIfAddr.sa, line+strlen(line));
INFO(NCCL_INIT, "Bootstrap : Using%s", line);
bootstrapNetInitDone = 1;
}
pthread_mutex_unlock(&bootstrapNetLock);
}
return ncclSuccess;
}
static ncclResult_t bootstrapNetNewComm(struct bootstrapNetComm** comm) {
NCCLCHECK(ncclCalloc(comm, 1));
(*comm)->fd = -1;
return ncclSuccess;
}
static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr) {
if (dev >= bootstrapNetIfs) return ncclInternalError;
memcpy(addr, bootstrapNetIfAddrs+dev, sizeof(*addr));
return ncclSuccess;
}
/* Socket Interface Selection type */
enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
static ncclResult_t bootstrapNetListen(int dev, ncclNetHandle_t* netHandle, void** listenComm) {
union socketAddress* connectAddr = (union socketAddress*) netHandle;
static_assert(sizeof(union socketAddress) < NCCL_NET_HANDLE_MAXSIZE, "union socketAddress size is too large");
// if dev >= 0, listen based on dev
if (dev >= 0) {
NCCLCHECK(bootstrapNetGetSocketAddr(dev, connectAddr));
} else if (dev == findSubnetIf) {
// handle stores a remote address
// need to find a local addr that is in the same network as the remote addr
union socketAddress localAddr;
char ifName[MAX_IF_NAME_SIZE];
if (findInterfaceMatchSubnet(ifName, &localAddr, connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
WARN("NET/Socket : No usable listening interface found");
return ncclSystemError;
}
// pass the local address back
memcpy(connectAddr, &localAddr, sizeof(localAddr));
} // Otherwise, handle stores a local address
struct bootstrapNetComm* comm;
NCCLCHECK(bootstrapNetNewComm(&comm));
NCCLCHECK(createListenSocket(&comm->fd, connectAddr));
*listenComm = comm;
return ncclSuccess;
}
static ncclResult_t bootstrapNetConnect(int dev, ncclNetHandle_t* netHandle, void** sendComm) {
union socketAddress* connectAddr = (union socketAddress*) netHandle;
struct bootstrapNetComm* comm;
NCCLCHECK(bootstrapNetNewComm(&comm));
NCCLCHECK(connectAddress(&comm->fd, connectAddr));
*sendComm = comm;
return ncclSuccess;
}
static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) {
struct bootstrapNetComm* lComm = (struct bootstrapNetComm*)listenComm;
struct bootstrapNetComm* rComm;
NCCLCHECK(bootstrapNetNewComm(&rComm));
static ncclResult_t bootstrapNetAccept(int listenFd, int* recvFd) {
struct sockaddr_in sockaddr;
socklen_t socklen = sizeof(struct sockaddr_in);
SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
*recvComm = rComm;
SYSCHECKVAL(accept(listenFd, (struct sockaddr*)&sockaddr, &socklen), "accept", *recvFd);
return ncclSuccess;
}
static ncclResult_t bootstrapNetClose(void* opaqueComm) {
struct bootstrapNetComm* comm = (struct bootstrapNetComm*)opaqueComm;
if (comm) {
close(comm->fd);
free(comm);
}
return ncclSuccess;
}
static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(bootstrapNetClose(sendComm)); return ncclSuccess; }
static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(bootstrapNetClose(recvComm)); return ncclSuccess; }
static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(bootstrapNetClose(listenComm)); return ncclSuccess; }
// Additional sync functions
static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) {
struct bootstrapNetComm* comm = (struct bootstrapNetComm*)sendComm;
NCCLCHECK(socketSend(comm->fd, &size, sizeof(int)));
NCCLCHECK(socketSend(comm->fd, data, size));
static ncclResult_t bootstrapNetSend(int fd, void* data, int size) {
NCCLCHECK(socketSend(fd, &size, sizeof(int)));
NCCLCHECK(socketSend(fd, data, size));
return ncclSuccess;
}
static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
struct bootstrapNetComm* comm = (struct bootstrapNetComm*)recvComm;
static ncclResult_t bootstrapNetRecv(int fd, void* data, int size) {
int recvSize;
NCCLCHECK(socketReceive(comm->fd, &recvSize, sizeof(int)));
NCCLCHECK(socketRecv(fd, &recvSize, sizeof(int)));
if (recvSize > size) {
WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size);
WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
return ncclInternalError;
}
NCCLCHECK(socketReceive(comm->fd, data, std::min(recvSize, size)));
return ncclSuccess;
}
ncclResult_t bootstrapNetCreateHandle(ncclNetHandle_t* netHandle, const char* str) {
union socketAddress* connectAddr = (union socketAddress*) netHandle;
NCCLCHECK(GetSocketAddrFromString(connectAddr, str));
NCCLCHECK(socketRecv(fd, data, std::min(recvSize, size)));
return ncclSuccess;
}
struct extInfo {
int rank;
int nranks;
ncclNetHandle_t extHandleListenRoot;
ncclNetHandle_t extHandleListen;
union socketAddress extAddressListenRoot;
union socketAddress extAddressListen;
};
#include <sys/resource.h>
@@ -169,33 +103,37 @@ static ncclResult_t setFilesLimit() {
}
static void *bootstrapRoot(void* bootstrapRootStruct) { // [RCCL] Modified to include hash argument)
// [RCCL] Unpack bootstrapRootStruct
struct bootstrapRootStruct* rootStruct = (struct bootstrapRootStruct*) bootstrapRootStruct;
void* listenComm = rootStruct->listenComm;
unsigned long hash = rootStruct->hash;
struct bootstrapRootStruct rootStruct = *(struct bootstrapRootStruct*)bootstrapRootStruct;
int listenFd = rootStruct.listenFd;
unsigned long hash = rootStruct.hash;
int pid = getpid(); // sharing PID to other ranks for creating shared memory files for CliqueManager
free(bootstrapRootStruct);
// [/RCCL]
ncclResult_t res = ncclSuccess;
int nranks = 0, c = 0;
struct extInfo info;
ncclNetHandle_t *rankHandles = NULL;
ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
ncclNetHandle_t zero = { 0 }; // for sanity checking
void* tmpComm;
ncclResult_t res;
union socketAddress *rankAddresses = NULL;
union socketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange
union socketAddress *zero = NULL;
NCCLCHECKGOTO(ncclCalloc(&zero, 1), res, out);
setFilesLimit();
TRACE(NCCL_INIT, "BEGIN");
/* Receive addresses from all ranks */
int nranks = 0, c = 0;
do {
NCCLCHECKGOTO(bootstrapNetAccept(listenComm, &tmpComm), res, out);
NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);
int tmpFd;
NCCLCHECKGOTO(bootstrapNetAccept(listenFd, &tmpFd), res, out);
NCCLCHECKGOTO(bootstrapNetRecv(tmpFd, &info, sizeof(info)), res, out);
close(tmpFd);
if (c == 0) {
nranks = info.nranks;
NCCLCHECKGOTO(ncclCalloc(&rankHandles, nranks), res, out);
NCCLCHECKGOTO(ncclCalloc(&rankHandlesRoot, nranks), res, out);
NCCLCHECKGOTO(ncclCalloc(&rankAddresses, nranks), res, out);
NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nranks), res, out);
}
if (nranks != info.nranks) {
@@ -203,14 +141,14 @@ static void *bootstrapRoot(void* bootstrapRootStruct) { // [RCCL] Modified to in
goto out;
}
if (memcmp(&zero, &rankHandlesRoot[info.rank], sizeof(ncclNetHandle_t)) != 0) {
if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union socketAddress)) != 0) {
WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
goto out;
}
// Save the connection handle for that rank
memcpy(rankHandlesRoot+info.rank, info.extHandleListenRoot, sizeof(ncclNetHandle_t));
memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));
memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union socketAddress));
memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union socketAddress));
++c;
TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks);
@@ -224,35 +162,37 @@ static void *bootstrapRoot(void* bootstrapRootStruct) { // [RCCL] Modified to in
// Send the connect handle for the next rank in the AllGather ring
for (int r=0; r<nranks; ++r) {
int next = (r+1) % nranks;
void *tmpSendComm;
NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot+r, &tmpSendComm), res, out);
NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
int tmpSendFd;
NCCLCHECKGOTO(connectAddress(&tmpSendFd, rankAddressesRoot+r), res, out);
NCCLCHECKGOTO(bootstrapNetSend(tmpSendFd, rankAddresses+next, sizeof(union socketAddress)), res, out);
{ // [RCCL] Send the root pid for shared file naming
NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, &pid, sizeof(int)), res, out);
NCCLCHECKGOTO(bootstrapNetSend(tmpSendFd, &pid, sizeof(int)), res, out);
} // [/RCCL]
NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
close(tmpSendFd);
}
TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
out:
bootstrapNetCloseListen(listenComm);
if (rankHandles) free(rankHandles);
if (rankHandlesRoot) free(rankHandlesRoot);
close(listenFd);
if (rankAddresses) free(rankAddresses);
if (rankAddressesRoot) free(rankAddressesRoot);
if (zero) free(zero);
TRACE(NCCL_INIT, "DONE");
return NULL;
}
ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
void* listenComm;
NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, netHandle, &listenComm));
union socketAddress* connectAddr = (union socketAddress*) id;
int listenFd;
NCCLCHECK(createListenSocket(&listenFd, connectAddr));
pthread_t thread;
// [RCCL] Use the ncclUniqueId to get a hash for bootstrap
struct bootstrapRootStruct* rootStruct = new bootstrapRootStruct;
struct bootstrapRootStruct* rootStruct = new struct bootstrapRootStruct;
rootStruct->hash = djb2Hash(id->internal);
rootStruct->listenComm = listenComm;
rootStruct->listenFd = listenFd;
pthread_create(&thread, NULL, bootstrapRoot, (void *)rootStruct);
// [/RCCL]
@@ -260,18 +200,19 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
}
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
static_assert(sizeof(ncclNetHandle_t) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
static_assert(sizeof(union socketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
memset(id, 0, sizeof(ncclUniqueId));
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
union socketAddress* connectAddr = (union socketAddress*) id;
char* env = getenv("NCCL_COMM_ID");
if (env) {
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
if (bootstrapNetCreateHandle(netHandle, env) != 0) {
if (GetSocketAddrFromString(connectAddr, env) != ncclSuccess) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
} else {
memcpy(id, &bootstrapNetIfAddr, sizeof(union socketAddress));
NCCLCHECK(bootstrapCreateRoot(id, false));
}
@@ -280,25 +221,135 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
struct unexConn {
int peer;
void* comm;
int fd;
struct unexConn* next;
};
struct extState {
void* extBstrapListenComm;
void* extBstrapRingRecvComm;
void* extBstrapRingSendComm;
ncclNetHandle_t* peerBstrapHandles;
struct unexConn* unexpectedConnections;
int rank;
int nranks;
int dev;
int rootPid; // [RCCL] PID of root
// Remote allocator state
struct remAllocState {
int cudaDev;
int listenFd;
int stop;
};
struct extState {
int extListenFd;
int extRingRecvFd;
int extRingSendFd;
union socketAddress* peerCommAddresses;
union socketAddress* peerAllocAddresses;
struct unexConn* unexpectedConnections;
int cudaDev;
int rank;
int nranks;
// Intermediate memory allocation service
struct remAllocState* allocState;
pthread_t allocThread;
};
#define MAX_SEGMENTS 128
static ncclResult_t remoteAlloc(void** ptr, int fd) {
size_t size;
NCCLCHECK(socketRecv(fd, &size, sizeof(size_t)));
hipIpcMemHandle_t devIpc;
NCCLCHECK(ncclCudaCalloc((char**)ptr, size, true));
hipError_t res = hipIpcGetMemHandle(&devIpc, *ptr);
if (res != hipSuccess) {
WARN("[Rem Allocator] hipIpcGetMemHandle failed : %s", hipGetErrorString(res));
hipFree(*ptr);
CUDACHECK(res);
}
// The CUDA IPC
NCCLCHECK(socketSend(fd, &devIpc, sizeof(hipIpcMemHandle_t)));
// And the direct pointer
NCCLCHECK(socketSend(fd, ptr, sizeof(void*)));
return ncclSuccess;
}
#include <poll.h>
// Service thread to allocate memory for other GPUs, used as intermediate step.
void* ncclRemoteMemAllocationService(void* args) {
struct remAllocState* state = (struct remAllocState *) args;
if (hipSetDevice(state->cudaDev) != hipSuccess) {
WARN("[Rem Allocator] Failed to set CUDA device %d", state->cudaDev);
}
// Prepare poll descriptor
void* segments[MAX_SEGMENTS];
struct pollfd pollfds[MAX_SEGMENTS+1];
for (int s=0; s<MAX_SEGMENTS; s++) segments[s] = NULL;
for (int s=0; s<MAX_SEGMENTS; s++) {
pollfds[s].fd = -1;
pollfds[s].events = POLLHUP;
}
pollfds[MAX_SEGMENTS].fd = state->listenFd;
pollfds[MAX_SEGMENTS].events = POLLIN;
int nbuffers = 0;
while (state->stop == 0 || (state->stop == 1 && nbuffers > 0)) {
if (int error = poll(pollfds, MAX_SEGMENTS+1, 100/*ms*/) < 0) {
WARN("[Rem Allocator] Poll failed with error %d", error);
return NULL;
}
if (pollfds[MAX_SEGMENTS].revents) {
int s = 0;
while (segments[s] != NULL && s < MAX_SEGMENTS) s++;
if (bootstrapNetAccept(pollfds[MAX_SEGMENTS].fd, &pollfds[s].fd) != ncclSuccess) {
pollfds[s].fd = -1;
} else {
if (s == MAX_SEGMENTS || (remoteAlloc(segments+s, pollfds[s].fd) != ncclSuccess)) {
WARN("[Rem Allocator] Allocation failed (segment %d, fd %d)", s, pollfds[s].fd);
close(pollfds[s].fd);
pollfds[s].fd = -1;
} else {
nbuffers++;
}
}
}
for (int s=0; s<MAX_SEGMENTS; s++) {
if (pollfds[s].revents & POLLHUP) {
if (hipFree(segments[s]) != hipSuccess) {
WARN("[Rem Allocator] hipFree %p failed", segments[s]);
}
segments[s] = NULL;
close(pollfds[s].fd);
pollfds[s].fd = -1;
nbuffers--;
}
}
}
for (int s=0; s<MAX_SEGMENTS; s++) {
if (segments[s]) hipFree(segments[s]);
close(pollfds[s].fd);
}
close(state->listenFd);
free(state);
return NULL;
}
ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr) {
struct extState* state = (struct extState*)commState;
int fd;
ncclResult_t res;
*id = -1;
NCCLCHECK(connectAddress(&fd, state->peerAllocAddresses+rank));
NCCLCHECKGOTO(socketSend(fd, &size, sizeof(size_t)), res, end);
NCCLCHECKGOTO(socketRecv(fd, ipc, sizeof(hipIpcMemHandle_t)), res, end);
NCCLCHECKGOTO(socketRecv(fd, ptr, sizeof(void*)), res, end);
*id = fd;
end:
return res;
}
ncclResult_t bootstrapRemFree(int id, int rank, void* commState) {
SYSCHECK(close(id), "close");
return ncclSuccess;
}
ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState, int* rootPid) { // [RCCL] Adding rootPid
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
bool idFromEnv = getenv("NCCL_COMM_ID") != NULL;
struct extState* state;
NCCLCHECK(ncclCalloc(&state, 1));
state->rank = rank;
@@ -310,19 +361,15 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
struct extInfo info = { 0 };
info.rank = rank;
info.nranks = nranks;
void *tmpSendComm, *tmpRecvComm;
// Pass the remote address to listen via info
if (idFromEnv) {
memcpy(&info.extHandleListen, netHandle, sizeof(ncclNetHandle_t));
memcpy(&info.extHandleListenRoot, netHandle, sizeof(ncclNetHandle_t));
}
// listen will return the local address via info (specify interface type 'findSubnetIf')
state->dev = idFromEnv ? findSubnetIf : 0;
void* extBstrapListenCommRoot;
NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListen, &state->extBstrapListenComm));
NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListenRoot, &extBstrapListenCommRoot));
int tmpSendFd, tmpRecvFd;
// stagger connection times to avoid an overload of the root at very high rank counts
int extListenFdRoot;
memcpy(&info.extAddressListen, &bootstrapNetIfAddr, sizeof(union socketAddress));
memcpy(&info.extAddressListenRoot, &bootstrapNetIfAddr, sizeof(union socketAddress));
NCCLCHECK(createListenSocket(&state->extListenFd, &info.extAddressListen));
NCCLCHECK(createListenSocket(&extListenFdRoot, &info.extAddressListenRoot));
// stagger connection times to avoid an overload of the root
if (nranks > 128) {
long msec = rank;
struct timespec tv;
@@ -333,28 +380,38 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
}
// send info on my listening socket to root
NCCLCHECK(bootstrapNetConnect(state->dev, netHandle, &tmpSendComm));
NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
union socketAddress* rootAddr = (union socketAddress*)id;
NCCLCHECK(connectAddress(&tmpSendFd, rootAddr));
NCCLCHECK(bootstrapNetSend(tmpSendFd, &info, sizeof(info)));
close(tmpSendFd);
// get info on my "next" rank in the bootstrap ring from root
ncclNetHandle_t extHandleNext;
NCCLCHECK(bootstrapNetAccept(extBstrapListenCommRoot, &tmpRecvComm));
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
union socketAddress extAddressNext;
NCCLCHECK(bootstrapNetAccept(extListenFdRoot, &tmpRecvFd));
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &extAddressNext, sizeof(extAddressNext)));
{ // [RCCL] Receive PID from root
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, rootPid, sizeof(int)));
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, rootPid, sizeof(int)));
} // [/RCCL]
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));
close(tmpRecvFd);
close(extListenFdRoot);
NCCLCHECK(bootstrapNetConnect(state->dev, &extHandleNext, &state->extBstrapRingSendComm));
NCCLCHECK(connectAddress(&state->extRingSendFd, &extAddressNext));
// Accept the connect request from the previous rank in the AllGather ring
NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));
NCCLCHECK(bootstrapNetAccept(state->extListenFd, &state->extRingRecvFd));
// AllGather all listen handlers
NCCLCHECK(ncclCalloc(&state->peerBstrapHandles, nranks));
memcpy(state->peerBstrapHandles+rank, info.extHandleListen, sizeof(ncclNetHandle_t));
NCCLCHECK(bootstrapAllGather(state, state->peerBstrapHandles, sizeof(ncclNetHandle_t)));
NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks));
memcpy(state->peerCommAddresses+rank, &info.extAddressListen, sizeof(union socketAddress));
NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union socketAddress)));
// Create the memory allocation service
NCCLCHECK(ncclCalloc(&state->peerAllocAddresses, nranks));
memcpy(state->peerAllocAddresses+rank, &bootstrapNetIfAddr, sizeof(union socketAddress));
NCCLCHECK(ncclCalloc(&state->allocState, 1));
CUDACHECK(hipGetDevice(&state->allocState->cudaDev));
NCCLCHECK(createListenSocket(&state->allocState->listenFd, state->peerAllocAddresses+rank));
pthread_create(&state->allocThread, NULL, ncclRemoteMemAllocationService, state->allocState);
NCCLCHECK(bootstrapAllGather(state, state->peerAllocAddresses, sizeof(union socketAddress)));
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
@@ -378,9 +435,9 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
size_t sslice = (rank - i + nranks) % nranks;
// Send slice to the right
NCCLCHECK(bootstrapNetSend(state->extBstrapRingSendComm, data+sslice*size, size));
NCCLCHECK(bootstrapNetSend(state->extRingSendFd, data+sslice*size, size));
// Recv slice from the left
NCCLCHECK(bootstrapNetRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
NCCLCHECK(bootstrapNetRecv(state->extRingRecvFd, data+rslice*size, size));
}
TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
@@ -389,20 +446,20 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
struct extState* state = (struct extState*)commState;
void* tmpSendComm;
NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles+peer, &tmpSendComm));
NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)));
NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size));
NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
int tmpSendFd;
NCCLCHECK(connectAddress(&tmpSendFd, state->peerCommAddresses+peer));
NCCLCHECK(bootstrapNetSend(tmpSendFd, &state->rank, sizeof(int)));
NCCLCHECK(bootstrapNetSend(tmpSendFd, data, size));
close(tmpSendFd);
return ncclSuccess;
}
ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int fd) {
// New unex
struct unexConn* unex;
NCCLCHECK(ncclCalloc(&unex, 1));
unex->peer = peer;
unex->comm = comm;
unex->fd = fd;
// Enqueue
struct unexConn* list = state->unexpectedConnections;
@@ -415,7 +472,7 @@ ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
return ncclSuccess;
}
void* unexpectedDequeue(struct extState* state, int peer) {
int unexpectedDequeue(struct extState* state, int peer) {
struct unexConn* elem = state->unexpectedConnections;
struct unexConn* prev = NULL;
while (elem) {
@@ -425,55 +482,61 @@ void* unexpectedDequeue(struct extState* state, int peer) {
} else {
prev->next = elem->next;
}
void* comm = elem->comm;
int fd = elem->fd;
free(elem);
return comm;
return fd;
}
prev = elem;
elem = elem->next;
}
return NULL;
return -1;
}
// We can't know who we'll receive from, so we need to receive everything at once
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
struct extState* state = (struct extState*)commState;
void* tmpRecvComm;
int tmpRecvFd;
// Search unexpected connections first
if ((tmpRecvComm = unexpectedDequeue(state, peer)) != NULL) {
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
if ((tmpRecvFd = unexpectedDequeue(state, peer)) != -1) {
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, ((char*)data), size));
close(tmpRecvFd);
return ncclSuccess;
}
// Then look for new connections
while (1) {
NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &tmpRecvComm));
NCCLCHECK(bootstrapNetAccept(state->extListenFd, &tmpRecvFd));
int newPeer;
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &newPeer, sizeof(int)));
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &newPeer, sizeof(int)));
if (newPeer == peer) {
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, ((char*)data), size));
close(tmpRecvFd);
return ncclSuccess;
}
// Unexpected connection. Save for later.
NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvComm));
NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvFd));
}
}
ncclResult_t bootstrapClose(void* commState) {
struct extState* state = (struct extState*)commState;
if (state->unexpectedConnections != NULL) {
WARN("Unexpected connections are not empty.\n");
WARN("Unexpected connections are not empty");
return ncclInternalError;
}
NCCLCHECK(bootstrapNetCloseListen(state->extBstrapListenComm));
NCCLCHECK(bootstrapNetCloseSend(state->extBstrapRingSendComm));
NCCLCHECK(bootstrapNetCloseRecv(state->extBstrapRingRecvComm));
close(state->extListenFd);
close(state->extRingSendFd);
close(state->extRingRecvFd);
free(state->peerBstrapHandles);
state->allocState->stop = 1;
// Join the allocThread so we catch resource leaks as being hung here
// pthread_join(state->allocThread, nullptr);
free(state->peerCommAddresses);
free(state->peerAllocAddresses);
free(state);
return ncclSuccess;
@@ -481,10 +544,12 @@ ncclResult_t bootstrapClose(void* commState) {
ncclResult_t bootstrapAbort(void* commState) {
struct extState* state = (struct extState*)commState;
bootstrapNetCloseListen(state->extBstrapListenComm);
bootstrapNetCloseSend(state->extBstrapRingSendComm);
bootstrapNetCloseRecv(state->extBstrapRingRecvComm);
free(state->peerBstrapHandles);
close(state->extListenFd);
close(state->extRingSendFd);
close(state->extRingRecvFd);
state->allocState->stop = 2;
free(state->peerCommAddresses);
free(state->peerAllocAddresses);
free(state);
return ncclSuccess;
}
+4 -4
Voir le fichier
@@ -26,16 +26,16 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
}
// Per-channel operation list.
NCCLCHECK(ncclCudaHostCalloc(&channel->collectives, NCCL_MAX_OPS));
NCCLCHECK(ncclCudaHostCalloc(&channel->collectivesExtra, comm->nRanks*NCCL_MAX_OPS*4));
NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS));
NCCLCHECK(ncclCudaHostCalloc(&channel->a2avParams, comm->nRanks*NCCL_MAX_OPS*4));
return ncclSuccess;
}
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
if (channel->id == -1) return ncclSuccess;
// Operation list
NCCLCHECK(ncclCudaHostFree(channel->collectivesExtra));
NCCLCHECK(ncclCudaHostFree(channel->collectives));
NCCLCHECK(ncclCudaHostFree(channel->a2avParams));
NCCLCHECK(ncclCudaHostFree(channel->workFifo));
// Free Ring index to rank tables
free(channel->ring.userRanks);
+3 -13
Voir le fichier
@@ -1,5 +1,5 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -29,14 +29,13 @@ THE SOFTWARE.
#include "common_kernel.h"
template <class FUNC, typename T, int NUM_RANKS>
__device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args)
__device__ void AllReduceCliqueSplitKernel(struct ncclWorkElem* args)
{
// Clique-specific kernel arguments
cliqueDevicePtrs_t* cliquePtrs = args->clique.ptrs; // Collection of all input/output pointers across ranks in clique
size_t const N = args->clique.count; // Total number of elements to reduce
int const nBlocks = args->clique.nChannels; // Total number of blocks assigned to this kernel (may be different than gridDim.x)
int const blockId = args->clique.bid; // 0-indexed blockIdx for this threadblock (may be different than blockIdx.x)
int const verbose = args->clique.verbose; // For debug purposes
int const rank = args->comm->rank; // Current rank
// Each threadblock works independently of others on a subsection of the input
@@ -47,10 +46,6 @@ __device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args)
size_t const currBlockStop = min(currBlockStart + perBlockN, N);
size_t const blockN = currBlockStop - currBlockStart;
if (verbose && threadIdx.x == 0)
{
printf("Rank %d block %d of %d %lu -> %lu [%lu]\n", rank, blockId, nBlocks, currBlockStart, currBlockStop, blockN);
}
if (blockN > 0)
{
// Prepare input / output subarrays
@@ -74,12 +69,7 @@ __device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args)
// Even if there was nothing for this GPU to do, it must participate in a barrier
// because other GPUs may be modifying this GPUs output buffer still
if (blockId == 0)
{
if (verbose && threadIdx.x == 0) printf("Rank %d enters GPU barrier\n", rank);
WaitForBarrier<NUM_RANKS>(cliquePtrs->barrier, rank, verbose);
if (verbose && threadIdx.x == 0) printf("Rank %d exits GPU barrier\n", rank);
}
if (blockId == 0) WaitForBarrier<NUM_RANKS>(cliquePtrs->barrier);
}
#endif
+3 -15
Voir le fichier
@@ -1,5 +1,5 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -62,7 +62,7 @@ typedef struct
// Multi-GPU (on same node) barrier. One thread per grid per GPU updates barrier / waits
template <int NUM_RANKS>
__forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier, int const rank, int const verbose)
__forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier)
{
if (threadIdx.x == 0)
{
@@ -71,7 +71,6 @@ __forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier, int
int localSense = *barrier.localSense;
int val = __atomic_add_fetch(barrier.globalCount, 1, __ATOMIC_SEQ_CST);
if (verbose) printf("Rank %d arrived at GPU barrier %d\n", rank, val);
if (val == NUM_RANKS)
{
// Last arrival resets barrier
@@ -81,18 +80,7 @@ __forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier, int
else
{
// Wait for all ranks to reach barrier
int counter = 0;
while (__atomic_load_n(barrier.globalSense, __ATOMIC_SEQ_CST) != localSense)
{
if (verbose)
{
counter++;
if (counter == 100000000)
{
printf("Rank %d waiting on GPU barrier: (%d != %d)", rank, *barrier.globalSense, localSense);
}
}
}
while (__atomic_load_n(barrier.globalSense, __ATOMIC_SEQ_CST) != localSense);
}
}
}
+7 -31
Voir le fichier
@@ -1,5 +1,5 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -49,7 +49,6 @@ int* CliqueManager::m_staticGpuBarrierMem = NULL;
RCCL_PARAM(EnableClique, "ENABLE_CLIQUE", 0); // Opt-in environment variable for clique-based kernels
RCCL_PARAM(AllReduceCliqueByteLimit, "CLIQUE_ALLREDUCE_BYTE_LIMIT", 16777216); // Max number of bytes to use clique-based kernels for all reduce
RCCL_PARAM(AllReduceNumChannels, "CLIQUE_ALLREDUCE_NCHANNELS", 0); // Number of channels to use for all-reduce. (0 for auto-select)
RCCL_PARAM(CliqueDebug, "CLIQUE_DEBUG", 0); // Emit debug messages
CliqueManager::CliqueManager(int const rank,
int const numRanks,
@@ -83,11 +82,7 @@ void CliqueManager::CleanUp()
{
// Release caches
if (m_ipcHandleSendCache) delete m_ipcHandleSendCache;
if (m_ipcHandleRecvCache)
{
m_ipcHandleRecvCache->close();
delete m_ipcHandleRecvCache;
}
if (m_ipcHandleRecvCache) delete m_ipcHandleRecvCache;
// Close shared memory
m_shmHandles.Close();
@@ -220,7 +215,6 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
if (m_rank == 0)
{
NCCLCHECKGOTO(ncclCudaCalloc(&m_staticGpuBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), true), res, dropback);
// Prepare all barriers
for (int opIndex = 0; opIndex < NCCL_MAX_OPS; opIndex++)
{
@@ -253,7 +247,7 @@ bool CliqueManager::IsSupported(ncclFunc_t const coll,
// Filter based on total input size for each collective type
size_t totalBytes = count * ncclTypeSize(datatype);
if (coll == ncclCollAllReduce && (totalBytes <= rcclParamAllReduceCliqueByteLimit())) return true;
if (coll == ncclFuncAllReduce && (totalBytes <= rcclParamAllReduceCliqueByteLimit())) return true;
return false;
}
@@ -321,7 +315,7 @@ ncclResult_t CliqueManager::GetNumChannelsToUse(ncclFunc_t const coll,
size_t const totalBytes = count * ncclTypeSize(datatype);
*numChannelstoUse = 1;
if (coll == ncclCollAllReduce) {
if (coll == ncclFuncAllReduce) {
if (rcclParamAllReduceNumChannels() == 0)
{
// NOTE: These are currently based on collected data and not necessarily ideal for all hardware
@@ -345,7 +339,7 @@ ncclResult_t CliqueManager::GetNumChannelsToUse(ncclFunc_t const coll,
ncclResult_t CliqueManager::SetCliqueCollectiveArgs(CollectiveArgs* args)
ncclResult_t CliqueManager::SetCliqueArgs(ncclWorkElem* args)
{
// Do nothing if disabled
if (m_cliqueMode == CLIQUE_DISABLED) return ncclSuccess;
@@ -358,7 +352,6 @@ ncclResult_t CliqueManager::SetCliqueCollectiveArgs(CollectiveArgs* args)
// Prepare clique argments (NOTE: clique pointers are not ready yet)
int opIndex = args->opCount % NCCL_MAX_OPS;
args->clique.ptrs = &m_pinnedCliquePtrs[opIndex];
args->clique.verbose = rcclParamCliqueDebug();
return ncclSuccess;
}
@@ -495,33 +488,16 @@ ncclResult_t CliqueManager::CheckCacheForHandle(std::pair<hipIpcMemHandle_t, siz
void CliqueManager::WaitForBarrier()
{
int const verbose = rcclParamCliqueDebug();
// Sense inversion barrier
m_cpuBarrierLocalSense = 1 - m_cpuBarrierLocalSense;
int val = __sync_add_and_fetch(m_cpuBarrierGlobalCount, 1);
if (verbose) INFO(NCCL_INIT, "Rank %d reaches barrier at %d", m_rank, val);
if (val == m_numRanks)
if (__sync_add_and_fetch(m_cpuBarrierGlobalCount, 1) == m_numRanks)
{
// Reset the barrier
STORE(m_cpuBarrierGlobalCount, 0);
STORE(m_cpuBarrierGlobalSense, m_cpuBarrierLocalSense);
} else {
size_t counter = 0;
while (LOAD(m_cpuBarrierGlobalSense) != m_cpuBarrierLocalSense)
{
if (verbose)
{
counter++;
if (counter == 100000000)
{
WARN("Rank %d waiting in CPU barrier: (%d != %d)", m_rank, *m_cpuBarrierGlobalSense, m_cpuBarrierLocalSense);
}
}
}
if (verbose) INFO(NCCL_INIT, "Rank %d leaves CPU barrier", m_rank);
while (LOAD(m_cpuBarrierGlobalSense) != m_cpuBarrierLocalSense);
}
}
+4 -4
Voir le fichier
@@ -1,5 +1,5 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -72,7 +72,7 @@ public:
// Set pointers for where clique-related arguments will be found
// This sets pointers to device-accessible memory where the arguments will eventually reside
ncclResult_t SetCliqueCollectiveArgs(CollectiveArgs* args);
ncclResult_t SetCliqueArgs(ncclWorkElem* args);
// Blocking call that only returns after all out-standing clique pointers are ready
ncclResult_t WaitForPointers();
@@ -121,8 +121,8 @@ protected:
// For use in bootstrapping code
struct bootstrapRootStruct {
void* listenComm;
unsigned long hash;
int listenFd;
unsigned long hash;
};
#endif
+9 -63
Voir le fichier
@@ -103,16 +103,14 @@ public:
return std::pair<iterator, bool>(it, inserted);
}
ncclResult_t close();
private:
// tag for dispatch
template<class U>
struct CloseTag{};
hipError_t CloseIfPointer(CloseTag<hipIpcMemHandle_t> tag, iterator it);
hipError_t CloseIfPointer(CloseTag<void*> tag, iterator it);
void pop();
void pop()
{
typename LRUCache::iterator it = m_cache.find(m_lruHistory.front());
m_cache.erase(it);
m_lruHistory.pop_front();
}
void updateHistory(const iterator& it)
{
@@ -135,60 +133,8 @@ auto hipIpcMemHandleEqual = [](const hipIpcMemHandle_t& l, const hipIpcMemHandle
return memcmp(l.reserved, r.reserved, sizeof(l.reserved)) == 0;
};
template <
class Key,
class Value,
class Hash,
class KeyEqual,
class Allocator
>
ncclResult_t NcclIpcHandleCache<Key, Value, Hash, KeyEqual, Allocator>::close()
{
for (auto it = m_cache.begin(); it != m_cache.end(); it ++)
{
CUDACHECK(CloseIfPointer(CloseTag<Value>{}, it));
}
return ncclSuccess;
}
template <
class Key,
class Value,
class Hash,
class KeyEqual,
class Allocator
>
void NcclIpcHandleCache<Key, Value, Hash, KeyEqual, Allocator>::pop()
{
typename LRUCache::iterator it = m_cache.find(m_lruHistory.front());
CloseIfPointer(CloseTag<Value>{}, it);
m_cache.erase(it);
m_lruHistory.pop_front();
}
template <
class Key,
class Value,
class Hash,
class KeyEqual,
class Allocator
>
hipError_t NcclIpcHandleCache<Key, Value, Hash, KeyEqual, Allocator>::CloseIfPointer(CloseTag<void*> tag, iterator it)
{
return hipIpcCloseMemHandle(it->second.first);
}
template <
class Key,
class Value,
class Hash,
class KeyEqual,
class Allocator
>
hipError_t NcclIpcHandleCache<Key, Value, Hash, KeyEqual, Allocator>::CloseIfPointer(CloseTag<hipIpcMemHandle_t> tag, iterator it)
{
return hipSuccess;
}
//typedef llvm::DenseMap<uint64_t, hipIpcMemHandle_t> SendCache;
//typedef llvm::DenseMap<hipIpcMemHandle_t, void*, decltype(&HandleHash), decltype(HandleEqual)> RecvCache;
typedef NcclIpcHandleCache<uint64_t, hipIpcMemHandle_t, std::hash<uint64_t>, std::equal_to<uint64_t>, std::allocator< std::pair<const uint64_t, std::pair<hipIpcMemHandle_t, std::list<uint64_t>::iterator>>>> NcclIpcHandleSendCache;
typedef NcclIpcHandleCache<hipIpcMemHandle_t, void*, decltype(&hipIpcMemHandleHash), decltype(hipIpcMemHandleEqual), std::allocator< std::pair<const hipIpcMemHandle_t, std::pair<void*, std::list<hipIpcMemHandle_t>::iterator>>>> NcclIpcHandleRecvCache;
+3 -2
Voir le fichier
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollAllGather, "AllGather",
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct ncclInfo info = { ncclFuncAllGather, "AllGather",
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
return ncclEnqueueCheck(&info);
+3 -2
Voir le fichier
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -11,7 +11,8 @@ NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
return ncclEnqueueCheck(&info);
+1 -1
Voir le fichier
@@ -25,7 +25,7 @@ ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, nc
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;
} else {
struct ncclInfo info = { ncclCollAllToAll, "AllToAll",
struct ncclInfo info = { ncclFuncAllToAll, "AllToAll",
sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS };
return ncclEnqueueCheck(&info);
+1 -1
Voir le fichier
@@ -37,7 +37,7 @@ ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[], cons
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;
} else {
struct ncclInfo info = { ncclCollAllToAllv, "AllToAllv",
struct ncclInfo info = { ncclFuncAllToAllv, "AllToAllv",
sendbuff, recvbuff, 0, datatype, ncclSum, 0, comm, stream, /* Args */
ALLTOALLV_CHUNKSTEPS, ALLTOALLV_SLICESTEPS, sendcounts, sdispls, recvcounts, rdispls };
return ncclEnqueueCheck(&info);
+3 -2
Voir le fichier
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size
ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollBroadcast, "Broadcast",
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
return ncclEnqueueCheck(&info);
+2 -2
Voir le fichier
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,4 +8,4 @@
#include "common.h"
#include "collectives.h"
IMPL_COLL_C(ncclAllGather, ncclCollAllGather);
IMPL_COLL_C(AllGather);
+175 -180
Voir le fichier
@@ -9,206 +9,201 @@
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm);
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int nelem = min(realChunkSize, size-chunkOffset);
int rankDest;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int nelem = min(realChunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
prims.directSend(thisInput+chunkOffset, offset, nelem);
} else {
prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
if (thisInput + chunkOffset == thisOutput + offset) { // In place
prims.directSend(thisInput+chunkOffset, offset, nelem);
} else {
prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
}
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
}
// Make final copy from buffer to dest.
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
// Final wait/copy.
prims.directRecv(thisOutput+offset, offset, nelem);
}
}
};
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->coll.lastChunkSize;
}
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
LLprims.send(thisInput+chunkOffset, nelem);
} else {
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
}
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
LLprims.recvCopySend(thisOutput+offset, nelem);
}
// step k-1: final store
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
LLprims.recv(thisOutput+offset, nelem);
}
}
// Make final copy from buffer to dest.
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
// Final wait/copy.
prims.directRecv(thisOutput+offset, offset, nelem);
}
}
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherCollNetKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->coll.lastChunkSize;
}
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
LLprims.send(thisInput+chunkOffset, nelem);
} else {
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
}
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
LLprims.recvCopySend(thisOutput+offset, nelem);
}
// step k-1: final store
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
LLprims.recv(thisOutput+offset, nelem);
}
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherCollNetLLKernel(struct CollectiveArgs* args) { }
};
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
ssize_t chunkOffset = gridOffset + bid*chunkSize;
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
LLprims.send(thisInput+chunkOffset, nelem);
} else {
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
if (thisInput + chunkOffset == thisOutput + offset) { // In place
LLprims.send(thisInput+chunkOffset, nelem);
} else {
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
}
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
LLprims.recvCopySend(thisOutput+offset, nelem);
}
// step k-1: final store
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
LLprims.recv(thisOutput+offset, nelem);
}
}
};
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
template<int PROTO, class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_TREE, PROTO, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
};
LLprims.recvCopySend(thisOutput+offset, nelem);
}
template<int PROTO, class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_COLLNET, PROTO, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
};
// step k-1: final store
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
LLprims.recv(thisOutput+offset, nelem);
}
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherCollNetLL128Kernel(struct CollectiveArgs* args) { }
+6 -3
Voir le fichier
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
* See LICENSE.txt for license information
************************************************************************/
@@ -8,4 +8,7 @@
#include "common.h"
#include "collectives.h"
IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce);
// [RCCL]
// IMPL_COLL_R(AllReduce);
IMPL_COLL_CLIQUE(AllReduce);
// [/RCCL]
+494 -433
Voir le fichier
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,478 +10,460 @@
#include "collectives.h"
#include "clique/AllReduceCliqueKernel.h" // [RCCL] AllReduce Clique-based kernel support
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
#ifdef ENABLE_PROFILING
auto devProf = comm->devProf;
uint64_t clk, t0 = 0ULL, ws;
if (tid == 0) clk = __rtc64();
auto devProf = comm->devProf;
uint64_t clk, t0 = 0ULL, ws;
if (tid == 0) clk = __builtin_amdgcn_s_memrealtime();
#endif
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm);
ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
ssize_t realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
ssize_t realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
int nelem;
int chunk;
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
int nelem;
int chunk;
// step 0: push data to next GPU
chunk = ring->devUserRanks[nranks-1];
offset = chunkOffset + chunk * realChunkSize;
nelem = min(realChunkSize, size-offset);
INIT_COUNTER;
prims.send(thisInput+offset, nelem);
ACCUMULATE_COUNTER(send);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
chunk = ring->devUserRanks[nranks-j];
// step 0: push data to next GPU
chunk = ring->devUserRanks[nranks-1];
offset = chunkOffset + chunk * realChunkSize;
nelem = min(realChunkSize, size-offset);
INIT_COUNTER;
prims.recvReduceSend(thisInput+offset, nelem);
ACCUMULATE_COUNTER(recvReduceSend);
}
prims.send(thisInput+offset, nelem);
ACCUMULATE_COUNTER(send);
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
chunk = ring->devUserRanks[0];
offset = chunkOffset + chunk * realChunkSize;
nelem = min(realChunkSize, size-offset);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
chunk = ring->devUserRanks[nranks-j];
offset = chunkOffset + chunk * realChunkSize;
nelem = min(realChunkSize, size-offset);
INIT_COUNTER;
prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
ACCUMULATE_COUNTER(directRecvReduceCopySend);
INIT_COUNTER;
prims.recvReduceSend(thisInput+offset, nelem);
ACCUMULATE_COUNTER(recvReduceSend);
}
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
chunk = ring->devUserRanks[nranks-j];
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
chunk = ring->devUserRanks[0];
offset = chunkOffset + chunk * realChunkSize;
nelem = min(realChunkSize, size-offset);
INIT_COUNTER;
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
ACCUMULATE_COUNTER(directRecvCopySend);
}
prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
ACCUMULATE_COUNTER(directRecvReduceCopySend);
// Make final copy from buffer to dest.
chunk = ring->devUserRanks[1];
offset = chunkOffset + chunk * realChunkSize;
nelem = min(realChunkSize, size-offset);
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
chunk = ring->devUserRanks[nranks-j];
offset = chunkOffset + chunk * realChunkSize;
nelem = min(realChunkSize, size-offset);
INIT_COUNTER;
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
ACCUMULATE_COUNTER(directRecvCopySend);
}
// Make final copy from buffer to dest.
chunk = ring->devUserRanks[1];
offset = chunkOffset + chunk * realChunkSize;
nelem = min(realChunkSize, size-offset);
// Final wait/copy.
INIT_COUNTER;
prims.directRecv(thisOutput+offset, offset, nelem);
ACCUMULATE_COUNTER(directRecv);
}
INIT_COUNTER;
prims.directRecv(thisOutput+offset, offset, nelem);
ACCUMULATE_COUNTER(directRecv);
}
#ifdef ENABLE_PROFILING
if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), __rtc64() - clk, __ATOMIC_SEQ_CST);
if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), __builtin_amdgcn_s_memrealtime() - clk, __ATOMIC_SEQ_CST);
#endif
}
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
int chunkSize = args->coll.lastChunkSize;
const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
if (loopSize > size) {
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
};
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* tree = &channel->tree;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
int chunkSize = args->coll.lastChunkSize;
const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
do {
struct ncclTree* tree = &channel->treeUp;
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
ncclPrimitives<UNROLL/2, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, 0, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
prims.send(thisInput+offset, nelem);
} else {
prims.recvReduceSend(thisInput+offset, nelem);
}
}
} while(0);
do {
struct ncclTree* tree = &channel->treeDn;
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
ncclPrimitives<UNROLL/2, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, &tree->up, tree->down, thisOutput, stepSize, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
prims.directSend(thisOutput+offset, offset, nelem);
} else if (tree->down[0] == -1) {
prims.directRecv(thisOutput+offset, offset, nelem);
} else {
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
}
}
} while(0);
}
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceCollNetKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
int chunkSize = args->coll.lastChunkSize;
const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
if (loopSize > size) {
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
if (blockIdx.x < nChannels) { // first half of the channels do reduce
struct ncclTree* tree = &channel->collTreeUp;
ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
prims.send(thisInput+offset, nelem);
} else {
prims.recvReduceSend(thisInput+offset, nelem);
}
}
}
if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
struct ncclTree* tree = &channel->collTreeDn;
ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
prims.send(thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
prims.recv(thisOutput+offset, nelem);
} else {
prims.recvCopySend(thisOutput+offset, nelem);
}
}
}
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T);
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*nranks*chunkSize;
const ssize_t size = args->coll.count;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
int nelem;
int chunk;
// step 0: push data to next GPU
chunk = ring->devUserRanks[nranks-1];
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
chunk = ring->devUserRanks[nranks-j];
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceSend(thisInput+offset, nelem);
if (loopSize > size) {
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
chunk = ring->devUserRanks[0];
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
chunk = ring->devUserRanks[nranks-j];
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvCopySend(thisOutput+offset, nelem);
}
// Make final copy from buffer to dest.
chunk = ring->devUserRanks[1];
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
LLprims.recv(thisOutput+offset, nelem);
}
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
if (loopSize > size) {
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
do {
struct ncclTree* tree = &channel->treeUp;
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
LLprims.send(thisInput+offset, nelem);
} else {
LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
} while(0);
do {
struct ncclTree* tree = &channel->treeDn;
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
LLprims.send(thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
LLprims.recv(thisOutput+offset, nelem);
} else {
LLprims.recvCopySend(thisOutput+offset, nelem);
}
}
} while(0);
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
if (loopSize > size) {
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
if (blockIdx.x < nChannels) { // first half of the channels do reduce
struct ncclTree* tree = &channel->collTreeUp;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
LLprims.send(thisInput+offset, nelem);
} else {
LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
}
if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
struct ncclTree* tree = &channel->collTreeDn;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
LLprims.send(thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
LLprims.recv(thisOutput+offset, nelem);
} else {
LLprims.recvCopySend(thisOutput+offset, nelem);
}
}
}
}
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
// [RCCL] RingLL128 is re-purposed as clique-based kernel
LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, FUNC, T, args);
// [/RCCL]
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* treeUp = &channel->treeUp;
struct ncclTree* treeDn = &channel->treeDn;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = args->coll.lastChunkSize;
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8;
const ssize_t loopSize = nChannels*chunkSize;
int nthreadsSplit = NCCL_LL128_SPLIT(nthreads);
const ssize_t size = args->coll.count;
if (loopSize > size) {
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
if (treeUp->up == -1) {
// ReduceAndBroadcast : max number of recv is 3, max number of send is 3
ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, treeUp->down, treeDn->down, stepSize, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
}
} else {
if (tid < nthreadsSplit) {
#if 1
if (tid < nthreads) {
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreadsSplit, treeUp->down, &treeUp->up, stepSize, channel, comm);
ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_DEV_ARITY, 1, 0, FUNC>
prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (treeUp->down[0] == -1) {
if (tree->up == -1) {
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
prims.send(thisInput+offset, nelem);
} else {
prims.recvReduceSend(thisInput+offset, nelem);
}
}
}
if (tid < nthreads) {
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_DEV_ARITY, 1, FUNC>
prims(tid, nthreads, &tree->up, tree->down, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
prims.directSend(thisOutput+offset, offset, nelem);
} else if (tree->down[0] == -1) {
prims.directRecv(thisOutput+offset, offset, nelem);
} else {
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
}
}
}
#else
int nthreadsSplit = nthreads/2;
if (nthreadsSplit == 256) nthreadsSplit += 64;
if (tree->up == -1) {
if (tid < nthreads) {
// ReduceAndBroadcast : max number of recv is 3, max number of send is 3
ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_DEV_ARITY, NCCL_MAX_DEV_ARITY, 1, FUNC>
prims(tid, nthreads, tree->down, tree->down, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
}
}
} else {
if (tid < nthreadsSplit) {
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_DEV_ARITY, 1, 0, FUNC>
prims(tid, nthreadsSplit, tree->down, &tree->up, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->down[0] == -1) {
prims.send(thisInput+offset, nelem);
} else {
prims.recvReduceSend(thisInput+offset, nelem);
}
}
} else {
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_DEV_ARITY, 1, FUNC>
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 2);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->down[0] == -1) {
prims.directRecv(thisOutput+offset, offset, nelem);
} else {
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
}
}
}
}
#endif
}
};
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_COLLNET, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* tree = &channel->collTree;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
int chunkSize = args->coll.lastChunkSize;
const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
if (loopSize > size) {
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
if (blockIdx.x < nChannels) { // first half of the channels do reduce
ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC>
prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
prims.send(thisInput+offset, nelem);
} else {
prims.recvReduceSend(thisInput+offset, nelem);
}
}
}
if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
prims.send(thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
prims.recv(thisOutput+offset, nelem);
} else {
prims.recvCopySend(thisOutput+offset, nelem);
}
}
}
}
};
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T);
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*nranks*chunkSize;
const ssize_t size = args->coll.count;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
int nelem;
int chunk;
// step 0: push data to next GPU
chunk = ring->devUserRanks[nranks-1];
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
chunk = ring->devUserRanks[nranks-j];
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
chunk = ring->devUserRanks[0];
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
chunk = ring->devUserRanks[nranks-j];
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvCopySend(thisOutput+offset, nelem);
}
// Make final copy from buffer to dest.
chunk = ring->devUserRanks[1];
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
LLprims.recv(thisOutput+offset, nelem);
}
}
};
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_TREE, NCCL_PROTO_LL, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* tree = &channel->tree;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
if (loopSize > size) {
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
do {
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
ncclLLPrimitives<T, FUNC, NCCL_MAX_DEV_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
LLprims.send(thisInput+offset, nelem);
} else {
LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
} else {
} while(0);
do {
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &treeDn->up, treeDn->down, stepSize, channel, comm);
ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_DEV_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (treeDn->down[0] == -1) {
if (tree->up == -1) {
LLprims.send(thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
LLprims.recv(thisOutput+offset, nelem);
} else {
LLprims.recvCopySend(thisOutput+offset, nelem);
}
}
} while(0);
}
};
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_COLLNET, NCCL_PROTO_LL, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* tree = &channel->collTree;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
if (loopSize > size) {
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
if (blockIdx.x < nChannels) { // first half of the channels do reduce
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
LLprims.send(thisInput+offset, nelem);
} else {
LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
}
if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
LLprims.send(thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
LLprims.recv(thisOutput+offset, nelem);
} else {
LLprims.recvCopySend(thisOutput+offset, nelem);
@@ -489,8 +471,87 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
}
}
}
}
};
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceCollNetLL128Kernel(struct CollectiveArgs* args) { }
#include "prims_ll128.h"
// [RCCL] RingLL128 is re-purposed as clique-based kernel
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_CLIQUE, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, FUNC, T, args);
}
};
// [/RCCL]
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_TREE, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* tree = &channel->tree;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = args->coll.lastChunkSize;
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8;
const ssize_t loopSize = nChannels*chunkSize;
int nthreadsSplit = NCCL_LL128_SPLIT(nthreads);
const ssize_t size = args->coll.count;
if (loopSize > size) {
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
if (tree->up == -1) {
// ReduceAndBroadcast : max number of recv is 3, max number of send is 3
ncclLL128Primitives<T, FUNC, NCCL_MAX_DEV_ARITY, NCCL_MAX_DEV_ARITY> LLprims(tid, nthreads, tree->down, tree->down, stepSize, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
}
} else {
if (tid < nthreadsSplit) {
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
ncclLL128Primitives<T, FUNC, NCCL_MAX_DEV_ARITY, 1> LLprims(tid, nthreadsSplit, tree->down, &tree->up, stepSize, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->down[0] == -1) {
LLprims.send(thisInput+offset, nelem);
} else {
LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
} else {
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_DEV_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, stepSize, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->down[0] == -1) {
LLprims.recv(thisOutput+offset, nelem);
} else {
LLprims.recvCopySend(thisOutput+offset, nelem);
}
}
}
}
}
};
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_COLLNET, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) { }
};
+1 -2
Voir le fichier
@@ -8,5 +8,4 @@
#include "common.h"
#include "collectives.h"
IMPL_COLL_FUNC(ncclAllToAll, copy, FuncSum, i8, int8_t);
IMPL_COLL_KERN(ncclAllToAll, copy, FuncSum, i8, int8_t, 0);
IMPL_COLL_FUNC(AllToAll, RING, SIMPLE, Sum, int8_t);
+68 -67
Voir le fichier
@@ -9,80 +9,81 @@
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllToAllKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->coll.count;
const int nranks = comm->nRanks;
const int bid = args->coll.bid;
const int rank = ring->devUserRanks[0];
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLTOALL_CHUNKSTEPS;
const int peersPerChan = DIVUP(nranks, nChannels);
const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllToAll, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->coll.count;
const int nranks = comm->nRanks;
const int bid = args->coll.bid;
const int rank = ring->devUserRanks[0];
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLTOALL_CHUNKSTEPS;
const int peersPerChan = DIVUP(nranks, nChannels);
const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
for (int i = 0; i < peersPerChan; i++) {
if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
(peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
continue;
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
int nelem = min(realChunkSize, size-chunkOffset);
if ((blockIdx.x*peersPerChan+i)%nranks == 0) {
if (tid < nthreads && thisInput != thisOutput) {
const T* sendbuff = thisInput+chunkOffset+rank*size;
T* recvbuff = thisOutput+chunkOffset+rank*size;
// local copy
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
}
}
}
}
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
for (int i = 0; i < peersPerChan; i++) {
if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
(peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
continue;
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
int nelem = min(realChunkSize, size-chunkOffset);
if ((blockIdx.x*peersPerChan+i)%nranks == 0) {
if (tid < nthreads && thisInput != thisOutput) {
const T* sendbuff = thisInput+chunkOffset+rank*size;
T* recvbuff = thisOutput+chunkOffset+rank*size;
// local copy
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
if ((blockIdx.x*peersPerChan+i)%nranks != 0) {
int nthreadsSplit = nthreads/2;
if (tid < nthreadsSplit ) {
int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 0, 1, 0, FUNC>
prims(tid, nthreadsSplit, NULL, &peerSend, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
int nelem = min(realChunkSize, size-chunkOffset);
ssize_t send_offset = chunkOffset + peerSend*size;
prims.send(thisInput+send_offset, nelem);
}
} else {
int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 1, 0, 0, FUNC>
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peerRecv, NULL, NULL, stepSize, channel, comm, ncclShmem->ptrs, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
int nelem = min(realChunkSize, size-chunkOffset);
ssize_t recv_offset = chunkOffset + peerRecv*size;
prims.recv(thisOutput+recv_offset, nelem);
}
}
}
}
}
for (int i = 0; i < peersPerChan; i++) {
if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
(peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
continue;
if ((blockIdx.x*peersPerChan+i)%nranks != 0) {
int nthreadsSplit = nthreads/2;
int peerNone[2] = {-1,-1};
if (tid < nthreadsSplit ) {
int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 2, 1, 0, FUNC>
prims(tid, nthreadsSplit, peerNone, &peerSend, NULL, stepSize, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
int nelem = min(realChunkSize, size-chunkOffset);
ssize_t send_offset = chunkOffset + peerSend*size;
prims.send(thisInput+send_offset, nelem);
}
} else {
int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 1, 2, 0, FUNC>
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peerRecv, peerNone, NULL, stepSize, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
int nelem = min(realChunkSize, size-chunkOffset);
ssize_t recv_offset = chunkOffset + peerRecv*size;
prims.recv(thisOutput+recv_offset, nelem);
}
}
}
}
}
};
Fichier exécutable → Fichier normal
+1 -2
Voir le fichier
@@ -8,5 +8,4 @@
#include "common.h"
#include "collectives.h"
IMPL_COLL_FUNC(ncclAllToAllv, copy, FuncSum, i8, int8_t);
IMPL_COLL_KERN(ncclAllToAllv, copy, FuncSum, i8, int8_t, 0);
IMPL_COLL_FUNC(AllToAllv, RING, SIMPLE, Sum, int8_t);
Fichier exécutable → Fichier normal
+76 -74
Voir le fichier
@@ -9,87 +9,89 @@
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllToAllvKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->a2av.nThreads;
const int nChannels = args->a2av.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t typesize = args->a2av.count;
const int nranks = comm->nRanks;
const int bid = args->a2av.bid;
const int rank = ring->devUserRanks[0];
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLTOALLV_CHUNKSTEPS;
const int peersPerChan = DIVUP(nranks, nChannels);
const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllToAllv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int nChannels = args->a2av.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t typesize = args->a2av.count;
const int nranks = comm->nRanks;
const int bid = args->a2av.bid;
const int rank = ring->devUserRanks[0];
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLTOALLV_CHUNKSTEPS;
const int peersPerChan = DIVUP(nranks, nChannels);
const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
size_t* params = channel->a2avParams + nranks*4*args->index;
size_t *sendcounts = params;
size_t *sdispls = params + nranks;
size_t *recvcounts = params + nranks*2;
size_t *rdispls = params + nranks*3;
ssize_t size = sendcounts[rank]*typesize;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
for (int i = 0; i < peersPerChan; i++) {
if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
(peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
continue;
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
int nelem = min(realChunkSize, size-chunkOffset);
if ((blockIdx.x*peersPerChan+i)%nranks == 0) {
if (tid < nthreads && thisInput != thisOutput) {
const T* sendbuff = thisInput+chunkOffset+sdispls[rank]*typesize;
T* recvbuff = thisOutput+chunkOffset+rdispls[rank]*typesize;
// local copy
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
}
}
}
}
size_t *sendcounts = args->a2av.extra;
size_t *sdispls = args->a2av.extra + nranks;
size_t *recvcounts = args->a2av.extra + nranks*2;
size_t *rdispls = args->a2av.extra + nranks*3;
ssize_t size = sendcounts[rank]*typesize;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
for (int i = 0; i < peersPerChan; i++) {
if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
(peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
continue;
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
int nelem = min(realChunkSize, size-chunkOffset);
if ((blockIdx.x*peersPerChan+i)%nranks == 0) {
if (tid < nthreads && thisInput != thisOutput) {
const T* sendbuff = thisInput+chunkOffset+sdispls[rank]*typesize;
T* recvbuff = thisOutput+chunkOffset+rdispls[rank]*typesize;
// local copy
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
if ((blockIdx.x*peersPerChan+i)%nranks != 0) {
int nthreadsSplit = nthreads/2;
if (tid < nthreadsSplit ) {
int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
ncclPrimitives<UNROLL, ALLTOALLV_CHUNKSTEPS/ALLTOALLV_SLICESTEPS, ALLTOALLV_SLICESTEPS, T, 0, 1, 0, FUNC>
prims(tid, nthreadsSplit, NULL, &peerSend, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
size = sendcounts[peerSend]*typesize;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
ALIGN_SIZE(realChunkSize, nthreadsSplit*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
int nelem = min(realChunkSize, size-chunkOffset);
ssize_t send_offset = chunkOffset + sdispls[peerSend]*typesize;
prims.send(thisInput+send_offset, nelem);
}
} else {
int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
ncclPrimitives<UNROLL, ALLTOALLV_CHUNKSTEPS/ALLTOALLV_SLICESTEPS, ALLTOALLV_SLICESTEPS, T, 1, 0, 0, FUNC>
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peerRecv, NULL, NULL, stepSize, channel, comm, ncclShmem->ptrs, 1);
size = recvcounts[peerRecv]*typesize;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
ALIGN_SIZE(realChunkSize, (nthreads-nthreadsSplit)*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
int nelem = min(realChunkSize, size-chunkOffset);
ssize_t recv_offset = chunkOffset + rdispls[peerRecv]*typesize;
prims.recv(thisOutput+recv_offset, nelem);
}
}
}
}
}
for (int i = 0; i < peersPerChan; i++) {
if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
(peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
continue;
if ((blockIdx.x*peersPerChan+i)%nranks != 0) {
int nthreadsSplit = nthreads/2;
int peerNone[2] = {-1,-1};
if (tid < nthreadsSplit ) {
int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
ncclPrimitives<UNROLL, ALLTOALLV_CHUNKSTEPS/ALLTOALLV_SLICESTEPS, ALLTOALLV_SLICESTEPS, T, 2, 1, 0, FUNC>
prims(tid, nthreadsSplit, peerNone, &peerSend, NULL, stepSize, channel, comm);
size = sendcounts[peerSend]*typesize;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
ALIGN_SIZE(realChunkSize, nthreadsSplit*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
int nelem = min(realChunkSize, size-chunkOffset);
ssize_t send_offset = chunkOffset + sdispls[peerSend]*typesize;
prims.send(thisInput+send_offset, nelem);
}
} else {
int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
ncclPrimitives<UNROLL, ALLTOALLV_CHUNKSTEPS/ALLTOALLV_SLICESTEPS, ALLTOALLV_SLICESTEPS, T, 1, 2, 0, FUNC>
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peerRecv, peerNone, NULL, stepSize, channel, comm);
size = recvcounts[peerRecv]*typesize;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
ALIGN_SIZE(realChunkSize, (nthreads-nthreadsSplit)*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
int nelem = min(realChunkSize, size-chunkOffset);
ssize_t recv_offset = chunkOffset + rdispls[peerRecv]*typesize;
prims.recv(thisOutput+recv_offset, nelem);
}
}
}
}
}
};
+2 -2
Voir le fichier
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,4 +8,4 @@
#include "common.h"
#include "collectives.h"
IMPL_COLL_C(ncclBroadcast, ncclCollBroadcast);
IMPL_COLL_C(Broadcast);
+129 -151
Voir le fichier
@@ -9,177 +9,155 @@
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->coll.root;
#ifdef ENABLE_PROFILING
auto devProf = comm->devProf;
uint64_t clk, t0 = 0ULL, ws;
if (tid == 0) clk = __rtc64();
#endif
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->coll.root;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
if (rank == root) {
if (thisInput == thisOutput) {
INIT_COUNTER;
prims.send(thisInput+offset, nelem);
ACCUMULATE_COUNTER(send);
} else {
INIT_COUNTER;
prims.copySend(thisInput+offset, thisOutput+offset, nelem);
ACCUMULATE_COUNTER(copySend);
if (rank == root) {
if (thisInput == thisOutput) {
prims.send(thisInput+offset, nelem);
} else {
prims.copySend(thisInput+offset, thisOutput+offset, nelem);
}
} else if (nextRank == root) {
prims.recv(thisOutput+offset, nelem);
} else {
prims.recvCopySend(thisOutput+offset, nelem);
}
}
} else if (nextRank == root) {
INIT_COUNTER;
prims.recv(thisOutput+offset, nelem);
ACCUMULATE_COUNTER(recv);
} else {
INIT_COUNTER;
prims.recvCopySend(thisOutput+offset, nelem);
ACCUMULATE_COUNTER(recvCopySend);
}
}
#ifdef ENABLE_PROFILING
if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), __rtc64() - clk, __ATOMIC_SEQ_CST);
#endif
}
};
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->coll.root;
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastCollNetKernel(struct CollectiveArgs* args) { }
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->coll.root;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->coll.lastChunkSize;
}
ssize_t offset = gridOffset + bid*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->coll.lastChunkSize;
}
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (rank == root) {
if (thisInput == thisOutput) {
LLprims.send(thisInput+offset, nelem);
} else {
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
int nelem = min(chunkSize, size-offset);
if (rank == root) {
if (thisInput == thisOutput) {
LLprims.send(thisInput+offset, nelem);
} else {
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
}
} else if (nextRank == root) {
LLprims.recv(thisOutput + offset, nelem);
} else {
LLprims.recvCopySend(thisOutput + offset, nelem);
}
}
} else if (nextRank == root) {
LLprims.recv(thisOutput + offset, nelem);
} else {
LLprims.recvCopySend(thisOutput + offset, nelem);
}
}
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastCollNetLLKernel(struct CollectiveArgs* args) { }
};
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->coll.root;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->coll.root;
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
ssize_t offset = gridOffset + bid*chunkSize;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (rank == root) {
if (thisInput == thisOutput) {
LLprims.send(thisInput+offset, nelem);
} else {
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
int nelem = min(chunkSize, size-offset);
if (rank == root) {
if (thisInput == thisOutput) {
LLprims.send(thisInput+offset, nelem);
} else {
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
}
} else if (nextRank == root) {
LLprims.recv(thisOutput + offset, nelem);
} else {
LLprims.recvCopySend(thisOutput + offset, nelem);
}
}
} else if (nextRank == root) {
LLprims.recv(thisOutput + offset, nelem);
} else {
LLprims.recvCopySend(thisOutput + offset, nelem);
}
}
}
};
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { }
template<int PROTO, class REDOP, typename T, int UNROLL>
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
};
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastCollNetLL128Kernel(struct CollectiveArgs* args) { }
template<int PROTO, class REDOP, typename T, int UNROLL>
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
};
+302 -253
Voir le fichier
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,154 +11,129 @@
#include "collectives.h"
#include "devcomm.h"
__device__
inline __attribute((always_inline))
long long int __rtc64() {
#if __HIP__
return (long long int) __builtin_amdgcn_s_memrealtime();
#else
return (long long int) __clock_u64();
#endif
}
#define COLL_UNROLL 2
#define NCCL_MAX_DEV_ARITY NCCL_MAX_TREE_ARITY
// Exit If Abort Barrier across CTA: make sure all threads exit consistently
// Each thread sets a predicate to true if abort == 1
// all CTA's threads enter the barrier and do a popc on their predicates being True
// If any of the thread's predicate was True, all the threads call exit()
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#define exitIfAbortBarrier(abort, abortCount) \
if (abort) __atomic_fetch_add(abortCount, 1, __ATOMIC_SEQ_CST); \
__syncthreads(); \
if (LOAD(abortCount)) { /*asm volatile ("s_endpgm");*/ return false; }
#define __syncwarp()
#else
static inline __device__ void exitIfAbortBarrier(int abort) {
uint32_t popc;
asm ("{");
asm volatile (" .reg .pred barr_pred;");
asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
asm volatile (" bar.red.popc.u32 %0, 13, barr_pred;" : "=r"(popc));
asm ("}");
if (popc) { asm volatile ("exit;"); }
}
#endif
#define NCCL_FUNC5(coll, op, dtype) \
NCCL_COLL_NAME(coll##LL, op, dtype), \
NCCL_COLL_NAME(coll##LL, op, dtype), \
NCCL_COLL_NAME(coll, op, dtype)
#define NCCL_FUNC5(func, algo, redop, type) \
NCCL_FUNC_NAME(func, algo, LL, redop, type), \
NCCL_FUNC_NAME(func, algo, LL, redop, type), \
NCCL_FUNC_NAME(func, algo, SIMPLE, redop, type)
#define NCCL_FUNC4(coll, op, dtype) \
NCCL_FUNC5(coll##Tree, op, dtype), \
NCCL_FUNC5(coll##Ring, op, dtype), \
NCCL_FUNC5(coll##CollNet, op, dtype)
#define NCCL_FUNC4(func, redop, type) \
NCCL_FUNC5(func, TREE, redop, type), \
NCCL_FUNC5(func, RING, redop, type), \
NCCL_FUNC5(func, COLLNET, redop, type)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, u8), \
NCCL_FUNC4(coll, op, i32), \
NCCL_FUNC4(coll, op, u32), \
NCCL_FUNC4(coll, op, i64), \
NCCL_FUNC4(coll, op, u64), \
NCCL_FUNC4(coll, op, f16), \
NCCL_FUNC4(coll, op, f32), \
NCCL_FUNC4(coll, op, f64), \
NCCL_FUNC4(coll, op, b16)
#define NCCL_FUNCS3B(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8)
#define NCCL_FUNCS3A(func, redop) \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, uint8_t), \
NCCL_FUNC4(func, redop, int32_t), \
NCCL_FUNC4(func, redop, uint32_t), \
NCCL_FUNC4(func, redop, int64_t), \
NCCL_FUNC4(func, redop, uint64_t), \
NCCL_FUNC4(func, redop, half), \
NCCL_FUNC4(func, redop, float), \
NCCL_FUNC4(func, redop, double), \
NCCL_FUNC4(func, redop, rccl_bfloat16)
#define NCCL_FUNCS3B(func, redop) \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t)
// Must be consistent with ncclRedOp_t
#define NCCL_FUNCS2A(coll) \
NCCL_FUNCS3A(coll, sum ), \
NCCL_FUNCS3A(coll, prod), \
NCCL_FUNCS3A(coll, max ), \
NCCL_FUNCS3A(coll, min )
#define NCCL_FUNCS2B(coll) \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy)
#define NCCL_FUNCS2A(func) \
NCCL_FUNCS3A(func, Sum ), \
NCCL_FUNCS3A(func, Prod), \
NCCL_FUNCS3A(func, Max ), \
NCCL_FUNCS3A(func, Min )
#define NCCL_FUNCS2B(func) \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum)
// [RCCL] Adding clique-based kernels for AllReduce, in-place of unused RingLL28 kernels
#define NCCL_FUNC5B(coll, op, dtype) \
NCCL_COLL_NAME(coll##LL, op, dtype), \
NCCL_COLL_NAME(coll##LL128, op, dtype), \
NCCL_COLL_NAME(coll, op, dtype)
#define NCCL_FUNC5B(func, algo, redop, type) \
NCCL_FUNC_NAME(func, algo, LL, redop, type), \
NCCL_FUNC_NAME(func, algo, LL128, redop, type), \
NCCL_FUNC_NAME(func, algo, SIMPLE, redop, type)
#define NCCL_FUNC4B(coll, op, dtype) \
NCCL_FUNC5(coll##Tree, op, dtype), \
NCCL_FUNC5B(coll##Ring, op, dtype), \
NCCL_FUNC5(coll##CollNet, op, dtype)
#define NCCL_FUNC4B(func, redop, type) \
NCCL_FUNC5(func, TREE, redop, type), \
NCCL_FUNC5B(func, RING, redop, type), \
NCCL_FUNC5(func, COLLNET, redop, type)
#define NCCL_FUNCS3C(coll, op) \
NCCL_FUNC4B(coll, op, i8), \
NCCL_FUNC4B(coll, op, u8), \
NCCL_FUNC4B(coll, op, i32), \
NCCL_FUNC4B(coll, op, u32), \
NCCL_FUNC4B(coll, op, i64), \
NCCL_FUNC4B(coll, op, u64), \
NCCL_FUNC4B(coll, op, f16), \
NCCL_FUNC4B(coll, op, f32), \
NCCL_FUNC4B(coll, op, f64), \
NCCL_FUNC4B(coll, op, b16)
#define NCCL_FUNCS2C(coll) \
NCCL_FUNCS3C(coll, sum ), \
NCCL_FUNCS3C(coll, prod), \
NCCL_FUNCS3C(coll, max ), \
NCCL_FUNCS3C(coll, min )
// [/RCCL]
#define NCCL_FUNCS3C(func, redop) \
NCCL_FUNC4B(func, redop, int8_t), \
NCCL_FUNC4B(func, redop, uint8_t), \
NCCL_FUNC4B(func, redop, int32_t), \
NCCL_FUNC4B(func, redop, uint32_t), \
NCCL_FUNC4B(func, redop, int64_t), \
NCCL_FUNC4B(func, redop, uint64_t), \
NCCL_FUNC4B(func, redop, half), \
NCCL_FUNC4B(func, redop, float), \
NCCL_FUNC4B(func, redop, double), \
NCCL_FUNC4B(func, redop, rccl_bfloat16)
#define NCCL_FUNCS2C(func) \
NCCL_FUNCS3C(func, Sum ), \
NCCL_FUNCS3C(func, Prod), \
NCCL_FUNCS3C(func, Max ), \
NCCL_FUNCS3C(func, Min )
// Must be consistent with ncclFunc_t
#define NCCL_FUNCS() { \
NCCL_FUNCS2B(ncclBroadcast), \
NCCL_FUNCS2A(ncclReduce), \
NCCL_FUNCS2B(ncclAllGather), \
NCCL_FUNCS2A(ncclReduceScatter), \
NCCL_FUNCS2C(ncclAllReduce), \
NCCL_COLL_NAME(ncclGather, copy, i8), \
NCCL_COLL_NAME(ncclScatter, copy, i8), \
NCCL_COLL_NAME(ncclAllToAll, copy, i8), \
NCCL_COLL_NAME(ncclAllToAllv, copy, i8), \
NCCL_COLL_NAME(ncclSendRecv, copy, i8) }
NCCL_FUNCS2B(Broadcast), \
NCCL_FUNCS2A(Reduce), \
NCCL_FUNCS2B(AllGather), \
NCCL_FUNCS2A(ReduceScatter), \
NCCL_FUNCS2C(AllReduce), \
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), \
NCCL_FUNC_NAME(AllToAll, RING, SIMPLE, Sum, int8_t), \
NCCL_FUNC_NAME(AllToAllv, RING, SIMPLE, Sum, int8_t) }
// [/RCCL]
// Must be consistent with the ncclFuncSet enum
using ncclKernelFunc_t = void (*)(struct CollectiveArgs*);
using ncclKernelFunc_t = void (*)(struct ncclWorkElem* args);
static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
// Don't try to initialize the host shadow copy of this device-side global
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
#if defined(__HIP_DEVICE_COMPILE__)
NCCL_FUNCS2B(ncclBroadcast),
NCCL_FUNCS2A(ncclReduce),
NCCL_FUNCS2B(ncclAllGather),
NCCL_FUNCS2A(ncclReduceScatter),
NCCL_FUNCS2C(ncclAllReduce),
NCCL_COLL_NAME(ncclGather, copy, i8),
NCCL_COLL_NAME(ncclScatter, copy, i8),
NCCL_COLL_NAME(ncclAllToAll, copy, i8),
NCCL_COLL_NAME(ncclAllToAllv, copy, i8),
NCCL_COLL_NAME(ncclSendRecv, copy, i8)
NCCL_FUNCS2B(Broadcast),
NCCL_FUNCS2A(Reduce),
NCCL_FUNCS2B(AllGather),
NCCL_FUNCS2A(ReduceScatter),
NCCL_FUNCS2C(AllReduce),
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
NCCL_FUNC_NAME(AllToAll, RING, SIMPLE, Sum, int8_t),
NCCL_FUNC_NAME(AllToAllv, RING, SIMPLE, Sum, int8_t),
#endif
};
template<unsigned short f, unsigned short l>
struct Caller {
static __device__ __host__
void call(ncclColl* const c) noexcept
void call(struct ncclWorkElem* const c) noexcept
{
constexpr unsigned short m = f + (l - f) / 2;
@@ -169,78 +144,88 @@ struct Caller {
template<unsigned short f>
struct Caller<f, f + 1>{
static __device__ __host__
void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
void call(struct ncclWorkElem* const c) noexcept { ncclFuncs[f](c); }
};
static_assert(FUNC_INDEX_P2P == 1800, "Wrong P2P function index");
inline
__device__
void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept {
void NCCL_CALL_FUNCTIONS(struct ncclWorkElem* const c) noexcept {
if (c->funcIndex < 360) {
if (c->funcIndex % 9 == 0) ncclBroadcastTreeLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 1) ncclBroadcastTreeLL128_copy_i8(&c->args);
else if (c->funcIndex % 9 == 2) ncclBroadcastTree_copy_i8(&c->args);
else if (c->funcIndex % 9 == 3) ncclBroadcastRingLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 4) ncclBroadcastRingLL128_copy_i8(&c->args);
else if (c->funcIndex % 9 == 5) ncclBroadcastRing_copy_i8(&c->args);
else if (c->funcIndex % 9 == 6) ncclBroadcastCollNetLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 7) ncclBroadcastCollNetLL128_copy_i8(&c->args);
else ncclBroadcastCollNet_copy_i8(&c->args);
if (c->funcIndex % 9 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t(c);
else if (c->funcIndex % 9 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t(c);
else if (c->funcIndex % 9 == 6) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 7) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t(c);
else ncclFunction_Broadcast_COLLNET_SIMPLE_Sum_int8_t(c);
}
else if (c->funcIndex < 720) Caller<360, 720>::call(c);
else if (c->funcIndex < 1080) {
if (c->funcIndex % 9 == 0) ncclAllGatherTreeLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 1) ncclAllGatherTreeLL128_copy_i8(&c->args);
else if (c->funcIndex % 9 == 2) ncclAllGatherTree_copy_i8(&c->args);
else if (c->funcIndex % 9 == 3) ncclAllGatherRingLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 4) ncclAllGatherRingLL128_copy_i8(&c->args);
else if (c->funcIndex % 9 == 5) ncclAllGatherRing_copy_i8(&c->args);
else if (c->funcIndex % 9 == 6) ncclAllGatherCollNetLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 7) ncclAllGatherCollNetLL128_copy_i8(&c->args);
else ncclAllGatherCollNet_copy_i8(&c->args);
if (c->funcIndex % 9 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t(c);
else if (c->funcIndex % 9 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t(c);
else if (c->funcIndex % 9 == 6) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t(c);
else if (c->funcIndex % 9 == 7) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t(c);
else ncclFunction_AllGather_COLLNET_SIMPLE_Sum_int8_t(c);
}
else if (c->funcIndex < 1800) Caller<1080, 1800>::call(c);
else if (c->funcIndex == 1800) {
ncclGather_copy_i8(&c->args);
}
else if (c->funcIndex == 1801) {
ncclScatter_copy_i8(&c->args);
}
else if (c->funcIndex == 1802) {
ncclAllToAll_copy_i8(&c->args);
}
else if (c->funcIndex == 1803) {
ncclAllToAllv_copy_i8(&c->args);
}
else ncclSendRecv_copy_i8(&c->args);
else if (c->funcIndex == 1800) ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_t(c);
else if (c->funcIndex == 1801) ncclFunction_AllToAll_RING_SIMPLE_Sum_int8_t(c);
else if (c->funcIndex == 1802) ncclFunction_AllToAllv_RING_SIMPLE_Sum_int8_t(c);
}
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) {
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
int* d = (int*)dst;
int* s = (int*)src;
for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
}
static __device__ bool load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, struct ncclDevComm* comm, uint32_t* abortCount) {
// Check whether the last operation was aborted and make sure all threads exit
int abort = tid == 0 ? *(comm->abortFlag) : 0;
exitIfAbortBarrier(abort, abortCount);
load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid, abortCount);
static __device__ bool load_coll(struct ncclWork* localWork, struct ncclWork* hostWork, int tid, struct ncclDevComm* comm, uint32_t* abortCount) {
__syncthreads();
if (tid == 0) hostColl->active = 0;
load_parallel(localWork, hostWork, sizeof(struct ncclWork), tid);
// Check whether the last operation was aborted and make sure all threads exit
int abort = tid == 0 ? LOAD(comm->abortFlag) : 0;
exitIfAbortBarrier(abort, abortCount);
if (tid == 0) hostWork->elems[0].active = 0;
return true;
}
template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL>
class ncclFunction {
public:
__device__ void run(struct ncclWorkElem* args) {}
};
#ifdef ENABLE_COLLTRACE
#define traceColl(fIdx) \
uint32_t pos = __atomic_fetch_add(comm->collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
comm->collTrace[pos].timeStamp = __rtc64(); \
comm->collTrace[pos].opCount = localColl.args.opCount; \
comm->collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
comm->collTrace[pos].opCount = w->opCount; \
comm->collTrace[pos].bid = bid; \
comm->collTrace[pos].funcIndex = fIdx;
comm->collTrace[pos].funcIndex = fIdx; \
if (fIdx == FUNC_INDEX_P2P) { \
comm->collTrace[pos].p2p.nThreads = w->p2p.nThreads; \
comm->collTrace[pos].p2p.delta = (uint16_t)(w->p2p.delta); \
} else if (fIdx == FUNC_INDEX_A2AV) { \
comm->collTrace[pos].coll.nThreads = w->nThreads; \
comm->collTrace[pos].coll.bid = w->a2av.bid; \
comm->collTrace[pos].coll.nChannels = w->a2av.nChannels; \
} else { \
comm->collTrace[pos].coll.nThreads = w->nThreads; \
comm->collTrace[pos].coll.bid = w->coll.bid; \
comm->collTrace[pos].coll.nChannels = w->coll.nChannels; \
}
#define traceKernelLaunch(fIdx) { \
traceColl(fIdx); \
comm->collTrace[pos].type = ncclCollTraceKernelLaunchType; \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (comm->collTrace[pos].data_0)); \
comm->collTrace[pos].type = ncclCollTraceKernelLaunchType; \
}
#define traceCollEnd(fIdx) { \
traceColl(fIdx); \
@@ -250,124 +235,188 @@ static __device__ bool load_coll(struct ncclColl* localColl, struct ncclColl* ho
traceColl(fIdx); \
comm->collTrace[pos].type = ncclCollTraceAbortType; \
}
// traceData(int16_t data2, uint32_t data4, uint64_t data8_0, uint64_t data8_1)
#define traceData(data2, data4, data8_0, data8_1) { \
uint32_t pos = __atomic_fetch_add(comm->collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
comm->collTrace[pos].bid = blockIdx.x; \
comm->collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
comm->collTrace[pos].funcIndex = data2; \
comm->collTrace[pos].data_0 = data4; \
comm->collTrace[pos].opCount = data8_0; \
comm->collTrace[pos].data_1 = data8_1; \
comm->collTrace[pos].type = ncclCollTraceDataType; \
}
#else
#define traceKernelLaunch()
#define traceCollEnd()
#define traceAbort()
#define traceKernelLaunch(fIdx)
#define traceCollEnd(fIdx)
#define traceAbort(fIdx)
#define traceData(data2, data4, data8_0, data8_1)
#endif
extern __device__ volatile uint64_t* ncclShmem;
#define MAXWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
struct ncclShmemPtrs {
void* srcs[NCCL_MAX_DEV_ARITY+1];
void* dsts[NCCL_MAX_DEV_ARITY+1];
uint64_t barrier;
uint64_t barrier_next[MAXWARPS];
};
struct ncclShmemData {
union {
#ifdef ENABLE_LL128
#define ALLOCATE_SHMEM \
__shared__ volatile uint64_t shmem[NCCL_LL128_SHMEM_SIZE]; \
ncclShmem = shmem; \
__shared__ uint32_t sync[NCCL_LL128_MAX_NTHREADS/WARP_SIZE];
volatile uint64_t data[NCCL_LL128_SHMEM_SIZE];
#else
#define ALLOCATE_SHMEM \
uint32_t* sync = 0;
volatile uint64_t* data;
#endif
struct ncclShmemPtrs ptrs[NCCL_MAX_GROUPS];
};
uint32_t sync[MAXWARPS];
struct ncclWork localWork;
};
/* Functions for aggregation case */
#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(args); \
extern __device__ struct ncclShmemData *ncclShmem;
template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL, int FINDEX, bool COLLTRACE>
__device__ void ncclKernel(struct ncclWorkElem first) {
int tid = threadIdx.x;
int bid = blockIdx.x;
__shared__ struct ncclShmemData shmem;
ncclShmem = &shmem;
__shared__ uint32_t abortCount;
if (tid == 0) {
abortCount = 0;
for (auto i = 0; i < NCCL_MAX_GROUPS; i++) {
shmem.ptrs[i].barrier = 0;
for (auto j = 0; j < MAXWARPS; j++) shmem.ptrs[i].barrier_next[j] = 0;
}
}
__syncthreads();
auto f = ncclFunction<FUNCTION, ALGO, PROTO, REDOP, T, UNROLL>();
struct ncclDevComm* comm = first.comm;
struct ncclChannel* channel = comm->channels+bid;
struct ncclWorkElem* w = NULL;
uint16_t index = first.index;
bool firstLaunch = true;
if (bid == 0 && first.funcIndex != FUNC_INDEX_P2P) w = &first;
while (1) {
if (w == NULL) {
w = shmem.localWork.elems;
if (!load_coll(&shmem.localWork, channel->workFifo+index, tid, comm, &abortCount)) {
if (COLLTRACE && tid == 0) traceAbort(0xffff);
return;
}
if (COLLTRACE && tid == 0) {
if (firstLaunch) traceKernelLaunch(w->funcIndex);
if (!firstLaunch) traceCollEnd(w->funcIndex);
firstLaunch = false;
}
} else if (COLLTRACE && tid == 0) {
traceKernelLaunch(w->funcIndex);
firstLaunch = false;
}
if (tid < w->nThreads) {
if (w->funcIndex == FINDEX) {
f.run(w);
} else {
NCCL_CALL_FUNCTIONS(w);
}
}
index = (index+1) % NCCL_MAX_OPS;
if (w->active == 2) {
if (COLLTRACE && tid == 0) traceCollEnd(0xffff);
return;
}
w = NULL;
}
}
/* Kernels with the first operation inlined */
#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
#define IMPL_COLL_KERN(func, algo, proto, redop, type, fIndex) \
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclDevComm* comm) { \
int tid = threadIdx.x; \
int bid = blockIdx.x; \
ALLOCATE_SHMEM; \
__shared__ struct ncclColl localColl; \
__shared__ uint32_t abortCount; \
__shared__ uint64_t barrier[MAXBARRIERS]; \
__shared__ uint64_t barrier_next[MAXBARRIERS*MAXWARPS]; \
if (tid == 0) abortCount = 0; \
__syncthreads(); \
\
struct ncclChannel* channel = comm->channels+bid; \
if (tid == 0) { \
channel->sync = sync; \
channel->barrier = barrier; \
channel->barrier_next = barrier_next; \
for (auto i = 0; i < MAXBARRIERS; i++) barrier[i] = 0; \
for (auto i = 0; i < MAXBARRIERS*MAXWARPS; i++) barrier_next[i] = 0; \
} \
if (!load_coll(&localColl, channel->collectives+channel->collFifoHead, tid, comm, &abortCount)) { \
if (tid == 0) traceAbort(-1); \
return; \
} \
if (tid == 0) traceKernelLaunch(localColl.funcIndex); \
while (1) { \
if (tid < localColl.args.common.nThreads) { \
if (localColl.funcIndex == fIndex) { \
coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&localColl.args); \
} else { \
NCCL_CALL_FUNCTIONS(&localColl); \
} \
} \
int nextIndex = localColl.nextIndex; \
if (tid == 0) channel->collFifoHead = nextIndex; \
\
if (localColl.active == 2) { \
if (tid == 0) traceCollEnd(-1); \
return; \
} \
\
/* Load next collective operation*/ \
if (!load_coll(&localColl, channel->collectives+nextIndex, tid, comm, &abortCount)) { \
if (tid == 0) traceAbort(-1); \
break; \
} \
if (tid == 0) traceCollEnd(localColl.funcIndex); \
} \
__global__ void NCCL_KERN_NAME(func, algo, proto, redop, type)(struct ncclWorkElem first) { \
if (first.comm->collTraceThread) \
ncclKernel<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL, fIndex, true>(first); \
else \
ncclKernel<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL, fIndex, false>(first); \
}
#define IMPL_COLL_KERN_sum(coll, op, ncclFunc, dtype, ctype, fIndex) \
IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
#define IMPL_COLL_KERN_copy(coll, op, ncclFunc, dtype, ctype, fIndex) \
IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
#define IMPL_COLL_KERN_prod(coll, op, ncclFunc, dtype, ctype, fIndex)
#define IMPL_COLL_KERN_min(coll, op, ncclFunc, dtype, ctype, fIndex)
#define IMPL_COLL_KERN_max(coll, op, ncclFunc, dtype, ctype, fIndex)
// Examples : AllReduce, RING, LL, Sum, uint8
/* Functions for aggregation case */
#define IMPL_COLL_FUNC(func, algo, proto, redop, type) \
__device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, redop, type)(struct ncclWorkElem* args) { \
auto f = ncclFunction<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL>(); \
f.run(args); \
}
// Only generate inline kernels for LL
#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
IMPL_COLL_FUNC(coll##LL128, op, ncclFunc, dtype, ctype) \
IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
#define IMPL_COLL4(func, algo, redop, type, ncclType) \
IMPL_COLL_FUNC(func, algo, LL, redop, type) \
IMPL_COLL_FUNC(func, algo, SIMPLE, redop, type)
#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \
IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING) \
IMPL_COLL4(coll##CollNet, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_COLLNET)
#define IMPL_COLL3(func, redop, type, ncclType) \
IMPL_COLL4(func, TREE, redop, type, ncclType) \
IMPL_COLL4(func, RING, redop, type, ncclType) \
IMPL_COLL4(func, COLLNET, redop, type, ncclType)
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \
IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8) \
IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32) \
IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) \
IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64) \
IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) \
IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16) \
IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) \
IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64) \
IMPL_COLL3(coll, op, ncclFunc, b16, rccl_bfloat16, ncclColl, ncclOp, ncclBfloat16)
#define IMPL_COLL2(func, redop) \
IMPL_COLL3(func, redop, int8_t, ncclInt8) \
IMPL_COLL3(func, redop, uint8_t, ncclUint8) \
IMPL_COLL3(func, redop, int32_t, ncclInt32) \
IMPL_COLL3(func, redop, uint32_t, ncclUint32) \
IMPL_COLL3(func, redop, int64_t, ncclInt64) \
IMPL_COLL3(func, redop, uint64_t, ncclUint64) \
IMPL_COLL3(func, redop, half, ncclFloat16) \
IMPL_COLL3(func, redop, float, ncclFloat32) \
IMPL_COLL3(func, redop, double, ncclFloat64) \
IMPL_COLL3(func, redop, rccl_bfloat16, ncclBfloat16)
// Reduction define all functions
#define IMPL_COLL_R(collf, colln) \
IMPL_COLL2(collf, sum, FuncSum, colln, ncclSum); \
IMPL_COLL2(collf, prod, FuncProd, colln, ncclProd); \
IMPL_COLL2(collf, min, FuncMin, colln, ncclMin); \
IMPL_COLL2(collf, max, FuncMax, colln, ncclMax);
#define IMPL_COLL_R(func) \
IMPL_COLL2(func, Sum) \
IMPL_COLL2(func, Prod) \
IMPL_COLL2(func, Min) \
IMPL_COLL2(func, Max)
// Copy primitives only define one
#define IMPL_COLL_C(collf, colln) \
IMPL_COLL3(collf, copy, FuncSum, i8, int8_t, colln, ncclSum, ncclInt8);
// [RCCL] Define clique-based implementations (repurposed LL128)
#define IMPL_COLL4_CLIQUE(func, algo, redop, type, ncclType) \
IMPL_COLL_FUNC(func, algo, LL, redop, type) \
IMPL_COLL_FUNC(func, algo, LL128, redop, type) \
IMPL_COLL_FUNC(func, algo, SIMPLE, redop, type)
#define COLL_UNROLL 2
#define IMPL_COLL3_CLIQUE(func, redop, type, ncclType) \
IMPL_COLL4(func, TREE, redop, type, ncclType) \
IMPL_COLL4_CLIQUE(func, RING, redop, type, ncclType) \
IMPL_COLL4(func, COLLNET, redop, type, ncclType)
#define IMPL_COLL2_CLIQUE(func, redop) \
IMPL_COLL3_CLIQUE(func, redop, int8_t, ncclInt8) \
IMPL_COLL3_CLIQUE(func, redop, uint8_t, ncclUint8) \
IMPL_COLL3_CLIQUE(func, redop, int32_t, ncclInt32) \
IMPL_COLL3_CLIQUE(func, redop, uint32_t, ncclUint32) \
IMPL_COLL3_CLIQUE(func, redop, int64_t, ncclInt64) \
IMPL_COLL3_CLIQUE(func, redop, uint64_t, ncclUint64) \
IMPL_COLL3_CLIQUE(func, redop, half, ncclFloat16) \
IMPL_COLL3_CLIQUE(func, redop, float, ncclFloat32) \
IMPL_COLL3_CLIQUE(func, redop, double, ncclFloat64) \
IMPL_COLL3_CLIQUE(func, redop, rccl_bfloat16, ncclBfloat16)
#define IMPL_COLL_CLIQUE(func) \
IMPL_COLL2_CLIQUE(func, Sum) \
IMPL_COLL2_CLIQUE(func, Prod) \
IMPL_COLL2_CLIQUE(func, Min) \
IMPL_COLL2_CLIQUE(func, Max)
// [/RCCL]
// Copy primitives only define one function for copy
#define IMPL_COLL_C(func) IMPL_COLL3(func, Sum, int8_t, ncclInt8);
// Point-to-point primitives only have one function/kernel.
#define IMPL_COLL_P(func) \
IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t); \
IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P);
#endif
+95 -77
Voir le fichier
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -282,28 +282,57 @@ inline __device__ void Store128(Pack128* p, Pack128& v) {
#endif
}
template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceCopyMulti(const int tid, const int nthreads,
int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
const int offset, const int N) {
for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
T val = vFetch(srcs[0]+idx);
#pragma unroll
for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
#pragma unroll 1
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ __forceinline__ void ReduceCopyMulti(const int w, const int nw, const int t,
int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const int Nelem) {
const int inc = nw * UNROLL * WARP_SIZE;
int offset = w * UNROLL * WARP_SIZE + t;
const T* srcs[MAXSRCS];
for (int i=0; i<MAXSRCS; i++) srcs[i] = s[i]+elemOffset+offset;
T* dsts[MAXDSTS];
for (int i=0; i<MAXDSTS; i++) dsts[i] = d[i]+elemOffset+offset;
while (offset < Nelem) {
T vals[UNROLL];
// Load and reduce
for (int u = 0; u < UNROLL; ++u) vals[u] = vFetch(srcs[0]+u*WARP_SIZE);
#pragma unroll
for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
#pragma unroll 1
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
for (int i=1; i<MINSRCS; i++) {
T vals2[UNROLL];
for (int u = 0; u < UNROLL; ++u) vals2[u] = vFetch(srcs[i]+u*WARP_SIZE);
for (int u = 0; u < UNROLL; ++u) vals[u] = FUNC()(vals[u], vals2[u]);
}
#pragma unroll
for (int i=MINSRCS; i<MAXSRCS; i++) {
if (i<nsrcs) {
T vals2[UNROLL];
for (int u = 0; u < UNROLL; ++u) vals2[u] = vFetch(srcs[i]+u*WARP_SIZE);
for (int u = 0; u < UNROLL; ++u) vals[u] = FUNC()(vals[u], vals2[u]);
}
}
// Store
#pragma unroll
for (int i = 0; i < MINDSTS; i++) {
for (int u = 0; u < UNROLL; ++u) vStore(dsts[i]+u*WARP_SIZE, vals[u]);
}
#pragma unroll
for (int i=MINDSTS; i<MAXDSTS; i++) {
if (i<ndsts) {
for (int u = 0; u < UNROLL; ++u) vStore(dsts[i]+u*WARP_SIZE, vals[u]);
}
}
for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
offset += inc;
}
}
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
const int elemOffset, const int Npack) {
__device__ void ReduceCopy128bMulti(const int w, const int nw, const int t,
int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const int Npack) {
const int inc = nw * UNROLL * WARP_SIZE;
int offset = w * UNROLL * WARP_SIZE + t;
@@ -334,8 +363,10 @@ __device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
}
#pragma unroll 1
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
for (int i=MINDSTS; i<MAXDSTS; i++) {
if (i<ndsts) {
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
}
}
for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
@@ -343,12 +374,10 @@ __device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
}
}
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
template <typename T>
__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(int32_t); }
#else
__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
#endif
#define PACKELEMS (sizeof(Pack128) / sizeof(T))
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
// Multiply UNROLL by 2 if single source/single destination
@@ -360,73 +389,62 @@ __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
#endif
template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
__device__ __forceinline__ void ReduceOrCopyMulti(const int tid, const int nthreads,
int nsrcs, const T** srcs, int ndsts, T** dsts,
int N) {
int Nrem = N;
if (Nrem <= 0) return;
int alignDiff = 0;
int align = ptrAlign128(srcs[0]);
#pragma unroll
for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
#pragma unroll
for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
int Npreamble = alignDiff ? Nrem :
N < alignof(int32_t) ? N :
(alignof(int32_t) - align) % alignof(int32_t);
#else
int Npreamble = alignDiff ? Nrem :
N < alignof(Pack128) ? N :
(alignof(Pack128) - align) % alignof(Pack128);
#endif
// stage 1: preamble: handle any elements up to the point of everything coming
// into alignment
if (Npreamble) {
ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
Nrem -= Npreamble;
if (Nrem == 0) return;
}
int offset = Npreamble;
// stage 2: fast path: use 128b loads/stores to do the bulk of the work,
// assuming the pointers we have are all 128-bit alignable.
int w = tid / WARP_SIZE; // Warp number
int nw = nthreads / WARP_SIZE; // Number of warps
int t = tid % WARP_SIZE; // Thread (inside the warp)
const int packFactor = sizeof(Pack128) / sizeof(T);
// Check that all is 16B aligned. If not don't use 16B load/stores.
int align = 0;
#pragma unroll
for (int i=0; i<MINSRCS; i++) align |= ptrAlign128(srcs[i]);
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) align |= ptrAlign128(srcs[i]);
#pragma unroll
for (int i=0; i<MINDSTS; i++) align |= ptrAlign128(dsts[i]);
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) align |= ptrAlign128(dsts[i]);
// stage 2a: main loop
int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
* (AUTOUNROLL * WARP_SIZE); // round down
int Nelem2a = Npack2a * packFactor;
int offset = 0;
if (align == 0) {
// fast path: use 128b loads/stores to do the bulk of the work,
// assuming the pointers we have are all 128-bit aligned.
ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);
// main loop
int Npack = (Nrem / (PACKELEMS*AUTOUNROLL*WARP_SIZE)) * (AUTOUNROLL*WARP_SIZE); // round down
int Nelem = Npack * PACKELEMS;
Nrem -= Nelem2a;
ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack);
Nrem -= Nelem;
if (Nrem == 0) return;
offset += Nelem;
// slightly less optimized for section when we don't have full unrolling
Npack = Nrem / PACKELEMS;
Nelem = Npack * PACKELEMS;
ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack);
Nrem -= Nelem;
if (Nrem == 0) return;
offset += Nelem;
}
// unrolled, by-type (mostly for unaligned buffers)
int Nelem = (Nrem / (UNROLL*PACKELEMS/2*WARP_SIZE)) * (UNROLL*PACKELEMS/2*WARP_SIZE); // round down
ReduceCopyMulti<FUNC, T, UNROLL*PACKELEMS/2, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Nelem);
Nrem -= Nelem;
if (Nrem == 0) return;
offset += Nelem2a;
offset += Nelem;
// stage 2b: slightly less optimized for section when we don't have full
// unrolling
int Npack2b = Nrem / packFactor;
int Nelem2b = Npack2b * packFactor;
ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);
Nrem -= Nelem2b;
if (Nrem == 0) return;
offset += Nelem2b;
// stage 2c: tail
ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
// no unroll, by type. Should finish what's remaining.
ReduceCopyMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Nrem);
}
#endif // COMMON_KERNEL_H_
+51 -51
Voir le fichier
@@ -9,62 +9,62 @@
#include "collectives.h"
#include "common.h"
__device__ volatile uint64_t* ncclShmem;
__device__ struct ncclShmemData* ncclShmem;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
NCCL_FUNC5(coll, op, dtype) \
NCCL_COLL_NAME(coll##LL, op, dtype), \
NCCL_COLL_NAME(coll##LL128, op, dtype), \
NCCL_COLL_NAME(coll, op, dtype)
#define NCCL_FUNC5(func, algo, redop, type) \
NCCL_FUNC_NAME(func, algo, LL, redop, type), \
NCCL_FUNC_NAME(func, algo, LL128, redop, type), \
NCCL_FUNC_NAME(func, algo, SIMPLE, redop, type)
#define NCCL_FUNC4(coll, op, dtype) \
NCCL_FUNC5(coll##Tree, op, dtype), \
NCCL_FUNC5(coll##Ring, op, dtype), \
NCCL_FUNC5(coll##CollNet, op, dtype)
#define NCCL_FUNC4(func, redop, type) \
NCCL_FUNC5(func, TREE, redop, type), \
NCCL_FUNC5(func, RING, redop, type), \
NCCL_FUNC5(func, COLLNET, redop, type)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, u8), \
NCCL_FUNC4(coll, op, i32), \
NCCL_FUNC4(coll, op, u32), \
NCCL_FUNC4(coll, op, i64), \
NCCL_FUNC4(coll, op, u64), \
NCCL_FUNC4(coll, op, f16), \
NCCL_FUNC4(coll, op, f32), \
NCCL_FUNC4(coll, op, f64)
#define NCCL_FUNCS3B(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8)
#define NCCL_FUNCS3A(func, redop) \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, uint8_t), \
NCCL_FUNC4(func, redop, int32_t), \
NCCL_FUNC4(func, redop, uint32_t), \
NCCL_FUNC4(func, redop, int64_t), \
NCCL_FUNC4(func, redop, uint64_t), \
NCCL_FUNC4(func, redop, half), \
NCCL_FUNC4(func, redop, float), \
NCCL_FUNC4(func, redop, double)
#define NCCL_FUNCS3B(func, redop) \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t)
// Must be consistent with ncclRedOp_t
#define NCCL_FUNCS2A(coll) \
NCCL_FUNCS3A(coll, sum ), \
NCCL_FUNCS3A(coll, prod), \
NCCL_FUNCS3A(coll, max ), \
NCCL_FUNCS3A(coll, min )
#define NCCL_FUNCS2B(coll) \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy)
#define NCCL_FUNCS2A(func) \
NCCL_FUNCS3A(func, Sum ), \
NCCL_FUNCS3A(func, Prod), \
NCCL_FUNCS3A(func, Max ), \
NCCL_FUNCS3A(func, Min )
#define NCCL_FUNCS2B(func) \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum)
// Must be consistent with ncclFunc_t
#define NCCL_FUNCS() { \
NCCL_COLL_NAME(ncclSendRecv, copy, i8),\
NCCL_FUNCS2B(ncclBroadcast), \
NCCL_FUNCS2A(ncclReduce), \
NCCL_FUNCS2B(ncclAllGather), \
NCCL_FUNCS2A(ncclReduceScatter), \
NCCL_FUNCS2A(ncclAllReduce) }
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),\
NCCL_FUNCS2B(Broadcast), \
NCCL_FUNCS2A(Reduce), \
NCCL_FUNCS2B(AllGather), \
NCCL_FUNCS2A(ReduceScatter), \
NCCL_FUNCS2A(AllReduce) }
// Must be consistent with the ncclFuncSet enum
__device__ ncclKern_t ncclFuncs[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
@@ -72,12 +72,12 @@ __device__ ncclKern_t ncclFuncs[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCC
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
#if __CUDA_ARCH__
NCCL_COLL_NAME(ncclSendRecv, copy, i8),
NCCL_FUNCS2B(ncclBroadcast),
NCCL_FUNCS2A(ncclReduce),
NCCL_FUNCS2B(ncclAllGather),
NCCL_FUNCS2A(ncclReduceScatter),
NCCL_FUNCS2A(ncclAllReduce)
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
NCCL_FUNCS2B(Broadcast),
NCCL_FUNCS2A(Reduce),
NCCL_FUNCS2B(AllGather),
NCCL_FUNCS2A(ReduceScatter),
NCCL_FUNCS2A(AllReduce)
#endif
};
#endif
-12
Voir le fichier
@@ -1,12 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "gather.h"
#include "common.h"
#include "collectives.h"
IMPL_COLL_FUNC(ncclGather, copy, FuncSum, i8, int8_t);
IMPL_COLL_KERN(ncclGather, copy, FuncSum, i8, int8_t, 0);
-75
Voir le fichier
@@ -1,75 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclGatherKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->coll.count;
const int nranks = comm->nRanks;
const int bid = args->coll.bid;
const int rank = ring->devUserRanks[0];
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * GATHER_CHUNKSTEPS;
const int peersPerChan = DIVUP(nranks, nChannels);
const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
const int root = args->coll.root;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
for (int i = 0; i < peersPerChan; i++) {
if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
(peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
continue;
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
int nelem = min(realChunkSize, size-chunkOffset);
if ((blockIdx.x*peersPerChan+i)%nranks == 0 && rank == root) {
const T* sendbuff = thisInput+chunkOffset;
T* recvbuff = thisOutput+chunkOffset+rank*size;
if (tid < nthreads && sendbuff != recvbuff) {
// local copy
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
}
}
else {
int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
int peerNone = -1;
if (rank == root) {
ncclPrimitives<UNROLL, GATHER_CHUNKSTEPS/GATHER_SLICESTEPS, GATHER_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &peerRecv, &peerNone, NULL, stepSize, channel, comm);
ssize_t recv_offset = chunkOffset + peerRecv*size;
prims.recv(thisOutput+recv_offset, nelem);
}
else {
if (peerSend == root) {
ncclPrimitives<UNROLL, GATHER_CHUNKSTEPS/GATHER_SLICESTEPS, GATHER_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &peerNone, &peerSend, NULL, stepSize, channel, comm);
ssize_t send_offset = chunkOffset;
prims.send(thisInput+send_offset, nelem);
}
}
}
}
}
}
-20
Voir le fichier
@@ -9,29 +9,9 @@
#define OP128_H_
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
v0=LOAD(ptr);
v1=LOAD(ptr+1);
}
inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
STORE(ptr, v0);
STORE(ptr+1, v1);
}
inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) {
return (uint64_t*)shmemGenericPtr;
}
inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) {
v0=LOAD(shmemAsmPtr);
v1=LOAD(shmemAsmPtr+1);
}
inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) {
STORE(shmemAsmPtr, v0);
STORE(shmemAsmPtr+1, v1);
}
#else
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
+170 -237
Voir le fichier
@@ -32,87 +32,79 @@
} \
} while (0)
#define barrier_by_id(id) do { \
#define barrier_by_group() do { \
const int w = threadIdx.x/WARP_SIZE; \
barrier_next[id*MAXWARPS+w] += nthreads/WARP_SIZE; \
__atomic_fetch_add(barriers+id, 1, __ATOMIC_SEQ_CST); \
while (LOAD(barriers+id) < barrier_next[id*MAXWARPS+w]) /* spin */; \
const int wid = threadIdx.x%WARP_SIZE; \
if (wid == 0) { \
barrier_next[w] += nthreads/WARP_SIZE; \
__atomic_fetch_add(barriers, 1, __ATOMIC_SEQ_CST); \
while (LOAD(barriers) < barrier_next[w]) /* spin */; \
} \
} while (0)
#define ROLE_SRC 0x01
#define ROLE_DST 0x02
#define ROLE_WAIT_RECV 0x04
#define ROLE_WAIT_SEND 0x08
#define ROLE_POST_SEND 0x10
#define ROLE_POST_RECV 0x20
// Implementation of primitive types
template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, int DIRECT, class FUNC>
class ncclPrimitives {
private:
const int tid;
const int nthreads;
const int wid;
int nthreads;
int nworkers;
const int stepSize;
int nrecv = 0;
int nsend = 0;
struct ncclConnInfo* recvConn = NULL;
volatile uint64_t* recvConnHeadPtr = NULL;
uint64_t recvConnHead;
volatile uint64_t* recvConnTailPtr = NULL;
uint64_t recvConnTail;
uint64_t recvConnTailCache; // Cache last seen value
struct ncclConnInfo* conn = NULL;
volatile int* connSizesFifoPtr = NULL;
void** connPtrsFifoPtr = NULL;
volatile uint64_t* connHeadPtr = NULL;
volatile uint64_t* connTailPtr = NULL;
uint64_t connTailCache; // Cache last seen value
uint64_t connHeadCache; // Cache last seen value
struct ncclConnInfo* sendConn = NULL;
volatile int* sendConnFifoPtr = NULL;
volatile uint64_t* sendConnTailPtr = NULL;
uint64_t sendConnTail;
volatile uint64_t* sendConnHeadPtr = NULL;
uint64_t sendConnHead;
uint64_t sendConnHeadCache; // Cache last seen value
uint64_t recvStep[NRECV];
uint64_t sendStep[NSEND];
#if defined(RCCL_USE_DIRECT_BUFFER)
const T* recvDirectBuff[NRECV];
T* sendDirectBuff[NSEND];
#endif
const T* recvBuff[NRECV];
T* sendBuff[NSEND];
int index; // Peer index I'm responsible for
int peer = -1;
int role = 0;
int group;
uint64_t step;
T* direct = NULL;
T* buff;
struct ncclDevComm* comm;
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
inline __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
inline __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
const T** srcs;
T** dsts;
uint64_t* barriers;
uint64_t* barrier_next;
// Don't use barrier 0 as it's used by the final sync
inline __device__ void barrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if (wid == 0) {
if (NRECV < NSEND) barrier_by_id(0);
else barrier_by_id(1);
}
if (nthreads == WARP_SIZE) __syncwarp();
else barrier_by_group();
#else
if (NSEND>NRECV) {
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads+WARP_SIZE));
} else {
asm volatile ("bar.sync 2, %0;" :: "r"(nthreads+WARP_SIZE));
}
if (nthreads == WARP_SIZE) __syncwarp();
else asm volatile ("bar.sync %0, %1;" :: "r"(group+1), "r"(nthreads));
#endif
}
inline __device__ void subBarrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
__syncthreads();
barrier();
#else
if (NSEND>NRECV) {
asm volatile ("bar.sync 3, %0;" :: "r"(nthreads));
} else {
asm volatile ("bar.sync 4, %0;" :: "r"(nthreads));
}
if (nworkers == nthreads) barrier();
else asm volatile ("bar.sync %0, %1;" :: "r"(group+2), "r"(nworkers));
#endif
}
uint32_t spins = 0;
uint32_t abort = 0;
inline __device__ int checkAbort(int i, int send) {
inline __device__ int checkAbort() {
spins++;
if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) {
abort = LOAD(comm->abortFlag);
@@ -121,90 +113,54 @@ class ncclPrimitives {
return abort;
}
inline __device__ void waitSend(int nbytes) {
spins = 0;
if (sendConnHeadPtr) {
while (sendConnHeadCache + NCCL_STEPS < sendConnHead + SLICESTEPS) {
sendConnHeadCache = LOAD(sendConnHeadPtr);
if (checkAbort(wid, 1)) break;
}
if (sendConnFifoPtr) {
STORE(sendConnFifoPtr+sendConnHead%NCCL_STEPS, nbytes);
}
sendConnHead += SLICESTEPS;
}
template <int DIRECTPTR>
inline __device__ T* directPtr(ssize_t directOffset) {
return DIRECTPTR && direct ? direct+directOffset : buff+(step%NCCL_STEPS)*stepSize;
}
inline __device__ void waitRecv() {
template <int DST, int DIRECTSEND>
inline __device__ void waitSend(ssize_t directOffset, int nbytes) {
spins = 0;
if (recvConnTailPtr) {
#ifdef ENABLE_PROFILING
uint64_t t0 = __rtc64();
#endif
while (recvConnTailCache < recvConnTail + SLICESTEPS) {
recvConnTailCache = LOAD(recvConnTailPtr);
if (checkAbort(wid, 0)) break;
}
#ifdef ENABLE_PROFILING
__atomic_fetch_add(&comm->devProf->wait_recv_cycle[blockIdx.x], __rtc64() - t0, __ATOMIC_SEQ_CST);
#endif
recvConnTail += SLICESTEPS;
while (connHeadCache + NCCL_STEPS < step + SLICESTEPS) {
connHeadCache = LOAD(connHeadPtr);
if (checkAbort()) break;
}
if (connSizesFifoPtr) {
STORE(connSizesFifoPtr+step%NCCL_STEPS, nbytes);
}
if (connPtrsFifoPtr) dsts[DST+index] = ((T **)connPtrsFifoPtr)[step%NCCL_STEPS];
else dsts[DST+index] = directPtr<DIRECTSEND>(directOffset);
step += SLICESTEPS;
}
inline __device__ void incRecv(int i) {
recvStep[i] += SLICESTEPS;
template <int SRC, int DIRECTRECV>
inline __device__ void waitRecv(ssize_t directOffset) {
spins = 0;
#ifdef ENABLE_PROFILING
uint64_t t0 = __builtin_amdgcn_s_memrealtime();
#endif
while (connTailCache < step + SLICESTEPS) {
connTailCache = LOAD(connTailPtr);
if (checkAbort()) break;
}
#ifdef ENABLE_PROFILING
if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_recv_cycle[blockIdx.x], __builtin_amdgcn_s_memrealtime() - t0, __ATOMIC_SEQ_CST);
#endif
if (connPtrsFifoPtr) srcs[SRC+index] = ((T **)connPtrsFifoPtr)[step%NCCL_STEPS];
else srcs[SRC+index] = directPtr<DIRECTRECV>(directOffset);
step += SLICESTEPS;
}
inline __device__ void postRecv() {
if (recvConnHeadPtr) STORE(recvConnHeadPtr, recvConnHead += SLICESTEPS);
STORE(connHeadPtr, step += SLICESTEPS);
}
inline __device__ void incSend(int i) {
sendStep[i] += SLICESTEPS;
}
inline __device__ void postSend() {
if (sendConnTailPtr) {
if (sendConn->next_hdp_reg) STORE(sendConn->next_hdp_reg, 0x1);
STORE(sendConnTailPtr, sendConnTail += SLICESTEPS);
}
if (conn->next_hdp_reg) STORE(conn->next_hdp_reg, 0x1);
STORE(connTailPtr, step += SLICESTEPS);
}
template <int DIRECTRECV>
inline __device__ const T* directRecvPtr(int i, ssize_t directOffset) {
#if defined(RCCL_USE_DIRECT_BUFFER)
return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i);
#else
return recvPtr(i);
#endif
}
template <int DIRECTSEND>
inline __device__ T* directSendPtr(int i, ssize_t directOffset) {
#if defined(RCCL_USE_DIRECT_BUFFER)
return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
#else
return sendPtr(i);
#endif
}
template <int DIRECTRECV>
inline __device__ int directRecvInc(int i, int directInc, int sliceInc) {
#if defined(RCCL_USE_DIRECT_BUFFER)
return DIRECTRECV && recvDirectBuff[i] ? directInc : sliceInc;
#else
return sliceInc;
#endif
}
template <int DIRECTSEND>
inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
#if defined(RCCL_USE_DIRECT_BUFFER)
return DIRECTSEND && sendDirectBuff[i] ? directInc : sliceInc;
#else
return sliceInc;
#endif
}
template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
inline __device__ void
GenericOp(const T* srcPtr, T* dstPtr, int nelem, ssize_t directOffset) {
@@ -212,148 +168,126 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
int sliceSize = stepSize*SLICESTEPS;
int dataSize = max(DIVUP(nelem, 16*SLICESPERCHUNK)*16, sliceSize/32);
const T* srcs[RECV*NRECV+SRC];
srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
if (RECV) {
if (SRC) srcs[1] = recvPtr(0);
for (int i=1; i<NRECV && i<nrecv; i++) srcs[SRC+i] = recvPtr(i);
}
T* dsts[SEND*NSEND+DST];
dsts[0] = DST ? dstPtr : directSendPtr<DIRECTSEND>(0, directOffset);
if (SEND) {
if (DST) dsts[1] = directSendPtr<DIRECTSEND>(0, directOffset);
for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
}
#pragma unroll
for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
int realSize = max(0, min(dataSize, nelem-offset));
#ifdef ENABLE_PROFILING
uint64_t t0 = __rtc64();
uint64_t t0 = __builtin_amdgcn_s_memrealtime();
#endif
if (SEND) waitSend(realSize*sizeof(T));
if (RECV) waitRecv();
if (realSize > 0) {
barrier();
if (tid < nworkers) {
if (SRC && (role & ROLE_SRC)) srcs[0] = srcPtr+offset;
if (RECV && (role & ROLE_WAIT_RECV)) waitRecv<SRC, DIRECTRECV>(directOffset+offset);
if (DST && (role & ROLE_DST)) dsts[0] = dstPtr+offset;
if (SEND && (role & ROLE_WAIT_SEND)) waitSend<DST, DIRECTSEND>(directOffset+offset, realSize*sizeof(T));
if (realSize > 0) {
#ifdef ENABLE_PROFILING
if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_cycle[blockIdx.x], __rtc64() - t0, __ATOMIC_SEQ_CST);
if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_cycle[blockIdx.x], __builtin_amdgcn_s_memrealtime() - t0, __ATOMIC_SEQ_CST);
#endif
#if defined(RCCL_USE_DIRECT_BUFFER)
if (DIRECTRECV && recvDirectBuff[0]) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (SEND) {
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
}
} else {
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
subBarrier();
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nworkers, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
}
#else
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
#endif
}
barrier();
FOR_SEND(incSend);
FOR_RECV(incRecv);
if (tid >= nthreads-WARP_SIZE) {
if (SEND) {
if (realSize > 0 && wid == 0) __threadfence_system();
__syncwarp();
postSend();
}
if (RECV) postRecv();
}
srcs[0] += SRC ? realSize : directRecvInc<DIRECTRECV>(0, realSize, sliceSize);
for (int i=1-SRC; i<RECV*NRECV; i++) srcs[SRC+i] += sliceSize;
dsts[0] += DST ? realSize : directSendInc<DIRECTSEND>(0, realSize, sliceSize);
for (int i=1-DST; i<SEND*NSEND; i++) dsts[DST+i] += directSendInc<DIRECTSEND>(i, realSize, sliceSize);
if (SEND && (role & ROLE_POST_SEND) && realSize > 0 && index == 0) __threadfence_system();
__syncwarp();
if (SEND && (role & ROLE_POST_SEND)) postSend();
if (RECV && (role & ROLE_POST_RECV)) postRecv();
offset += realSize;
}
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
recvBuff[i] = (const T*)LOAD(conn->buffs+NCCL_PROTO_SIMPLE);
recvStep[i] = LOAD(&conn->step);
recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
#if defined(RCCL_USE_DIRECT_BUFFER)
recvDirectBuff[i] = NULL;
if (DIRECT && LOAD((&conn->direct) & NCCL_DIRECT_GPU)) {
recvDirectBuff[i] = directBuff;
if (tid == 0) STORE(conn->ptrExchange, directBuff);
}
#endif
if (wid == i) recvConn = conn;
if (wid == i) recvConnTail = recvConnHead = recvStep[i]; // Make sure we set this after rounding up
nrecv++;
}
__device__ __forceinline__ void loadRecvSync() {
if (tid >= WARP_SIZE && tid < 2*WARP_SIZE && wid<nrecv) {
recvConnTailPtr = LOAD(&recvConn->tail);
recvConnTailCache = LOAD(recvConnTailPtr);
}
if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
recvConnHeadPtr = LOAD(&recvConn->head);
// Return credits in case we rounded up.
STORE(recvConnHeadPtr, recvConnHead);
__device__ __forceinline__ void loadRecvConn(struct ncclChannel* channel, T* directBuff) {
if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) {
conn = &channel->devPeers[peer].recv.conn;
step = conn->step;
step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
if (role & ROLE_POST_RECV) {
connHeadPtr = conn->head;
// Return credits in case we rounded up.
STORE(connHeadPtr, step);
}
if (role & ROLE_WAIT_RECV) {
buff = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
//if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
// direct = directBuff;
// *conn->ptrExchange = directBuff;
//}
connTailPtr = conn->tail;
connTailCache = LOAD(connTailPtr);
connPtrsFifoPtr = conn->ptrsFifo;
}
}
}
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
sendBuff[i] = (T*)LOAD(conn->buffs+NCCL_PROTO_SIMPLE);
sendStep[i] = LOAD(&conn->step);
sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
#if defined(RCCL_USE_DIRECT_BUFFER)
sendDirectBuff[i] = NULL;
if (DIRECT && LOAD((&conn->direct) & NCCL_DIRECT_GPU)) {
void* volatile* ptr = LOAD(&conn->ptrExchange);
while ((sendDirectBuff[i] = (T*)(LOAD(ptr))) == NULL);
barrier();
if (tid == 0) STORE(ptr, NULL);
}
#endif
if (wid == i) sendConn = conn;
if (wid == i) sendConnTail = sendConnHead = sendStep[i]; // Make sure we set this after rounding up
nsend++;
}
__device__ __forceinline__ void loadSendSync() {
if (tid < nsend) {
sendConnHeadPtr = LOAD(&sendConn->head);
sendConnHeadCache = LOAD(sendConnHeadPtr);
sendConnFifoPtr = LOAD(&sendConn->fifo);
}
if (tid >= nthreads-WARP_SIZE && wid < nsend) {
sendConnTailPtr = LOAD(&sendConn->tail);
__device__ __forceinline__ void loadSendConn(struct ncclChannel* channel) {
if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) {
conn = &channel->devPeers[peer].send.conn;
step = conn->step;
step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
if (role & ROLE_POST_SEND) {
connTailPtr = conn->tail;
}
if (role & ROLE_WAIT_SEND) {
buff = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
//if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
// void* volatile* ptr = conn->ptrExchange;
// while ((direct = (T*)(*ptr)) == NULL);
// *ptr = NULL;
//}
connHeadPtr = conn->head;
connHeadCache = LOAD(connHeadPtr);
connSizesFifoPtr = conn->sizesFifo;
connPtrsFifoPtr = conn->ptrsFifo;
}
}
}
__device__ __forceinline__ void saveRecvSync() {
if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
STORE(&recvConn->step, recvConnHead);
__threadfence_system();
}
}
__device__ __forceinline__ void saveSendSync() {
if (tid < nsend) {
STORE(&sendConn->step, sendConnHead);
__device__ __forceinline__ void saveSync() {
if (role & (ROLE_POST_SEND|ROLE_POST_RECV)) {
conn->step = step;
__threadfence_system();
}
}
public:
__device__ __forceinline__
ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm)
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepSize(stepSize) {
barriers = channel->barrier;
barrier_next = channel->barrier_next;
ncclPrimitives(const int tid, const int nworkers, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, struct ncclShmemPtrs* ptrs, int group)
: comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[group].srcs), dsts((T**)ptrs[group].dsts), group(group), barriers(&ptrs[group].barrier), barrier_next(ptrs[group].barrier_next) {
nthreads = nworkers;
// For send operations, we need an extra warp to overlap the threadfence and the copy
// int postThreads = NSEND && nworkers >= 64 ? WARP_SIZE : 0;
// nthreads += postThreads;
// Make sure step is updated before we read it.
barrier();
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, 0);
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
loadRecvSync();
loadSendSync();
for (int i=0; i<NRECV; i++) if (recvPeers[i] != -1) nrecv++;
for (int i=0; i<NSEND; i++) if (sendPeers[i] != -1) nsend++;
#define SYNC_GROUP 8
static_assert(NSEND < SYNC_GROUP && NRECV < SYNC_GROUP, "Not enough threads to cover all peers");
int g = tid / SYNC_GROUP;
int ng = nthreads / SYNC_GROUP;
index = tid % SYNC_GROUP;
if (g == 0) {
if (index < nrecv) role |= ROLE_WAIT_RECV;
if (index == nrecv) role |= ROLE_SRC;
} else if (g == 1) {
if (index < nsend) role |= ROLE_WAIT_SEND;
if (index == nsend) role |= ROLE_DST;
} else if (g == ng - 2) {
if (index < nrecv) role |= ROLE_POST_RECV;
} else if (g == ng - 1) {
if (index < nsend) role |= ROLE_POST_SEND;
}
if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) peer = recvPeers[index];
if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) peer = sendPeers[index];
loadRecvConn(channel, directBuff);
loadSendConn(channel);
}
__device__ __forceinline__ void
@@ -414,8 +348,7 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
__device__ __forceinline__ ~ncclPrimitives() {
// Save steps for the next operation
saveRecvSync();
saveSendSync();
saveSync();
}
};
@@ -424,10 +357,10 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
#ifdef ENABLE_PROFILING
#define INIT_COUNTER \
if (tid == 0) { t0 = __rtc64(); ws = LOAD(&(devProf->wait_cycle[blockIdx.x])); }
if (tid == 0) { t0 = __builtin_amdgcn_s_memrealtime(); ws = LOAD(&(devProf->wait_cycle[blockIdx.x])); }
#define ACCUMULATE_COUNTER(prim) \
if (tid == 0) { __atomic_fetch_add(&(devProf->prim##_cycle), __rtc64() - t0 \
if (tid == 0) { __atomic_fetch_add(&(devProf->prim##_cycle), __builtin_amdgcn_s_memrealtime() - t0 \
+ ws - LOAD(&(devProf->wait_cycle[blockIdx.x])), __ATOMIC_SEQ_CST); \
__atomic_fetch_add(&(devProf->prim##_byte), nelem * sizeof(T), __ATOMIC_SEQ_CST); }
#else
+1 -1
Voir le fichier
@@ -205,7 +205,7 @@ class ncclLLPrimitives {
sendConnHeadPtr = LOAD(&sendConn->head);
sendConnHeadCache = LOAD(sendConnHeadPtr);
sendConnHead = LOAD(&sendConn->step);
sendConnFifoPtr = LOAD(&sendConn->fifo);
sendConnFifoPtr = LOAD(&sendConn->sizesFifo);
}
}
+65 -37
Voir le fichier
@@ -118,9 +118,14 @@ class ncclLL128Primitives {
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
if (u*WARP_SIZE < maxOffset) {
uint64_t v0, v1;
load128(src64Ptr+u*WARP_SIZE, v0, v1);
storeShmem128(shmemAsmPtr+u*WARP_SIZE, v0, v1);
using Vec = uint64_t __attribute__((ext_vector_type(2)));
Vec i2;
//load128(src64Ptr+u*WARP_SIZE, v0, v1);
asm volatile ("flat_load_dwordx4 %0, %1\n"
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(src64Ptr+u*WARP_SIZE));
//storeShmem128(shmemAsmPtr+u*WARP_SIZE, i2[0], i2[1]);
*(shmemAsmPtr+u*WARP_SIZE) = i2[0];
*(shmemAsmPtr+u*WARP_SIZE+1) = i2[1];
}
}
#endif
@@ -135,15 +140,24 @@ class ncclLL128Primitives {
template <int ELEMS_PER_THREAD>
inline __device__ void storeShmemToDst128(int maxOffset, uint64_t* dst64Ptr) {
uint64_t v[ELEMS_PER_THREAD];
using Velem = uint64_t __attribute__((ext_vector_type(ELEMS_PER_THREAD)));
Velem v;
uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
loadShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
v[u] = *(shmemAsmPtr+u*WARP_SIZE);
v[u+1] = *(shmemAsmPtr+u*WARP_SIZE+1);
//loadShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
}
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
if (u*WARP_SIZE < maxOffset) store128(dst64Ptr+u*WARP_SIZE, v[u], v[u+1]);
//if (u*WARP_SIZE < maxOffset) store128(dst64Ptr+u*WARP_SIZE, v[u], v[u+1]);
using Vec = uint64_t __attribute__((ext_vector_type(2)));
Vec i2;
i2[0] = v[u];
i2[1] = v[u+1];//
if (u*WARP_SIZE < maxOffset) asm volatile ("flat_store_dwordx4 %0, %1\n"
"s_waitcnt vmcnt(0)\n" : : "v"(dst64Ptr+u*WARP_SIZE), "v"(i2));
}
}
@@ -176,45 +190,52 @@ class ncclLL128Primitives {
uint64_t flag = recvFlag(0);
uint64_t* ptr = recvPtr(0)+ll128Offset;
bool needReload;
uint64_t v0, v1;
using Vec = uint64_t __attribute__((ext_vector_type(2)));
Vec i2;
do {
if (wid == 0) STORE(sync, 0);
needReload = false;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
load128(ptr+u*WARP_SIZE, v0, v1);
needReload |= flagThread && (v1 != flag);
asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
//load128(ptr+u*WARP_SIZE, v0, v1);
needReload |= flagThread && (i2[1] != flag);
}
if (needReload) __atomic_fetch_add(sync, 1, __ATOMIC_SEQ_CST);
if (LOAD(sync) == 0) break;
} while (checkAbort(0, 0) == 0);
} while (LOAD(sync) && checkAbort(0, 0) == 0);
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
load128(ptr+u*WARP_SIZE, v0, v1);
v[u] = SRC ? MULTI<FUNC, T>()(v0, v[u]) : v0;
v[u+1] = SRC ? MULTI<FUNC, T>()(v1, v[u+1]) : v1;
asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
//load128(ptr+u*WARP_SIZE, v0, v1);
v[u] = SRC ? MULTI<FUNC, T>()(i2[0], v[u]) : i2[0];
v[u+1] = SRC ? MULTI<FUNC, T>()(i2[1], v[u+1]) : i2[1];
}
for (int i=1; i<NRECV && i<nrecv; i++) {
uint64_t flag = recvFlag(i);
uint64_t* ptr = recvPtr(i)+ll128Offset;
uint64_t v0, v1;
Vec i2;
do {
if (wid == 0) STORE(sync, 0);
needReload = false;
needReload = 0;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
load128(ptr+u*WARP_SIZE, v0, v1);
needReload |= flagThread && (v1 != flag);
asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
//load128(ptr+u*WARP_SIZE, v0, v1);
needReload |= flagThread && (i2[1] != flag);
}
if (needReload) __atomic_fetch_add(sync, 1, __ATOMIC_SEQ_CST);
if (LOAD(sync) == 0) break;
} while (checkAbort(i, 0) == 0);
} while (LOAD(sync) && checkAbort(i, 0) == 0);
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
load128(ptr+u*WARP_SIZE, v0, v1);
v[u] = MULTI<FUNC, T>()(v0, v[u]);
v[u+1] = MULTI<FUNC, T>()(v1, v[u+1]);
asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
//load128(ptr+u*WARP_SIZE, v0, v1);
v[u] = MULTI<FUNC, T>()(i2[0], v[u]);
v[u+1] = MULTI<FUNC, T>()(i2[1], v[u+1]);
}
}
}
@@ -223,18 +244,30 @@ class ncclLL128Primitives {
/************************ Send **************************/
if (SEND) {
for (int i=1; i<NSEND && i<nsend; i++) {
int flag = sendFlag(i);
uint64_t flag = sendFlag(i);
uint64_t* ptr = sendPtr(i)+ll128Offset;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
//store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
using Vec = uint64_t __attribute__((ext_vector_type(2)));
Vec i2;
i2[0] = v[u];
i2[1] = flagThread ? flag : v[u+1];//
asm volatile ("flat_store_dwordx4 %0, %1, glc, slc\n"
"s_waitcnt vmcnt(0)\n" : : "v"(ptr+u*WARP_SIZE), "v"(i2));
}
}
int flag = sendFlag(0);
uint64_t flag = sendFlag(0);
uint64_t* ptr = sendPtr(0)+ll128Offset;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
//store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
using Vec = uint64_t __attribute__((ext_vector_type(2)));
Vec i2;
i2[0] = v[u];
i2[1] = flagThread ? flag : v[u+1];//
asm volatile ("flat_store_dwordx4 %0, %1, glc, slc\n"
"s_waitcnt vmcnt(0)\n" : : "v"(ptr+u*WARP_SIZE), "v"(i2));
}
}
/********************** End Send ************************/
@@ -279,7 +312,7 @@ class ncclLL128Primitives {
const int maxOffset = min(nelem-(elemOffset*((int)(sizeof(uint64_t)/sizeof(T)))), (int)(ELEMINC*(sizeof(uint64_t)/sizeof(T))));
if (SRC) {
int done = 0;
if ((((uint64_t)srcPtr)&0xf) == 0) {
if ((((uint64_t)srcPtr)&0x3) == 0) {
loadSrcToShmem128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, src64Ptr+elemOffset+2*wid);
done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
}
@@ -290,7 +323,7 @@ class ncclLL128Primitives {
__syncwarp();
if (DST) {
int done = 0;
if ((((uint64_t)dstPtr)&0xf) == 0) {
if ((((uint64_t)dstPtr)&0x3) == 0) {
storeShmemToDst128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, dst64Ptr+elemOffset+2*wid);
done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
}
@@ -330,10 +363,10 @@ class ncclLL128Primitives {
sendConnHeadPtr = LOAD(&sendConn->head);
sendConnHeadCache = LOAD(sendConnHeadPtr);
sendConnHead = LOAD(&sendConn->step);
sendConnFifoPtr = LOAD(&sendConn->fifo);
sendConnFifoPtr = LOAD(&sendConn->sizesFifo);
}
if (tid >= nthreads-WARP_SIZE && wid<nsend) {
if (sendConn->fifo) {
if (sendConn->sizesFifo) {
sendConnTailPtr = LOAD(&sendConn->tail);
sendConnTail = LOAD(&sendConn->step);
}
@@ -357,12 +390,7 @@ class ncclLL128Primitives {
public:
__device__ __forceinline__
ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm)
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), stepSize(stepSize), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) {
// for __any_sync
if (NSEND > NRECV)
sync = channel->sync + 2 + tid/WARP_SIZE;
else
sync = channel->sync + tid/WARP_SIZE;
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), stepSize(stepSize), shmem(ncclShmem->data+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid), sync(ncclShmem->sync+warp) {
// Make sure step is updated before we read it.
barrier();
+2 -2
Voir le fichier
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,4 +8,4 @@
#include "common.h"
#include "collectives.h"
IMPL_COLL_R(ncclReduce, ncclCollReduce);
IMPL_COLL_R(Reduce);
+123 -129
Voir le fichier
@@ -9,151 +9,145 @@
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->coll.root;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->coll.root;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
if (prevRank == root) {
prims.send(thisInput+offset, nelem);
} else if (rank == root) {
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
prims.recvReduceSend(thisInput+offset, nelem);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
if (prevRank == root) {
prims.send(thisInput+offset, nelem);
} else if (rank == root) {
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
prims.recvReduceSend(thisInput+offset, nelem);
}
}
}
}
}
};
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { }
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
const int rank = comm->rank;
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->coll.root;
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceCollNetKernel(struct CollectiveArgs* args) { }
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
const int rank = comm->rank;
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->coll.root;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->coll.lastChunkSize;
}
ssize_t offset = gridOffset + bid*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->coll.lastChunkSize;
int nelem = min(chunkSize, size-offset);
if (prevRank == root) {
LLprims.send(thisInput+offset, nelem);
} else if (rank == root) {
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
}
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (prevRank == root) {
LLprims.send(thisInput+offset, nelem);
} else if (rank == root) {
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceCollNetLLKernel(struct CollectiveArgs* args) { }
};
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
const int rank = comm->rank;
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->coll.root;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
const int rank = comm->rank;
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->coll.root;
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
ssize_t offset = gridOffset + bid*chunkSize;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (prevRank == root) {
LLprims.send(thisInput+offset, nelem);
} else if (rank == root) {
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
LLprims.recvReduceSend(thisInput+offset, nelem);
int nelem = min(chunkSize, size-offset);
if (prevRank == root) {
LLprims.send(thisInput+offset, nelem);
} else if (rank == root) {
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
}
}
}
};
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { }
template<int PROTO, class REDOP, typename T, int UNROLL>
class ncclFunction<ncclFuncReduce, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
};
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceCollNetLL128Kernel(struct CollectiveArgs* args) { }
template<int PROTO, class REDOP, typename T, int UNROLL>
class ncclFunction<ncclFuncReduce, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
};
+2 -2
Voir le fichier
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,4 +8,4 @@
#include "common.h"
#include "collectives.h"
IMPL_COLL_R(ncclReduceScatter, ncclCollReduceScatter);
IMPL_COLL_R(ReduceScatter);
+154 -160
Voir le fichier
@@ -9,195 +9,189 @@
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int nelem = min(realChunkSize, size-chunkOffset);
int rankDest;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int nelem = min(realChunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
prims.send(thisInput+offset, nelem);
prims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
prims.recvReduceSend(thisInput+offset, nelem);
prims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final result
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
}
};
// step k-1: reduce this buffer and data, which will produce the final result
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
}
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { }
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterCollNetKernel(struct CollectiveArgs* args) { }
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->coll.lastChunkSize;
}
ssize_t chunkOffset = gridOffset + bid*chunkSize;
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
LLprims.send(thisInput+offset, nelem);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
chunkSize = args->coll.lastChunkSize;
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
LLprims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
}
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
LLprims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
LLprims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterCollNetLLKernel(struct CollectiveArgs* args) { }
};
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
ssize_t chunkOffset = gridOffset + bid*chunkSize;
ssize_t chunkOffset = gridOffset + bid*chunkSize;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
LLprims.send(thisInput+offset, nelem);
LLprims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
LLprims.recvReduceSend(thisInput+offset, nelem);
LLprims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
}
};
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
template<int PROTO, class REDOP, typename T, int UNROLL>
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
};
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterCollNetLL128Kernel(struct CollectiveArgs* args) { }
template<int PROTO, class REDOP, typename T, int UNROLL>
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
};
-12
Voir le fichier
@@ -1,12 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "scatter.h"
#include "common.h"
#include "collectives.h"
IMPL_COLL_FUNC(ncclScatter, copy, FuncSum, i8, int8_t);
IMPL_COLL_KERN(ncclScatter, copy, FuncSum, i8, int8_t, 0);
-75
Voir le fichier
@@ -1,75 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclScatterKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->coll.count;
const int nranks = comm->nRanks;
const int bid = args->coll.bid;
const int rank = ring->devUserRanks[0];
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * SCATTER_CHUNKSTEPS;
const int peersPerChan = DIVUP(nranks, nChannels);
const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
const int root = args->coll.root;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
for (int i = 0; i < peersPerChan; i++) {
if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
(peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
continue;
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
int nelem = min(realChunkSize, size-chunkOffset);
if ((blockIdx.x*peersPerChan+i)%nranks == 0 && rank == root) {
const T* sendbuff = thisInput+chunkOffset+rank*size;
T* recvbuff = thisOutput+chunkOffset;
if (tid < nthreads && sendbuff != recvbuff) {
// local copy
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
}
}
else {
int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
int peerNone = -1;
if (rank == root) {
ncclPrimitives<UNROLL, SCATTER_CHUNKSTEPS/SCATTER_SLICESTEPS, SCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &peerNone, &peerSend, NULL, stepSize, channel, comm);
ssize_t send_offset = chunkOffset + peerSend*size;
prims.send(thisInput+send_offset, nelem);
}
else {
if (peerRecv == root) {
ncclPrimitives<UNROLL, SCATTER_CHUNKSTEPS/SCATTER_SLICESTEPS, SCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &peerRecv, &peerNone, NULL, stepSize, channel, comm);
ssize_t recv_offset = chunkOffset;
prims.recv(thisOutput+recv_offset, nelem);
}
}
}
}
}
}
+1 -3
Voir le fichier
@@ -1,6 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,5 +8,4 @@
#include "common.h"
#include "collectives.h"
IMPL_COLL_FUNC(ncclSendRecv, copy, FuncSum, i8, int8_t);
IMPL_COLL_KERN(ncclSendRecv, copy, FuncSum, i8, int8_t, 0);
IMPL_COLL_P(SendRecv);
+77 -67
Voir le fichier
@@ -1,6 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,74 +8,85 @@
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__device__ void ncclSendRecvKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->p2p.nThreads;
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* firstArgs) {
struct ncclWorkElem* args = firstArgs;
int tid = threadIdx.x;
int group = 0;
for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) {
int nThreadsSegment = args->p2p.nThreads;
if (nThreadsSegment == 0) return; // Nothing else to do
int groupRecv = group;
group += 1;
int groupSend = group;
group += 1;
if (tid < nThreadsSegment) {
const int nThreads = nThreadsSegment;
// Compute pointers
const T* sendbuff = (const T*)args->sendbuff;
T* recvbuff = (T*)args->recvbuff;
// Compute pointers
const T* sendbuff = (const T*)args->sendbuff;
T* recvbuff = (T*)args->recvbuff;
const ssize_t sendCount = args->p2p.sendCount;
const ssize_t recvCount = args->p2p.recvCount;
if (args->p2p.delta < 0 ) return; // No-op
const int delta = args->p2p.delta;
if (delta == 0) {
if (tid < nThreads && sendbuff != recvbuff) {
// local copy : ReduceOrCopyMulti takes an int as number of elements,
// so we split it in blocks of 1G elements.
int blockSize = 1<<30;
for (size_t offset=0; offset<sendCount; offset += blockSize) {
size_t remaining = sendCount - offset;
if (remaining < blockSize) blockSize = remaining;
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nThreads, 1, &sendbuff, 1, &recvbuff, blockSize);
sendbuff += blockSize; recvbuff += blockSize;
}
}
} else {
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
if (args->p2p.delta == 0) {
if (tid < nthreads && sendbuff != recvbuff) {
// local copy : ReduceOrCopyMulti takes an int as number of elements,
// so we split it in blocks of 1G elements.
int blockSize = 1<<30;
for (size_t offset=0; offset<args->p2p.sendCount; offset += blockSize) {
size_t remaining = args->p2p.sendCount - offset;
if (remaining < blockSize) blockSize = remaining;
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, blockSize);
sendbuff += blockSize; recvbuff += blockSize;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize/SENDRECV_SLICEFACTOR;
int nThreadsSplit = nThreads/2;
if ((tid < nThreadsSplit) && recvCount >= 0) {
int peer = (comm->rank-delta+comm->nRanks)%comm->nRanks;
int nt = nThreadsSplit;
ncclPrimitives<UNROLL, 1, 1, T, 1, 0, 1, FUNC>
prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupRecv);
if (recvCount == 0) {
prims.recv(recvbuff, 0);
} else for (ssize_t offset = 0; offset < recvCount; offset += chunkSize) {
int realChunkSize = min(chunkSize, recvCount-offset);
ALIGN_SIZE(realChunkSize, nt*sizeof(uint64_t)/sizeof(T));
int nelem = min(realChunkSize, recvCount-offset);
prims.directRecv(recvbuff+offset, offset, nelem);
}
}
if ((tid >= nThreadsSplit) && sendCount >= 0) {
int peer = (comm->rank+delta)%comm->nRanks;
int nt = nThreads-nThreadsSplit;
ncclPrimitives<UNROLL, 1, 1, T, 0, 1, 1, FUNC>
prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupSend);
if (sendCount == 0) {
prims.send(sendbuff, 0);
} else for (ssize_t offset = 0; offset < sendCount; offset += chunkSize) {
int realChunkSize = min(chunkSize, sendCount-offset);
ALIGN_SIZE(realChunkSize, nt*sizeof(uint64_t)/sizeof(T));
int nelem = min(realChunkSize, sendCount-offset);
prims.directSend(sendbuff+offset, offset, nelem);
}
}
}
}
tid -= nThreadsSegment;
if (tid < 0) return;
args++;
}
}
return;
}
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS)/SENDRECV_SLICEFACTOR;
int nthreadsSplit = nthreads/2;
// We set NRECV or NSEND to 2 to use different barriers in primitives for the send threads and
// receive threads, but then we define all peers to -1 since sender threads don't receive and
// receive threads don't send.
int peerNone[2] = {-1,-1};
if (tid < nthreadsSplit ) {
const ssize_t sendSize = args->p2p.sendCount;
if (sendSize < 0) return;
int peer = (comm->rank+(int)args->p2p.delta)%comm->nRanks;
ncclPrimitives<UNROLL, 1, 1, T, 2, 1, 1, FUNC>
prims(tid, nthreadsSplit, peerNone, &peer, recvbuff, stepSize*SENDRECV_SLICEFACTOR, channel, comm);
if (sendSize == 0) {
prims.send(sendbuff, 0);
} else for (ssize_t offset = 0; offset < sendSize; offset += stepSize) {
int realChunkSize = min(stepSize, sendSize-offset);
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
int nelem = min(realChunkSize, sendSize-offset);
prims.directSend(sendbuff+offset, offset, nelem);
}
} else {
const ssize_t recvSize = args->p2p.recvCount;
if (recvSize < 0) return;
int peer = (comm->rank-(int)args->p2p.delta+comm->nRanks)%comm->nRanks;
ncclPrimitives<UNROLL, 1, 1, T, 1, 2, 1, FUNC>
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peer, peerNone, recvbuff, stepSize*SENDRECV_SLICEFACTOR, channel, comm);
if (recvSize == 0) {
prims.recv(recvbuff, 0);
} else for (ssize_t offset = 0; offset < recvSize; offset += stepSize) {
int realChunkSize = min(stepSize, recvSize-offset);
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
int nelem = min(realChunkSize, recvSize-offset);
prims.directRecv(recvbuff+offset, offset, nelem);
}
}
}
};
-8
Voir le fichier
@@ -12,7 +12,6 @@ NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) {
if (comm->alltoallDisable) {
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
size_t rankOffset = sendcount * ncclTypeSize(datatype);
@@ -27,11 +26,4 @@ ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
NCCLCHECK(ncclSend(sendbuff, sendcount, datatype, root, comm, stream));
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;
}
else {
struct ncclInfo info = { ncclCollGather, "Gather",
sendbuff, recvbuff, sendcount, datatype, ncclSum, root, comm, stream, /* Args */
GATHER_CHUNKSTEPS, GATHER_SLICESTEPS };
return ncclEnqueueCheck(&info);
}
}
+3 -2
Voir le fichier
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollReduce, "Reduce",
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct ncclInfo info = { ncclFuncReduce, "Reduce",
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
return ncclEnqueueCheck(&info);
+3 -2
Voir le fichier
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollReduceScatter, "ReduceScatter",
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
return ncclEnqueueCheck(&info);
-8
Voir le fichier
@@ -12,7 +12,6 @@ NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t
ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream) {
if (comm->alltoallDisable) {
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
size_t rankOffset = recvcount * ncclTypeSize(datatype);
@@ -27,11 +26,4 @@ ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
NCCLCHECK(ncclRecv(recvbuff, recvcount, datatype, root, comm, stream));
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;
}
else {
struct ncclInfo info = { ncclCollScatter, "Scatter",
sendbuff, recvbuff, recvcount, datatype, ncclSum, root, comm, stream, /* Args */
SCATTER_CHUNKSTEPS, SCATTER_SLICESTEPS };
return ncclEnqueueCheck(&info);
}
}
+4 -2
Voir le fichier
@@ -13,7 +13,8 @@ NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataTyp
ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollSendRecv, "Send",
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct ncclInfo info = { ncclFuncSendRecv, "Send",
sendbuff, NULL, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
ncclResult_t ret;
@@ -27,7 +28,8 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da
ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollSendRecv, "Recv",
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct ncclInfo info = { ncclFuncSendRecv, "Recv",
NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
ncclResult_t ret;
+3 -3
Voir le fichier
@@ -128,7 +128,7 @@ void ncclDebugInit() {
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
if (ncclDebugLevel == -1) ncclDebugInit();
if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
if (ncclDebugLevel < level) return;
if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return;
// Gather the rank information. This can take > 1us so we want to make sure
// we only do it when needed.
@@ -145,11 +145,11 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
if (level == NCCL_LOG_WARN)
len = snprintf(buffer, sizeof(buffer),
"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line);
else if (level == NCCL_LOG_INFO && (flags & ncclDebugMask))
else if (level == NCCL_LOG_INFO)
len = snprintf(buffer, sizeof(buffer),
"%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
#ifdef ENABLE_TRACE
else if (level == NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
else if (level == NCCL_LOG_TRACE) {
auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
len = snprintf(buffer, sizeof(buffer),
+298 -212
Voir le fichier
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,61 +8,61 @@
#include "enqueue.h"
#include "argcheck.h"
#include "coll_net.h"
#include "../graph/topo.h"
#include "graph/topo.h"
#include <hip/hip_runtime.h>
#include <hip/hip_ext.h>
// Only generate inline kernels for LL
#define NCCL_FUNC5(coll, op, dtype) \
NCCL_KERN_NAME(coll##LL, op, dtype), \
NCCL_KERN_NAME(coll##LL, op, dtype), \
NCCL_KERN_NAME(coll##LL, op, dtype)
#define NCCL_FUNC5(func, algo, redop, dtype) \
NCCL_KERN_NAME(func, algo, LL, redop, dtype), \
NCCL_KERN_NAME(func, algo, LL, redop, dtype), \
NCCL_KERN_NAME(func, algo, LL, redop, dtype)
#define NCCL_FUNC4(coll, op, dtype) \
NCCL_FUNC5(coll##Tree, op, dtype), \
NCCL_FUNC5(coll##Ring, op, dtype), \
NCCL_FUNC5(coll##CollNet, op, dtype)
#define NCCL_FUNC4(func, redop, type) \
NCCL_FUNC5(func, TREE, redop, type), \
NCCL_FUNC5(func, RING, redop, type), \
NCCL_FUNC5(func, COLLNET, redop, type)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, u8), \
NCCL_FUNC4(coll, op, i32), \
NCCL_FUNC4(coll, op, u32), \
NCCL_FUNC4(coll, op, i64), \
NCCL_FUNC4(coll, op, u64), \
NCCL_FUNC4(coll, op, f16), \
NCCL_FUNC4(coll, op, f32), \
NCCL_FUNC4(coll, op, f64), \
NCCL_FUNC4(coll, op, b16)
#define NCCL_FUNCS3B(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8)
#define NCCL_FUNCS3A(func, redop) \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, uint8_t), \
NCCL_FUNC4(func, redop, int32_t), \
NCCL_FUNC4(func, redop, uint32_t), \
NCCL_FUNC4(func, redop, int64_t), \
NCCL_FUNC4(func, redop, uint64_t), \
NCCL_FUNC4(func, redop, half), \
NCCL_FUNC4(func, redop, float), \
NCCL_FUNC4(func, redop, double), \
NCCL_FUNC4(func, redop, rccl_bfloat16)
#define NCCL_FUNCS3B(func, redop) \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t), \
NCCL_FUNC4(func, redop, int8_t)
// Must be consistent with ncclRedOp_t -- but we only generate kernel for sums.
#define NCCL_FUNCS2A(coll) \
NCCL_FUNCS3A(coll, sum), \
NCCL_FUNCS3A(coll, sum), \
NCCL_FUNCS3A(coll, sum), \
NCCL_FUNCS3A(coll, sum)
#define NCCL_FUNCS2B(coll) \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy)
#define NCCL_FUNCS2A(func) \
NCCL_FUNCS3A(func, Sum), \
NCCL_FUNCS3A(func, Sum), \
NCCL_FUNCS3A(func, Sum), \
NCCL_FUNCS3A(func, Sum)
#define NCCL_FUNCS2B(func) \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum)
typedef void(*ncclKern_t)(struct ncclDevComm*);
typedef void(*ncclKern_t)(struct ncclWorkElem first);
// Must be consistent with the ncclFuncSet enum
static ncclKern_t const ncclKerns[1] = {
NCCL_KERN_NAME(ncclSendRecv, copy, i8)
NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
};
/*****************************************************************************/
@@ -72,12 +72,8 @@ static ncclKern_t const ncclKerns[1] = {
ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
if (cgMode & 0x01) {
CUDACHECK(hipExtLaunchMultiKernelMultiDevice(paramsList, numDevices,
// These flags are to reduce the latency of using this API
#if __HIP__
hipCooperativeLaunchMultiDeviceNoPreSync|hipCooperativeLaunchMultiDeviceNoPostSync));
#else
0));
#endif
// These flags are to reduce the latency of using this API
hipCooperativeLaunchMultiDeviceNoPreSync|hipCooperativeLaunchMultiDeviceNoPostSync));
return ncclSuccess;
}
int savedDev;
@@ -85,37 +81,51 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList,
for (int i = 0; i < numDevices; i++) {
hipLaunchParams* params = paramsList+i;
CUDACHECK(hipSetDevice(cudaDevs[i]));
hipLaunchKernelGGL(((void (*)(struct ncclDevComm*))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclDevComm ***)(params->args)));
hipLaunchKernelGGL(((void (*)(struct ncclWorkElem))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclWorkElem**)params->args));
}
CUDACHECK(hipSetDevice(savedDev));
return ncclSuccess;
}
ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** work, struct ncclWorkElem* base) {
if (channel->workCount == NCCL_MAX_OPS) {
WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS);
return ncclInvalidUsage;
}
int opIndex = channel->workFifoTail%NCCL_MAX_OPS;
struct ncclWork* w = channel->workFifo+opIndex;
struct ncclWorkElem* e = w->elems;
volatile uint8_t* activePtr = (volatile uint8_t*)&e->active;
while (LOAD(activePtr) != 0) sched_yield();
memset(w, 0, sizeof(struct ncclWork));
// Initialize with work elem if provided
if (base) memcpy(e, base, sizeof(struct ncclWorkElem));
STORE(&e->active, 1);
e->index = opIndex;
channel->workFifoTail++;
channel->workCount++;
if (work) *work = w;
return ncclSuccess;
}
static ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
// Only launch blocks where we have work to do.
for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) {
if (comm->channels[c].collCount) params->gridDim.x = c+1;
if (comm->channels[c].workCount) params->gridDim.x = c+1;
}
// Set active = 2 for the last operation and add a no-op on empty channels (p2p case).
for (int c=0; c<params->gridDim.x; c++) {
struct ncclChannel* channel = comm->channels+c;
if (channel->collCount == 0) {
int opIndex = channel->collFifoTail;
struct ncclColl* c = channel->collectives+opIndex;
volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
while (activePtr[0] != 0) sched_yield();
c->args.p2p.delta = -1; // no-op
c->funcIndex = FUNC_INDEX_P2P;
c->args.comm = comm->devComm;
c->active = 1;
opIndex = (opIndex+1)%NCCL_MAX_OPS;
c->nextIndex = opIndex;
channel->collFifoTail = opIndex;
channel->collCount++;
if (channel->workCount == 0) {
struct ncclWork* w;
NCCLCHECK(getNextOp(channel, &w, NULL));
struct ncclWorkElem* e = w->elems;
e->comm = comm->devComm;
e->funcIndex = FUNC_INDEX_P2P;
e->p2p.nThreads = 0;
}
STORE(&channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active, 2);
STORE(&channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].elems[0].active, 2);
}
{ // [RCCL] Wait for any clique-based collectives
@@ -124,9 +134,13 @@ ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
// Find the first operation, choose the kernel accordingly and pass it
// as the first argument.
struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart;
struct ncclChannel* c0 = comm->channels;
struct ncclWork* work = c0->workFifo+((c0->workFifoTail-c0->workCount)%NCCL_MAX_OPS);
struct ncclWorkElem* elem = work->elems;
memcpy(&comm->args, elem, sizeof(struct ncclWorkElem));
// As we inline the first coll directly, we can free it immediately.
if (elem->funcIndex != FUNC_INDEX_P2P) elem->active = 0;
comm->args = comm->devComm;
params->func = (void *)ncclKerns[0];
return ncclSuccess;
}
@@ -137,7 +151,7 @@ ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
bool done = false;
while (done == false) {
if (val >= comm->intraRanks) {
WARN("Trying to launch too many collectives");
WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS);
return ncclInvalidUsage;
}
if (val+1 == comm->intraRanks) {
@@ -157,7 +171,7 @@ ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
int val = LOAD(ptr);
if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
WARN("Trying to launch too many collectives");
WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS);
return ncclInternalError;
}
return ncclSuccess;
@@ -219,7 +233,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
hipEvent_t startEvent;
hipEvent_t stopEvent;
if (comm->launchMode == ncclComm::PARALLEL) {
hipLaunchKernelGGL(((void (*)(struct ncclDevComm*))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclDevComm ***)(params->args)));
hipLaunchKernelGGL(((void (*)(struct ncclWorkElem))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclWorkElem**)params->args));
} else {
NCCLCHECK(ncclCpuBarrierOut(comm));
}
@@ -229,13 +243,18 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
// launch and the ncclProxyStart call could cause a deadlock.
// Also, starting the proxies after the CUDA launch seems to be better for
// performance (latency).
uint64_t max = 0ULL;
for (int r=0; r<params->gridDim.x; r++) {
struct ncclChannel* channel = comm->channels+r;
channel->collStart = channel->collFifoTail;
channel->collCount = 0;
max = std::max(max, channel->workFifoTail);
channel->workCount = 0;
}
for (int r=0; r<std::max(comm->nChannels, comm->p2pnChannels); r++) {
struct ncclChannel* channel = comm->channels+r;
channel->workFifoTail = max;
}
params->gridDim.x = params->blockDim.x = 0;
comm->lastOpCount = comm->opCount;
comm->lastOpCount = max;
NCCLCHECK(ncclProxyStart(comm));
return ncclSuccess;
}
@@ -261,6 +280,13 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
struct ncclComm* comm = info->comm;
float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete.
// Find algorithm / protocol.
if (info->coll == ncclFuncAllToAll || info->coll == ncclFuncAllToAllv) {
info->algorithm = NCCL_ALGO_RING;
info->protocol = NCCL_PROTO_SIMPLE;
info->nChannels = comm->nChannels;
info->nThreads = NCCL_MAX_NTHREADS;
return ncclSuccess;
}
info->algorithm = -1;
info->protocol = -1;
int nAlgos = NCCL_NUM_ALGORITHMS;
@@ -281,10 +307,6 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
}
}
}
if (info->coll == ncclCollAllToAll || info->coll == ncclCollGather || info->coll == ncclCollScatter || info->coll == ncclCollAllToAllv) {
info->algorithm = NCCL_ALGO_RING;
info->protocol = NCCL_PROTO_SIMPLE;
}
if (info->algorithm == -1 || info->protocol == -1) {
WARN("Error : no algorithm/protocol available");
return ncclInternalError;
@@ -292,16 +314,12 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
//if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
int nc = (info->algorithm == NCCL_ALGO_COLLNET) ? comm->nChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down
if (info->comm->topo->type == RCCL_TOPO_4P2H_ROME && (info->coll == ncclCollAllToAll ||
info->coll == ncclCollGather || info->coll == ncclCollScatter || info->coll == ncclCollAllToAllv))
nc = 2;
int nc = (info->nChannels > 0) ? info->nChannels :
(info->algorithm == NCCL_ALGO_COLLNET) ? comm->nChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down
int nt = comm->maxThreads[info->algorithm][info->protocol];
int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
while (info->nBytes < nc*nt*threadThreshold) {
// do not reduce channels in case of alltoall
if (info->algorithm != NCCL_ALGO_COLLNET && info->coll != ncclCollAllToAll &&
info->coll != ncclCollGather && info->coll != ncclCollScatter && info->coll != ncclCollAllToAllv && nc >= 2) nc--;
if (info->algorithm != NCCL_ALGO_COLLNET && nc >= 2) nc--;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
// do not reduce threads count on VEGA
#else
@@ -312,6 +330,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync
if (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_TREE) nt += WARP_SIZE;
#endif
info->nChannels = nc;
info->nThreads = nt;
@@ -320,19 +339,17 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
static ncclResult_t getPatternInfo(struct ncclInfo* info) {
switch (info->coll) {
case ncclCollBroadcast:
case ncclFuncBroadcast:
info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeDown : ncclPatternPipelineFrom; break;
case ncclCollReduce:
case ncclFuncReduce:
info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break;
case ncclCollReduceScatter:
case ncclCollAllGather:
case ncclFuncReduceScatter:
case ncclFuncAllGather:
info->pattern = ncclPatternRing; break;
case ncclCollAllReduce:
case ncclFuncAllReduce:
info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUp : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
case ncclCollGather:
case ncclCollScatter:
case ncclCollAllToAll:
case ncclCollAllToAllv:
case ncclFuncAllToAll:
case ncclFuncAllToAllv:
info->pattern = ncclPatternAll; break;
default:
WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
@@ -350,55 +367,47 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
case ncclPatternPipelineTo:
case ncclPatternCollTreeUp:
case ncclPatternCollTreeDown:
case ncclPatternAll:
info->nstepsPerLoop = info->nchunksPerLoop = 1; break;
info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
case ncclPatternRing:
info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
case ncclPatternRingTwice:
info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break;
case ncclPatternAll:
info->nstepsPerLoop = 1;
info->nchunksPerLoop = info->comm->nRanks; break;
default:
WARN("Unknown pattern %d\n", info->pattern);
WARN("Unknown pattern %d", info->pattern);
return ncclInternalError;
}
return ncclSuccess;
}
static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
coll->args.sendbuff = info->sendbuff;
coll->args.recvbuff = info->recvbuff;
coll->args.comm = info->comm->devComm;
coll->args.opCount = info->comm->opCount;
if (info->coll == ncclCollSendRecv) {
coll->args.p2p.sendCount = info->sendbytes;
coll->args.p2p.recvCount = info->recvbytes;
coll->args.p2p.delta = info->delta;
coll->funcIndex = FUNC_INDEX_P2P;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
coll->args.p2p.nThreads = info->nThreads = info->comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
#else
coll->args.p2p.nThreads = info->nThreads = info->comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]+2*WARP_SIZE;
#endif
return ncclSuccess;
}
static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyArgs* proxyArgs /* output */) {
work->comm = info->comm->devComm;
// Set nstepsPerLoop and nchunksPerLoop
NCCLCHECK(getAlgoInfo(info));
NCCLCHECK(getPatternInfo(info));
NCCLCHECK(getLoopInfo(info));
if (info->coll == ncclCollAllToAllv) {
coll->args.a2av.count = info->count;
coll->args.a2av.nChannels = info->nChannels;
coll->args.a2av.nThreads = info->nThreads;
} else {
coll->args.coll.root = info->root;
coll->args.coll.count = info->count;
coll->args.coll.nChannels = info->nChannels;
coll->args.coll.nThreads = info->nThreads;
}
if ((info->coll == ncclFuncAllToAll || info->coll == ncclFuncAllToAllv)
&& info->comm->topo->nodes[NET].count == 0 && info->comm->topo->type == RCCL_TOPO_4P2H_ROME)
info->nChannels =info->comm->p2pnChannels;
coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
work->opCount = info->comm->opCount;
work->sendbuff = info->sendbuff;
work->recvbuff = info->recvbuff;
if (info->coll == ncclFuncAllToAllv) {
work->a2av.count = info->count;
work->a2av.nChannels = info->nChannels;
} else {
work->coll.root = info->root;
work->coll.count = info->count;
work->coll.nChannels = info->nChannels;
}
work->nThreads = info->nThreads;
work->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
{ // [RCCL] Check for clique-based kernel support
if (info->comm->cliqueManager->IsSupported(info->coll,
@@ -416,13 +425,13 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
info->protocol = NCCL_PROTO_CLIQUE;
// Determine the number of channels to use for clique-kernel
NCCLCHECK(info->comm->cliqueManager->GetNumChannelsToUse(info->coll,
info->count,
info->datatype,
info->op,
info->comm->nChannels,
&coll->args.clique.nChannels));
coll->args.clique.count = info->count;
coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
info->count,
info->datatype,
info->op,
info->comm->nChannels,
&work->clique.nChannels));
work->clique.count = info->count;
work->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
return ncclSuccess;
}
} // [RCCL]
@@ -436,25 +445,25 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_SIMPLE) {
if (info->pattern == ncclPatternTreeUpDown) {
// Optimize chunkSize / nSteps
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*8 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth && chunkSize > 32768) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
}
// Use lastChunkSize as chunkSize
coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
// Optimize chunkSize / nSteps
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*16 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth && chunkSize > 32768) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*16 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*4 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth && chunkSize > 32768) chunkSize /= 2;
// Use lastChunkSize as chunkSize
coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->protocol == NCCL_PROTO_LL) {
const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
coll->args.coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
ALIGN_SIZE(coll->args.coll.lastChunkSize, info->nThreads*sizeof(uint64_t));
coll->args.coll.lastChunkSize /= ncclTypeSize(info->datatype);
work->coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
ALIGN_SIZE(work->coll.lastChunkSize, info->nThreads*sizeof(uint64_t));
work->coll.lastChunkSize /= ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) {
int nNodes = info->comm->nNodes;
float ppn = info->comm->nRanks / (float)nNodes;
@@ -462,7 +471,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2;
// Use lastChunkSize as chunkSize
coll->args.coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
work->coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
}
// Compute nSteps for proxies
@@ -470,20 +479,20 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
if (info->protocol == NCCL_PROTO_LL) chunkEffectiveSize /= 2;
if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS;
//if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol);
int nLoops;
if (info->pattern != ncclPatternAll)
nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
else
nLoops = (int)(DIVUP(info->nBytes, (((size_t)((info->nChannels >= info->comm->nRanks ? (info->nChannels/info->comm->nRanks) : 1))))*info->comm->nRanks*info->nchunksPerLoop*chunkEffectiveSize));
int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
proxyArgs->sliceSteps = sliceSteps;
proxyArgs->chunkSteps = chunkSteps;
proxyArgs->protocol = info->protocol;
proxyArgs->opCount = info->comm->opCount;
proxyArgs->dtype = info->datatype;
proxyArgs->redOp = info->op;
if (info->coll != ncclCollAllToAllv) TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d ces %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, chunkEffectiveSize, info->nBytes, info->protocol, info->nChannels, info->nThreads,
// This is used by P2P to reduce the receive buffer size. We don't use it in collectives
// because some protocols need to transmit more than the total size, plus they sometimes
// round up
proxyArgs->recvbytes = stepSize*proxyArgs->sliceSteps;
TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
proxyArgs->opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
nLoops, proxyArgs->nsteps, info->comm);
return ncclSuccess;
}
@@ -500,33 +509,26 @@ static ncclResult_t checkSetStream(struct ncclInfo* info) {
}
ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
if (info->comm->nRanks == 1 && info->coll != ncclCollSendRecv) {
if (info->comm->nRanks == 1) {
if (info->sendbuff != info->recvbuff)
CUDACHECK(hipMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, hipMemcpyDeviceToDevice, info->stream));
return ncclSuccess;
}
struct ncclColl coll;
struct ncclWorkElem work;
struct ncclProxyArgs proxyArgs;
memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
NCCLCHECK(computeColl(info, &coll, &proxyArgs));
NCCLCHECK(computeColl(info, &work, &proxyArgs));
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
int nChannels = info->coll == ncclCollSendRecv ? 1 : coll.args.coll.nChannels;
int nChannels = (info->coll == ncclFuncAllToAllv) ? work.a2av.nChannels : work.coll.nChannels;
int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1;
for (int bid=0; bid<nChannels*nSubChannels; bid++) {
int channelId = (info->coll == ncclCollSendRecv) ? info->channelId :
info->comm->myParams->gridDim.x % info->comm->nChannels;
int channelId = info->comm->myParams->gridDim.x % info->comm->nChannels;
struct ncclChannel* channel = info->comm->channels+channelId;
if (channel->collCount == NCCL_MAX_OPS) {
WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS);
return ncclInvalidUsage;
}
// Proxy
proxyArgs.channel = channel;
// Adjust pattern for CollNet based on channel index
@@ -534,84 +536,165 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
info->pattern = (channelId < info->comm->nChannels/nSubChannels) ? ncclPatternCollTreeUp : ncclPatternCollTreeDown;
}
if (info->coll == ncclCollSendRecv) {
info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
NCCLCHECK(ncclProxySaveP2p(info, channel));
} else if (info->coll == ncclCollAllToAll || info->coll == ncclCollScatter || info->coll == ncclCollGather || info->coll == ncclCollAllToAllv) {
NCCLCHECK(ncclProxySaveA2a(&proxyArgs, info));
} else {
NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
}
info->comm->myParams->gridDim.x++;
int opIndex = channel->collFifoTail;
struct ncclColl* c = channel->collectives+opIndex;
volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
while (LOAD(activePtr) != 0) sched_yield();
if (proxyArgs.nsteps) NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
memcpy(c, &coll, sizeof(struct ncclColl));
if (info->coll == ncclCollAllToAllv) {
c->args.a2av.extra = channel->collectivesExtra + info->comm->nRanks*4*opIndex;
memcpy(c->args.a2av.extra, info->sendcounts, sizeof(size_t*)*(info->comm->nRanks));
memcpy(c->args.a2av.extra+info->comm->nRanks, info->sdispls, sizeof(size_t*)*(info->comm->nRanks));
memcpy(c->args.a2av.extra+info->comm->nRanks*2, info->recvcounts, sizeof(size_t*)*(info->comm->nRanks));
memcpy(c->args.a2av.extra+info->comm->nRanks*3, info->rdispls, sizeof(size_t*)*(info->comm->nRanks));
c->args.a2av.bid = bid % coll.args.coll.nChannels;
} else if (info->coll != ncclCollSendRecv) {
c->args.coll.bid = bid % coll.args.coll.nChannels;
info->comm->myParams->gridDim.x++;
if (info->coll == ncclFuncAllToAllv) {
work.a2av.bid = bid % work.a2av.nChannels;
} else {
work.coll.bid = bid % nChannels;
}
// [RCCL] Setup pointers to where all the input/output pointers will be
if (info->protocol == NCCL_PROTO_CLIQUE) {
NCCLCHECK(info->comm->cliqueManager->SetCliqueCollectiveArgs(&c->args));
NCCLCHECK(info->comm->cliqueManager->SetCliqueArgs(&work));
}
// [/RCCL]
STORE(&c->active, 1);
opIndex = (opIndex+1)%NCCL_MAX_OPS;
c->nextIndex = opIndex;
channel->collFifoTail = opIndex;
channel->collCount++;
struct ncclWork* w;
NCCLCHECK(getNextOp(channel, &w, &work));
if (info->coll == ncclFuncAllToAllv) {
struct ncclWorkElem* e = w->elems;
size_t* params = channel->a2avParams + info->comm->nRanks*4*e->index;
memcpy(params, info->sendcounts, sizeof(size_t*)*(info->comm->nRanks));
memcpy(params+info->comm->nRanks, info->sdispls, sizeof(size_t*)*(info->comm->nRanks));
memcpy(params+info->comm->nRanks*2, info->recvcounts, sizeof(size_t*)*(info->comm->nRanks));
memcpy(params+info->comm->nRanks*3, info->rdispls, sizeof(size_t*)*(info->comm->nRanks));
}
}
info->comm->opCount++;
return ncclSuccess;
}
// Save p2p operations in comm->p2plist. Operations will be posted to channels
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
ncclResult_t ncclSaveCommKernels(ncclComm_t comm) {
if (comm->asyncOpCount == 0) {
return ncclSuccess;
} else if (comm->asyncOpCount == 1) {
// No aggregation
struct ncclInfo* info = comm->asyncOps;
info->nChannels = 0;
NCCLCHECK(ncclSaveKernel(info));
} else {
// Aggregation
size_t channelSize = NCCL_AGG_CHANNEL_SIZE * comm->nRanks; // scale channel size based on nranks as latency increases
// Reduce the per-channel size if we cannot fully utilize the channels
while (comm->asyncTotalSize < channelSize * comm->nChannels && channelSize > NCCL_MIN_CHANNEL_SIZE) channelSize /= 2;
for (int c = 0; c < comm->asyncOpCount; c++) {
struct ncclInfo* info = comm->asyncOps+c;
info->nChannels = std::min((int)DIVUP(info->nBytes, channelSize), comm->nChannels); // assign number of channels
NCCLCHECK(ncclSaveKernel(info));
}
}
// Reset counters
comm->asyncOpCount = 0;
comm->asyncTotalSize = 0;
return ncclSuccess;
}
static ncclResult_t ncclSaveAsyncColl(struct ncclInfo* info) {
ncclComm_t comm = info->comm;
if (comm->asyncOpCount >= NCCL_MAX_OPS) {
WARN("Too many async operations in progress, max is %d", NCCL_MAX_OPS);
return ncclInvalidUsage;
}
memcpy(comm->asyncOps+comm->asyncOpCount, info, sizeof(struct ncclInfo));
comm->asyncOpCount++;
comm->asyncTotalSize += info->nBytes;
return ncclSuccess;
}
// Save p2p operations in comm->p2pSends and p2pRecvs. Operations will be posted to channels
// during ncclGroupEnd()
ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
struct ncclComm* comm = info->comm;
struct ncclP2Plist* p2plist = &comm->p2plist;
int peer = info->root;
p2plist->count++;
ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
if (info->recvbuff == NULL) {
if (info->opName[0] == 'S') { // Send
if (peer != comm->rank) {
int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
if (comm->channels[channelId].peers[peer].send.connected == 0) {
p2plist->connect.send[channelId*comm->nRanks+p2plist->connect.nsend[channelId]++] = peer;
comm->connectSend[peer] |= (1<<channelId);
comm->connect = 1;
}
}
}
p2plist->peerlist[info->root].sendbytes = nBytes;
p2plist->peerlist[info->root].sendbuff = info->sendbuff;
NCCLCHECK(enqueueP2pInfo(comm->p2pSends+info->root, (void*)info->sendbuff, nBytes));
comm->p2pSendCount++;
} else {
if (peer != comm->rank) {
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
if (comm->channels[channelId].peers[peer].recv.connected == 0) {
p2plist->connect.recv[channelId*comm->nRanks+p2plist->connect.nrecv[channelId]++] = peer;
comm->connectRecv[peer] |= (1<<channelId);
comm->connect = 1;
}
}
}
p2plist->peerlist[info->root].recvbytes = nBytes;
p2plist->peerlist[info->root].recvbuff = info->recvbuff;
NCCLCHECK(enqueueP2pInfo(comm->p2pRecvs+info->root, info->recvbuff, nBytes));
comm->p2pRecvCount++;
}
return ncclSuccess;
}
static int getSegment(struct ncclInfo* info, struct ncclWork* work) {
const int e = (info->comm->topo->nodes[NET].count == 0 && info->comm->topo->type == RCCL_TOPO_4P2H_ROME)
? 1 : NCCL_MAX_WORK_ELEMENTS;
for (int s=0; s<e && work->elems[s].p2p.delta != info->delta; s++) {
if (work->elems[s].p2p.nThreads == 0) return s;
}
return -1;
}
static ncclResult_t saveP2pOp(struct ncclInfo* info /* input */, struct ncclWork* work, int s) {
struct ncclWorkElem* elem = work->elems+s;
elem->comm = info->comm->devComm;
elem->funcIndex = FUNC_INDEX_P2P;
elem->nThreads = info->nThreads = NCCL_MAX_NTHREADS;
elem->sendbuff = info->sendbuff;
elem->recvbuff = info->recvbuff;
elem->opCount = info->comm->lastOpCount;
elem->p2p.sendCount = info->sendbytes;
elem->p2p.recvCount = info->recvbytes;
elem->p2p.delta = info->delta;
const int nsegments = s+1;
int nThreads = 512;
while (nsegments*nThreads > 256) nThreads /= 2;
//if (nThreads >= 128) nThreads += WARP_SIZE;
for (int i=0; i<nsegments; i++) work->elems[i].p2p.nThreads = nThreads;
return ncclSuccess;
}
ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info) {
int channelId = info->channelId;
struct ncclChannel* channel = info->comm->channels+channelId;
// Try to reuse last p2p operation if not full yet
int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS;
struct ncclWork* w = channel->workFifo+opIndex;
int segment = -1;
if (channel->workCount && w->elems[0].funcIndex == FUNC_INDEX_P2P && w->elems[NCCL_MAX_WORK_ELEMENTS-1].p2p.nThreads == 0) {
// Try to pack more segments into a single operation
segment = getSegment(info, w);
}
if (segment == -1) {
NCCLCHECK(getNextOp(channel, &w, NULL));
segment = 0;
}
NCCLCHECK(ncclProxySaveP2p(info, channel, segment));
NCCLCHECK(saveP2pOp(info, w, segment));
info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
return ncclSuccess;
}
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
// Launch asynchronously if needed
if (ncclAsyncMode()) {
@@ -629,19 +712,22 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
NCCLCHECKGOTO(checkSetStream(info), ret, end);
if (info->coll == ncclCollAllToAllv)
if (info->coll == ncclFuncAllToAllv)
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p sendcounts %p sdispls %p recvbuff %p recvcounts %p rdispls %p datatype %d typesize %zi op %d root %d comm %p [nranks=%d] stream %p",
info->opName, info->comm->opCount, info->sendbuff, info->sendcounts, info->sdispls, info->recvbuff, info->recvcounts, info->rdispls,
info->datatype, info->count, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
else
else if (info->coll != ncclFuncSendRecv)
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
if (info->coll == ncclCollSendRecv) { //p2p stored separately
if (info->coll == ncclFuncSendRecv) { //p2p stored separately
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
info->opName, info->comm->lastOpCount, info->sendbuff, info->recvbuff, info->count,
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
NCCLCHECKGOTO(ncclSaveP2p(info), ret, end);
} else {
NCCLCHECKGOTO(ncclSaveKernel(info), ret, end);
NCCLCHECKGOTO(ncclSaveAsyncColl(info), ret, end);
}
end:
@@ -653,7 +739,7 @@ end:
NCCLCHECK(ArgsCheck(info));
NCCLCHECK(checkSetStream(info));
if (info->coll == ncclCollAllToAllv)
if (info->coll == ncclFuncAllToAllv)
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p sendcounts %p sdispls %p recvbuff %p recvcounts %p rdispls %p datatype %d typesize %zi op %d root %d comm %p [nranks=%d] stream %p",
info->opName, info->comm->opCount, info->sendbuff, info->sendcounts, info->sdispls, info->recvbuff, info->recvcounts, info->rdispls,
info->datatype, info->count, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
+77 -102
Voir le fichier
@@ -25,14 +25,10 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->ring.prev = channel->ring.next = -1;
channel->treeUp.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeUp.down[i] = -1;
channel->treeDn.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeDn.down[i] = -1;
channel->collTreeUp.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeUp.down[i] = -1;
channel->collTreeDn.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeDn.down[i] = -1;
channel->tree.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
channel->collTree.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTree.down[i] = -1;
int* ringIntra = ringGraph->intra+c*localRanks;
int* treeIntra = treeGraph->intra+c*localRanks;
@@ -46,33 +42,21 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
channel->ring.next = (i == localRanks-1) ? -1 : ringIntra[i+1];
}
if (treeIntra[i] == rank) {
int recvIndex = 0, sendIndex = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
int parentIndex = 0;
int child0Index = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
int child1Index = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
// Tree loop always flows in the same direction. Other trees are symmetric, i.e.
// up/down go in reverse directions
int sym = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP ? 0 : 1;
// Down tree is common
topoRanks->treeDnRecv[c] = treeIntra[recvIndex];
topoRanks->treeDnSend[c] = treeIntra[sendIndex];
channel->treeDn.up = treeIntra[prev];
channel->treeDn.down[0] = treeIntra[next];
// Up tree depends on the pattern
topoRanks->treeUpRecv[c] = sym ? topoRanks->treeDnSend[c] : topoRanks->treeDnRecv[c];
topoRanks->treeUpSend[c] = sym ? topoRanks->treeDnRecv[c] : topoRanks->treeDnSend[c];
channel->treeUp.down[0] = sym ? channel->treeDn.down[0] : channel->treeDn.up ;
channel->treeUp.up = sym ? channel->treeDn.up : channel->treeDn.down[0];
topoRanks->treeToParent[c] = treeIntra[parentIndex];
topoRanks->treeToChild0[c] = treeIntra[child0Index];
topoRanks->treeToChild1[c] = treeIntra[child1Index];
channel->tree.up = i == 0 ? -1 : treeIntra[i-1];
channel->tree.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1];
}
if (collNetIntra[i] == rank) {
int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
// CollTrees are always symmetric, i.e.
// up/down go in reverse directions
channel->collTreeDn.up = collNetIntra[prev];
channel->collTreeDn.down[0] = collNetIntra[next];
channel->collTreeUp.down[0] = channel->collTreeDn.down[0];
channel->collTreeUp.up = channel->collTreeDn.up;
channel->collTree.up = collNetIntra[prev];
channel->collTree.down[0] = collNetIntra[next];
}
}
topoRanks->ringPrev[c] = channel->ring.prev;
@@ -122,72 +106,66 @@ static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstR
return ncclSuccess;
}
static ncclResult_t setTreeUp(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int u0, int u1) {
if (u0 != -1) tree0->up = indexes[u0];
if (u1 != -1) tree1->up = indexes[u1];
static ncclResult_t setTreeUp(struct ncclTree* tree, int* indexes, int u) {
if (u == -1) return ncclSuccess;
tree->up = indexes[u];
return ncclSuccess;
}
static ncclResult_t addRanksDown(int* down, int* indexes, int r0, int r1) {
static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
if (d == -1) return ncclSuccess;
int x = 0;
if (down[x] >= 0) x++;
if (down[x] >= 0) {
WARN("Internal error : tree already has more than one child (%d %d %d)\n", down[0], down[1], down[2]);
while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++;
if (x == NCCL_MAX_TREE_ARITY) {
WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]);
return ncclInternalError;
}
if (r0 != -1) down[x++] = indexes[r0];
if (r1 != -1) down[x++] = indexes[r1];
tree->down[x] = indexes[d];
return ncclSuccess;
}
static ncclResult_t setTreeDown(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int d0_0, int d0_1, int d1_0, int d1_1) {
NCCLCHECK(addRanksDown(tree0->down, indexes, d0_0, d0_1));
NCCLCHECK(addRanksDown(tree1->down, indexes, d1_0, d1_1));
return ncclSuccess;
}
static ncclResult_t openRing(struct ncclTree* tree, int rank, int upRank) {
if (tree->down[0] == upRank) tree->down[0] = -1;
if (rank == upRank) tree->up = -1;
return ncclSuccess;
}
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* treeUpSend, int* treeDnRecv, int* treeDnSend, int* firstRanks) {
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* firstRanks, int* treePatterns) {
const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node;
int* indexesSend, *indexesRecv;
NCCLCHECK(ncclCalloc(&indexesSend, nNodes));
NCCLCHECK(ncclCalloc(&indexesRecv, nNodes));
int* ranksToParent, *ranksToChild0, *ranksToChild1;
NCCLCHECK(ncclCalloc(&ranksToParent, nNodes));
NCCLCHECK(ncclCalloc(&ranksToChild0, nNodes));
NCCLCHECK(ncclCalloc(&ranksToChild1, nNodes));
// Compute tree depth. Not an exact value but a good approximation in most
// cases
int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
int u0, d0_0, d0_1, u1, d1_0, d1_1;
NCCLCHECK(ncclGetDtree(nNodes, node, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1));
int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel0 = comm->channels+c;
struct ncclChannel* channel1 = channel0+nChannels;
NCCLCHECK(getIndexes(treeUpSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
NCCLCHECK(getIndexes(treeUpRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
NCCLCHECK(openRing(&channel0->treeUp, comm->rank, indexesSend[node]));
NCCLCHECK(openRing(&channel1->treeUp, comm->rank, indexesSend[node]));
int root = indexesSend[node];
if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeUp, &channel1->treeUp, indexesRecv, u0, u1));
if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeUp, &channel1->treeUp, indexesSend, d0_0, d0_1, d1_0, d1_1));
NCCLCHECK(getIndexes(treeDnSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
NCCLCHECK(getIndexes(treeDnRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
NCCLCHECK(openRing(&channel0->treeDn, comm->rank, u0 == -1 ? root : indexesRecv[node]));
NCCLCHECK(openRing(&channel1->treeDn, comm->rank, u1 == -1 ? root : indexesRecv[node]));
if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeDn, &channel1->treeDn, indexesRecv, d0_0, d0_1, d1_0, d1_1));
if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeDn, &channel1->treeDn, indexesSend, u0, u1));
TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c, channel0->treeUp.up, channel0->treeUp.down[0], channel0->treeUp.down[1], channel0->treeUp.down[2]);
TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c+nChannels, channel1->treeUp.up, channel1->treeUp.down[0], channel1->treeUp.down[1], channel1->treeUp.down[2]);
TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c, channel0->treeDn.up, channel0->treeDn.down[0], channel0->treeDn.down[1], channel0->treeDn.down[2]);
TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c+nChannels, channel1->treeDn.up, channel1->treeDn.down[0], channel1->treeDn.down[1], channel1->treeDn.down[2]);
channel0->treeUp.depth = channel1->treeUp.depth = depth;
NCCLCHECK(getIndexes(treeToParent+c*comm->nRanks, ranksToParent, nNodes, firstRanks));
NCCLCHECK(getIndexes(treeToChild0+c*comm->nRanks, ranksToChild0, nNodes, firstRanks));
NCCLCHECK(getIndexes(treeToChild1+c*comm->nRanks, ranksToChild1, nNodes, firstRanks));
if (comm->rank == ranksToParent[node]) {
NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ranksToChild0 : ranksToChild1, t0u));
NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ranksToChild0 : ranksToChild1, t1u));
}
if (comm->rank == ranksToChild0[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d0));
NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d0));
}
if (comm->rank == ranksToChild1[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d1));
NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d1));
}
if (comm->rank == ranksToParent[node] ||
comm->rank == ranksToChild0[node] ||
comm->rank == ranksToChild1[node]) {
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
}
channel0->tree.depth = channel1->tree.depth = depth;
}
free(indexesSend);
free(indexesRecv);
free(ranksToParent);
free(ranksToChild0);
free(ranksToChild1);
return ncclSuccess;
}
@@ -200,13 +178,13 @@ ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclChannel* channel = comm->channels+c;
// Set root of collTree to id nranks
if (rank == collNetGraph->intra[sendIndex+c*comm->localRanks]) { // is master
channel->collTreeUp.up = channel->collTreeDn.up = nranks;
channel->collTree.up = nranks;
}
if (rank == collNetGraph->intra[sendEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
channel->collTree.down[0] = -1;
}
channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTreeUp.up, channel->collTreeUp.down[0]);
channel->collTree.depth = depth;
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTree.up, channel->collTree.down[0]);
}
int recvIndex = 0; // recv GPU index is always 0
int recvEndIndex = (recvIndex+comm->localRanks-1)%comm->localRanks;
@@ -214,13 +192,13 @@ ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclChannel* channel = comm->channels+comm->nChannels/2+c;
// Set root of collTree to id nranks
if (rank == collNetGraph->intra[recvIndex+c*comm->localRanks]) { // is master
channel->collTreeUp.up = channel->collTreeDn.up = nranks;
channel->collTree.up = nranks;
}
if (rank == collNetGraph->intra[recvEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
channel->collTree.down[0] = -1;
}
channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTreeDn.up, channel->collTreeDn.down[0]);
channel->collTree.depth = depth;
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTree.up, channel->collTree.down[0]);
}
return ncclSuccess;
}
@@ -237,7 +215,7 @@ int ncclMinNchannels() {
if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings();
if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels();
if (minNchannels > MAXCHANNELS) {
WARN("User asked for a minimum of %d channels, limiting to %d\n", minNchannels, MAXCHANNELS);
WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS);
minNchannels = MAXCHANNELS;
}
if (minNchannels < 0) minNchannels = 0;
@@ -249,41 +227,39 @@ int ncclMaxNchannels() {
if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels();
if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
if (maxNchannels < 1) {
WARN("User asked for a maximum of %d channels, setting it to 1\n", maxNchannels);
WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels);
maxNchannels = 1;
}
return maxNchannels;
}
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets) {
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets) {
// Gather data from all ranks
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend;
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1;
int nranks = comm->nRanks;
int nChannels = comm->nChannels;
NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeUpRecv, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeUpSend, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeDnRecv, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeDnSend, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToParent, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToChild0, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToChild1, nranks*MAXCHANNELS));
for (int i=0; i<nranks; i++) {
for (int c=0; c<nChannels;c++) {
ringRecv[c*nranks+i] = allTopoRanks[i]->ringRecv[c];
ringSend[c*nranks+i] = allTopoRanks[i]->ringSend[c];
ringPrev[c*nranks+i] = allTopoRanks[i]->ringPrev[c];
ringNext[c*nranks+i] = allTopoRanks[i]->ringNext[c];
treeUpRecv[c*nranks+i] = allTopoRanks[i]->treeUpRecv[c];
treeUpSend[c*nranks+i] = allTopoRanks[i]->treeUpSend[c];
treeDnRecv[c*nranks+i] = allTopoRanks[i]->treeDnRecv[c];
treeDnSend[c*nranks+i] = allTopoRanks[i]->treeDnSend[c];
treeToParent[c*nranks+i] = allTopoRanks[i]->treeToParent[c];
treeToChild0[c*nranks+i] = allTopoRanks[i]->treeToChild0[c];
treeToChild1[c*nranks+i] = allTopoRanks[i]->treeToChild1[c];
}
}
// Connect rings and trees. This should also duplicate the channels.
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks));
NCCLCHECK(connectTrees(comm, treeUpRecv, treeUpSend, treeDnRecv, treeDnSend, firstRanks));
NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, firstRanks, treePatterns));
// Duplicate ringPrev/ringNext for ncclBuildRing
memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
@@ -317,10 +293,9 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl
free(ringSend);
free(ringPrev);
free(ringNext);
free(treeUpRecv);
free(treeUpSend);
free(treeDnRecv);
free(treeDnSend);
free(treeToParent);
free(treeToChild0);
free(treeToChild1);
return ncclSuccess;
}
+28 -110
Voir le fichier
@@ -26,88 +26,10 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode*
return ncclSuccess;
}
}
WARN("Could not find node of type %d id %lx\n", t, id);
WARN("Could not find node of type %d id %lx", t, id);
return ncclInternalError;
}
// [RCCL]
// This function traverses only XGMI links (including multi-GPU hops) and builds them into the
// topology system, which corresponds to how XGMI hardware operates
static ncclResult_t ncclTopoSetXgmi(struct ncclTopoSystem* system)
{
// Compute paths to GPU g
for (int g=0; g<system->nodes[GPU].count; g++) {
struct ncclTopoNode *baseNode = system->nodes[GPU].nodes+g;
if (baseNode->paths[baseNode->type] == NULL) {
NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
}
// breadth-first search to set all paths to that node in the system
struct ncclTopoNodeList nodeList;
struct ncclTopoNodeList nextNodeList;
nodeList.count = 1; nodeList.list[0] = baseNode;
nextNodeList.count = 0;
struct ncclTopoLinkList* basePath;
NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
basePath->count = 0;
basePath->width = LOC_WIDTH;
basePath->type = PATH_LOC;
while (nodeList.count) {
nextNodeList.count = 0;
for (int n=0; n<nodeList.count; n++) {
struct ncclTopoNode* node = nodeList.list[n];
struct ncclTopoLinkList* path;
NCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
for (int l=0; l<node->nlinks; l++) {
struct ncclTopoLink* link = node->links+l;
struct ncclTopoNode* remNode = link->remNode;
// Skip non-XGMI links
if (link->type != LINK_NVL) continue;
if (remNode->paths[baseNode->type] == NULL) {
NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count));
}
struct ncclTopoLinkList* remPath;
NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
float width = std::min(path->width, link->width);
if (remPath->width < width) {
// Find reverse link
for (int l=0; l<remNode->nlinks; l++) {
if (remNode->links[l].remNode == node) {
remPath->list[0] = remNode->links+l;
break;
}
}
if (remPath->list[0] == NULL) {
WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
remNode->type, remNode->id, remNode->nlinks, node->type, node->id);
return ncclInternalError;
}
// Copy the rest of the path
for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i];
remPath->count = path->count + 1;
remPath->width = width;
remPath->type = PATH_NVL;
// Add to the list for the next iteration if not already in the list
// In this case, permit GPUs are intermediate XGMI steps
for (int i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) continue;
nextNodeList.list[nextNodeList.count++] = remNode;
}
}
}
memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
}
}
return ncclSuccess;
}
// [/RCCL]
static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
if (baseNode->paths[baseNode->type] == NULL) {
NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
@@ -139,14 +61,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
struct ncclTopoLinkList* remPath;
NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
float width = std::min(path->width, link->width);
// [RCCL] Do not let XGMI paths be overwritten (even if PCIe path may be faster)
// Unless they are of shorter length
// if (remPath->width < width) {
bool notXGMI = remPath->type != PATH_NVL;
if (remPath->width < width && notXGMI) {
// [/RCCL]
if (remPath->width < width) {
// Find reverse link
for (int l=0; l<remNode->nlinks; l++) {
if (remNode->links[l].remNode == node) {
@@ -166,14 +81,13 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
// Start with path type = link type. PATH and LINK types are supposed to match.
// Don't consider LINK_NET as we only care about the NIC->GPU path.
int type = link->type == LINK_NET ? 0 : link->type;
int type = link->type == LINK_NET ? LINK_LOC : link->type;
// Differentiate between one and multiple PCI switches
if (type == PATH_PIX && (node->type == PCI || link->remNode->type == PCI) && remPath->count > 3) type = PATH_PXB;
if (node->type == PCI && remNode->type == PCI) type = PATH_PXB;
// Consider a path going through the CPU as PATH_PHB
if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB;
// Ignore Power CPU in an NVLink path
if (path->type == PATH_NVL && type == PATH_SYS && link->remNode->type == CPU &&
link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) type = 0;
// Set 1 hop NVLink as NVB
//if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;
remPath->type = std::max(path->type, type);
@@ -303,7 +217,7 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
if (l == -1) {
char* str = getenv(levelEnv);
if (str) {
for (int i=0; i<PATH_NET; i++) {
for (int i=0; i<=PATH_SYS; i++) {
if (strcmp(str, topoPathTypeStr[i]) == 0) {
l = i;
break;
@@ -325,9 +239,10 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
}
int ncclTopoUserP2pLevel = -1;
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read) {
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) {
*p2p = 0;
*read = 0;
if (read) *read = 0;
if (intermediateRank) *intermediateRank = -1;
// Get GPUs from topology
int g1, g2;
@@ -337,7 +252,16 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
// GPU not found, we can't use p2p.
return ncclSuccess;
}
// Set intermediate GPU rank, if routing through an intermediate GPU.
struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2;
if (path->count == 2) {
struct ncclTopoNode* intermediateNode = path->list[0]->remNode;
if (intermediateNode->type == GPU && intermediateRank) {
*intermediateRank = intermediateNode->gpu.rank;
}
}
// In general, use P2P whenever we can.
int p2pLevel = PATH_SYS;
@@ -358,6 +282,9 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
if (model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB;
else p2pLevel = PATH_SYS;
}
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
p2pLevel = PATH_PXB;
}
compare:
// Compute the PCI distance and compare with the p2pLevel.
@@ -366,7 +293,7 @@ compare:
if (path->type == PATH_NVL) {
struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2;
// Enable P2P Read for Ampere/NVLink only
if ((gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
if (read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
}
return ncclSuccess;
@@ -392,9 +319,6 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
if (read) { // For reads (sends) only enable under certain conditions
int gdrReadParam = ncclParamNetGdrRead();
if (gdrReadParam == 0) return ncclSuccess;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
return ncclSuccess;
#else
if (gdrReadParam < 0) {
int nvlink = 0;
// Since we don't know whether there are other communicators,
@@ -409,7 +333,6 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
}
if (!nvlink) return ncclSuccess;
}
#endif
}
// Check if we are close enough that it makes sense to enable GDR
@@ -445,10 +368,6 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system));
}
// [RCCL] Add XGMI-only links between GPUs first before any other paths
NCCLCHECK(ncclTopoSetXgmi(system));
// [/RCCL]
// Set direct paths from/to GPUs.
for (int g=0; g<system->nodes[GPU].count; g++) {
// Compute paths to GPU g
@@ -456,8 +375,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
// Update path when we don't want to / can't use GPU Direct P2P
for (int p=0; p<system->nodes[GPU].count; p++) {
int p2p, read;
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, &read));
int p2p;
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
if (p2p == 0) {
// Divert all traffic through the CPU
int cpu;
@@ -565,8 +484,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*
// Local rank
path = system->nodes[GPU].nodes[peer].paths[GPU]+g;
if (path->type == PATH_NVL) {
int sm = system->nodes[GPU].nodes[g].gpu.cudaCompCap;
double nvlWidth = sm < 70 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
float nvlWidth = ncclTopoNVLinkSpeed(system->nodes[GPU].nodes[g].gpu.cudaCompCap);
*nChannels = 2*std::max(1, (int)(path->width / nvlWidth));
} else {
*nChannels = 2;
@@ -600,7 +518,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
}
}
if (comm->topo->type == RCCL_TOPO_4P2H_ROME) {
if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_4P2H_ROME) {
// Adjust P2P channels on Rome
comm->p2pnChannelsPerPeer = 2;
comm->p2pnChannels = 2;
@@ -612,7 +530,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
}
// Init channels that weren't used so far
for (int c=comm->nChannels; c<comm->p2pnChannels; c++) NCCLCHECK(initChannel(comm, c));
for (int c=comm->nChannels; c<std::max(comm->nChannels, comm->p2pnChannels); c++) NCCLCHECK(initChannel(comm, c));
// We want to spread channels used when there aren't many and progressively
// fill the whole space of nChannels. To do so we mirror the bits in the
+1 -1
Voir le fichier
@@ -21,7 +21,7 @@ void dumpLine(int* values, int nranks, const char* prefix) {
ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
for (int r=0; r<nrings; r++) {
char prefix[30];
char prefix[40];
/*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
dumpLine(prev+r*nranks, nranks, prefix);
sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
+26 -2
Voir le fichier
@@ -20,8 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#define MAX_ROME_GPUS 8
#define MAX_ROME_NICS 2
#define MAX_ROME_GPUS 16
#define MAX_ROME_NICS 8
struct rcclRomeModel {
int nGpus;
@@ -235,6 +235,28 @@ static struct rcclRomeModel rome_model_46 = {
.ringBase = "6 5 7 4 1 2 3 0|7 4 6 5 1 0 3 2",
};
static struct rcclRomeModel rome_model_48 = {
.nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3,
.gpuIds = { 0x4a000, 0x50000, 0xa000, 0xf000, 0xcb000, 0xd1000, 0x8a000, 0x90000, },
.nicIds = { },
.gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
.nicNuma = { },
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
.pattern = "20202020",
.ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0",
};
static struct rcclRomeModel rome_model_49 = {
.nGpus = 8, .nCpus = 4, .nNics = 4, .nLinks = 3,
.gpuIds = { 0x4a000, 0x50000, 0xa000, 0xf000, 0xcb000, 0xd1000, 0x8a000, 0x90000, },
.nicIds = { 0x45000, 0x13000, 0xc6000, 0x85000, },
.gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
.nicNuma = { 0, 1, 2, 3, },
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
.pattern = "21212121",
.ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0",
};
static struct rcclRomeModel romeTopoModels[] = {
rome_model_22,
rome_model_25,
@@ -254,4 +276,6 @@ static struct rcclRomeModel romeTopoModels[] = {
rome_model_44,
rome_model_45,
rome_model_46,
rome_model_48,
rome_model_49,
};
+113 -79
Voir le fichier
@@ -25,9 +25,18 @@ static float getMaxWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu
}
return maxWidth;
}
static float getTotalWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
float nvlinkWidth = 0.0, pciWidth = 0.0;
for (int l=0; l<gpu->nlinks; l++) {
struct ncclTopoLink* link = gpu->links+l;
if (link->type == LINK_NVL) nvlinkWidth += link->width;
if (link->type == LINK_PCI) pciWidth = link->width;
}
return std::max(pciWidth, nvlinkWidth);
}
ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
system->maxWidth = 0.0;
system->type = 0;
system->totalWidth = 0.0;
int inter = system->nodes[NET].count;
if (inter == 0 && system->nodes[GPU].count == 1) {
system->maxWidth = LOC_WIDTH;
@@ -36,6 +45,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
for (int g=0; g<system->nodes[GPU].count; g++) {
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
system->maxWidth = std::max(system->maxWidth, getMaxWidth(system, gpu, inter ? NET : GPU));
system->totalWidth = std::max(system->totalWidth, getTotalWidth(system, gpu));
}
return ncclSuccess;
}
@@ -48,7 +58,7 @@ static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode*
return ncclSuccess;
}
}
WARN("Could not find rev link for %d/%d -> %d/%d\n", node1->type, node1->id, node2->type, node2->id);
WARN("Could not find rev link for %d/%ld -> %d/%ld", node1->type, node1->id, node2->type, node2->id);
return ncclInternalError;
}
@@ -149,9 +159,6 @@ struct ncclGpuScore {
int intraNhops;
int intraWidth;
int interNhops;
// [RCCL]
int intraType; // New sort parameter to favor XGMI
// [/RCCL]
int interPciWidth;
int interWidth; // Most important
};
@@ -162,9 +169,6 @@ static int cmpScore(const void * g1, const void * g2) {
int d;
if ((d = (s2->interWidth - s1->interWidth))) return d;
if ((d = (s2->interPciWidth - s1->interPciWidth))) return d;
// [RCCL]
if ((d = (s1->intraType - s2->intraType))) return d; // Prefer XGMI over any other types
// [/RCCL]
if ((d = (s1->interNhops - s2->interNhops))) return d;
if ((d = (s2->intraWidth - s1->intraWidth))) return d;
if ((d = (s1->intraNhops - s2->intraNhops))) return d;
@@ -187,7 +191,7 @@ static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank, int* in
return ncclSuccess;
}
}
WARN("Could not find gpu rank %d\n", rank);
WARN("Could not find gpu rank %d", rank);
return ncclInternalError;
}
@@ -198,7 +202,7 @@ static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int*
return ncclSuccess;
}
}
WARN("Could not find net id %lx\n", id);
WARN("Could not find net id %lx", id);
return ncclInternalError;
}
@@ -224,25 +228,11 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
for (int i=1; i<ngpus; i++) {
int g = (start+i)%ngpus;
if (paths[g].count == 0) continue; // There is no path to that GPU
// [RCCL] - Prune earlier for performance
{
if (paths[g].type > graph->typeIntra) continue; // Skip if the intra path type is already slower than the current target
if (paths[g].width < graph->speedIntra) continue;
if (netPaths)
{
if (netPaths[g].type > graph->typeInter) continue; // Skip if the inter path type is already slower than the current target
if (netPaths[g].width < graph->speedInter) continue;
}
}
// [/RCCL]
if (system->nodes[GPU].nodes[g].used & flag) continue;
scores[count].g = g;
scores[count].startIndex = i;
scores[count].intraNhops = paths[g].count;
scores[count].intraWidth = paths[g].width;
// [RCCL] - Add path type as sort factor
scores[count].intraType = paths[g].type;
// [/RCCL]
if (netPaths) {
scores[count].interNhops = netPaths[g].count;
scores[count].interPciWidth = gpuPciWidth(system->nodes[GPU].nodes+g);
@@ -293,7 +283,6 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time, int type, int index, int g) {
const uint64_t flag = 1ULL<<(graph->nChannels);
struct ncclTopoNode* gpu;
NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, 1, &gpu));
if (gpu) {
gpu->used ^= flag;
@@ -304,6 +293,35 @@ ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopo
return ncclSuccess;
}
static int ncclTopoCountXGMI(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
int count = 0;
for (int c=0; c<graph->nChannels; c++) {
for (int i=0; i<ngpus; i++) {
int g = graph->intra[ngpus*c+i];
int n = graph->intra[ngpus*c+((i+1)%ngpus)];
struct ncclTopoNode *node;
int j;
for (j=0; j<ngpus; j++)
if (system->nodes[GPU].nodes[j].gpu.rank == g) break;
if (j<ngpus) {
node = system->nodes[GPU].nodes+j;
for (int k = 0; k<system->nodes[GPU].count; k++) {
if (node->paths[GPU][k].count == 1) {
struct ncclTopoLink* link = node->paths[GPU][k].list[0];
struct ncclTopoNode* remNode = link->remNode;
if (remNode->gpu.rank == n) {
if (link->type == LINK_NVL)
count ++;
}
}
}
}
}
}
return count;
}
ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
// 1. Constraint to get the same nChannels between Rings and Trees
if (graph->nChannels < graph->minChannels) return ncclSuccess;
@@ -317,6 +335,9 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
// 3. Less hops (but not at the price of going cross NICs)
if (graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
// 4. Prefer graph with more XGMI connections
if (graph->nChannels == refGraph->nChannels
&& ncclTopoCountXGMI(system, refGraph) < ncclTopoCountXGMI(system, graph)) *copy = 1;
return ncclSuccess;
}
@@ -352,11 +373,26 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
// Balanced Tree : count half of the bandwidth on first two GPUs
int nextBackToNet = -1;
float speedInterSave = graph->speedInter;
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
// Count half of the bandwidth on each of the first two GPUs
if (step == 0) nextBackToNet = 1;
else if (net->id != graph->inter[graph->nChannels*2+1]) continue;
graph->speedInter /= 2;
}
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
graph->speedInter = speedInterSave;
if (net) {
graph->inter[graph->nChannels*2+1] = net->id;
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, time));
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time));
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->speedInter /= 2;
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
graph->speedInter = speedInterSave;
}
}
}
@@ -427,9 +463,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
if (paths[i].count < paths[f].count) f = i;
int t = 1 << 10;
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, (f == 0) ? FORCED_ORDER_PCI : 0, &t, NET, n, f));
// [RCCL] Event if forced order PCI is found, continue the search instead of ending early
// if (t == -1) *time = -1;
// [/RCCL]
if (t == -1) *time = -1;
}
// Then try the most local GPUs
@@ -493,13 +527,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
if (system->nodes[NET].count) {
if (pattern == NCCL_TOPO_PATTERN_RING) *backToNet = system->nodes[GPU].count-1;
else if (pattern == NCCL_TOPO_PATTERN_TREE) *backToNet = 0;
else *backToNet = 1;
if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1;
else *backToFirstRank = -1;
else if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) *backToNet = 1;
else *backToNet = 0;
*backToFirstRank = -1;
} else {
*backToNet = -1;
if (pattern == NCCL_TOPO_PATTERN_RING || pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1;
if (pattern == NCCL_TOPO_PATTERN_RING) *backToFirstRank = system->nodes[GPU].count-1;
else *backToFirstRank = -1;
}
return ncclSuccess;
@@ -513,14 +546,6 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time);
} else {
// Intra-node only.
// [RCCL] - Instead of trying PCI ordering, or replaying, just go straight to searching
{
for (int g=0; g<system->nodes[GPU].count; g++) {
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g));
}
return ncclSuccess;
}
// [/RCCL]
if (graph->nChannels == 0) {
// Try PCI order first
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0));
@@ -544,7 +569,7 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
/* User defined graph from XML file */
/************************************/
struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "LOC", PATH_LOC }, { NULL, 0 } };
struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "NVB", PATH_NVB}, { "LOC", PATH_LOC }, { NULL, 0 } };
ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
int* inter = graph->inter+2*c;
@@ -839,17 +864,6 @@ static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclT
return ncclSuccess;
}
static bool getGpuNetCount(struct ncclTopoSystem* system, int id, int *g, int *n, int nnet, int *net_map) {
*g = 0; *n = 0;
int i;
if (ncclTopoIdToIndex(system, CPU, id, &i) == ncclInternalError) return false;
for (int j = 0; j < nnet; j++)
if (system->nodes[NET].nodes[net_map[j]].paths[CPU][i].count == 2) (*n)++;
for (int j = 0; j < system->nodes[GPU].count; j++)
if (system->nodes[GPU].nodes[j].paths[CPU][i].count == 2) (*g)++;
return true;
}
static ncclResult_t ncclGpuIdToIndex(struct ncclTopoSystem* system, int id, int* index) {
*index = -1;
for (int i=0; i<system->nodes[GPU].count; i++) {
@@ -868,12 +882,18 @@ static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRo
romeTopo->nNics = 0;
romeTopo->nLinks = 0;
for (int i = 0; i < romeTopo->nGpus; i ++) {
int gpu, n;
int gpu, n, m, distance;
NCCLCHECK(ncclGpuIdToIndex(system, i, &gpu));
romeTopo->gpuIds[i] = system->nodes[GPU].nodes[gpu].id;
for (n = 0; n < romeTopo->nCpus; n++)
if (system->nodes[GPU].nodes[gpu].paths[CPU][n].count == 2) break;
if (n < romeTopo->nCpus) romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[n].id;
m = 0;
distance = system->nodes[GPU].nodes[gpu].paths[CPU][m].count;
for (n = 1; n < romeTopo->nCpus; n++) {
if (system->nodes[GPU].nodes[gpu].paths[CPU][n].count < distance) {
distance = system->nodes[GPU].nodes[gpu].paths[CPU][n].count;
m = n;
}
}
if (m < romeTopo->nCpus) romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[m].id;
struct ncclTopoNode* node = system->nodes[GPU].nodes+gpu;
if (node->paths[GPU] == NULL) continue;
@@ -911,24 +931,32 @@ static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRo
}
}
for (int i = 0; i < romeTopo->nNics; i ++) {
int net, n, m, distance;
NCCLCHECK(ncclTopoIdToIndex(system, NET, net_map[i], &net));
m = 0;
distance = system->nodes[NET].nodes[net].paths[CPU][m].count;
for (n = 0; n < romeTopo->nCpus; n++)
if (system->nodes[NET].nodes[net].paths[CPU][n].count < distance) {
distance = system->nodes[NET].nodes[net].paths[CPU][n].count;
m = n;
}
if (m < romeTopo->nCpus) romeTopo->nicNuma[i] = system->nodes[CPU].nodes[m].id;
else return ncclSuccess;
}
// number of GPUs and NICs on each numa node is used as first screening pattern
for (int i = 0; i < romeTopo->nCpus; i++) {
int g, n;
getGpuNetCount(system, i, &g, &n, romeTopo->nNics, net_map);
int g = 0, n = 0;
for (int j = 0; j < romeTopo->nGpus; j++)
if (romeTopo->gpuNuma[j] == i) g++;
for (int j = 0; j < romeTopo->nNics; j++)
if (romeTopo->nicNuma[j] == i) n++;
pattern[i*2] = '0' + g;
pattern[i*2+1] = '0' + n;
}
pattern[romeTopo->nCpus*2] = 0;
for (int i = 0; i < romeTopo->nNics; i ++) {
int net, n;
NCCLCHECK(ncclTopoIdToIndex(system, NET, net_map[i], &net));
for (n = 0; n < romeTopo->nCpus; n++)
if (system->nodes[NET].nodes[net].paths[CPU][n].count == 2) break;
if (n < romeTopo->nCpus) romeTopo->nicNuma[i] = system->nodes[CPU].nodes[n].id;
else return ncclSuccess;
}
const char* romeModelFile = getenv("RCCL_DUMP_ROME_MODEL_FILE");
if (romeModelFile) {
INFO(NCCL_ENV, "RCCL_DUMP_ROME_MODEL_FILE set by environment to %s", romeModelFile);
@@ -1064,7 +1092,7 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopo
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
float speedArray[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#else
float speedArray[] = { 42.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
float speedArray[] = { 42.0, 30.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#endif
#define NSPEEDS (sizeof(speedArray)/sizeof(float))
@@ -1111,11 +1139,20 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
// TODO: benchmark balance tree vs split tree
//if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
#else
// SPLIT_TREE works better on older archs.
int ccMin;
NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
if (ccMin < 80 && graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
#endif
struct ncclTopoGraph tmpGraph;
memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
// First try crossnic, then decrease speed and finally increase speedIntra.
tmpGraph.pattern = graph->pattern;
int pass = 1;
int speedIndex = 0;
while (speedArray[speedIndex] > system->maxWidth && speedIndex < NSPEEDS-1) speedIndex++;
@@ -1130,7 +1167,7 @@ search:
NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time));
#if 0
printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : "");
printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
for (int c=0; c<graph->nChannels; c++) {
printf("%2d : ", c);
for (int g=0; g<ngpus; g++) {
@@ -1140,7 +1177,8 @@ search:
}
#endif
// Optimal solution, stop here
if (graph->nChannels == graph->maxChannels && graph->speedInter == system->maxWidth) goto done;
if (time == -1) goto done;
if (graph->nChannels*graph->speedInter >= system->totalWidth) goto done;
if (pass == 1) {
// First pass, we don't have a solution yet ; try other options
@@ -1154,7 +1192,7 @@ search:
if (time != -1) globalTimeout += time;
else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
if (globalTimeout < 0) goto done;
if (globalTimeout < 0 && graph->nChannels) goto done;
int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
@@ -1169,10 +1207,6 @@ search:
tmpGraph.typeInter = PATH_PIX;
// Try a simpler tree
if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) {
tmpGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
goto search;
}
if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
goto search;
@@ -1222,7 +1256,7 @@ done:
}
if (graph->nChannels == 0 && graph->collNet == 0) {
WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern);
WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
graph->inter[0] = graph->inter[1] = 0;
graph->speedIntra = graph->speedInter = 0.1;
+41 -10
Voir le fichier
@@ -20,18 +20,17 @@
#endif
#include "xml.h"
#include "cpuset.h"
#include <numa.h>
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
const char* topoLinkTypeStr[] = { "LOC", "XGMI", "PCI", "", "", "SYS", "NET" };
const char* topoPathTypeStr[] = { "LOC", "XGMI", "PIX", "PXB", "PHB", "SYS", "NET" };
const char* topoLinkTypeStr[] = { "LOC", "XGMI", "", "PCI", "", "", "SYS", "NET" };
const char* topoPathTypeStr[] = { "LOC", "XGMI", "NVB", "PIX", "PXB", "PHB", "SYS" };
#else
const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "", "", "SYS", "NET" };
const char* topoPathTypeStr[] = { "LOC", "NVL", "PIX", "PXB", "PHB", "SYS", "NET" };
const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "SYS", "NET" };
const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PHB", "SYS" };
#endif
/******************************************************************/
@@ -83,6 +82,9 @@ static ncclResult_t ncclTopoGetInterCpuWidth(struct ncclTopoNode* cpu, float* wi
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
*width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_WIDTH : QPI_WIDTH;
}
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
*width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_WIDTH : ZPI_WIDTH;
}
return ncclSuccess;
}
@@ -105,7 +107,7 @@ ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode*
ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
WARN("Error : tried to create too many nodes of type %d\n", type);
WARN("Error : tried to create too many nodes of type %d", type);
return ncclInternalError;
}
struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
@@ -226,7 +228,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
}
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f ===", s->maxWidth);
INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f totalWidth %2.1f ===", s->maxWidth, s->totalWidth);
char line[1024];
for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0));
INFO(NCCL_GRAPH, "==========================================");
@@ -380,7 +382,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
}
struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } };
struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { NULL, 0 } };
struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { "CentaurHauls", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { " Shanghai ", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { NULL, 0 } };
ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
int numaId;
@@ -403,6 +405,11 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
} else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
int familyId, modelId;
NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG;
}
if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) {
int familyId, modelId;
@@ -487,7 +494,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
if (gpu == NULL) {
WARN("Add NVLink error : could not find GPU %lx\n", pBusId);
WARN("Add NVLink error : could not find GPU %lx", pBusId);
return ncclInternalError;
}
int count;
@@ -515,7 +522,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
}
}
if (remote) {
int nvlSpeed = gpu->gpu.cudaCompCap == 60 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
float nvlSpeed = ncclTopoNVLinkSpeed(gpu->gpu.cudaCompCap);
NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed));
if (remote->type != GPU) {
NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed));
@@ -600,6 +607,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
struct ncclXmlNode* node;
NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
if (node == NULL) continue;
NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
NCCLCHECK(xmlSetAttrInt(node, "rank", r));
NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
}
@@ -614,6 +622,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
NCCLCHECK(collNetGetProperties(n, &props));
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
@@ -631,6 +640,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
NCCLCHECK(ncclNetGetProperties(n, &props));
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
@@ -639,6 +649,9 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
}
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
NCCLCHECK(ncclTopoTrimXml(xml));
xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
@@ -747,3 +760,21 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank) {
}
return ncclSuccess;
}
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) {
*count = system->nodes[NET].count;
return ncclSuccess;
}
ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax) {
if (system->nodes[GPU].count == 0) return ncclInternalError;
int min, max;
min = max = system->nodes[GPU].nodes[0].gpu.cudaCompCap;
for (int g=1; g<system->nodes[GPU].count; g++) {
min = std::min(min, system->nodes[GPU].nodes[g].gpu.cudaCompCap);
max = std::max(max, system->nodes[GPU].nodes[g].gpu.cudaCompCap);
}
if (ccMin) *ccMin = min;
if (ccMax) *ccMax = max;
return ncclSuccess;
}
+30 -13
Voir le fichier
@@ -13,15 +13,19 @@
#include <sched.h>
#define LOC_WIDTH 5000.0
#define PASCAL_NVLINK_WIDTH 18.0
#define VOLTA_NVLINK_WIDTH 21.0
#define SM60_NVLINK_WIDTH 18.0
#define SM70_NVLINK_WIDTH 21.0
#define SM80_NVLINK_WIDTH 21.0
#define SM86_NVLINK_WIDTH 12.0
#define PCI_WIDTH 12.0 // PCI Gen3 x16
#define QPI_WIDTH 6.0
#define SKL_QPI_WIDTH 9.0
#define ZPI_WIDTH 6.0
#define YONGFENG_ZPI_WIDTH 9.0
#define P9_WIDTH 32.0
#define ARM_WIDTH 6.0
#define NET_WIDTH 12.0 // 100Gbit
#define VEGA_XGMI_WIDTH 20.0
#define VEGA_XGMI_WIDTH 24.0
// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
// to GPU traffic consumes more PCI bandwidth.
@@ -40,20 +44,21 @@ extern const char* topoNodeTypeStr[];
// We want link types and path types to match as much as possible
#define LINK_LOC 0
#define LINK_NVL 1
#define LINK_PCI 2
// Skipping 3 for PATH_PXB
// Skipping 4 for PATH_PHB
#define LINK_SYS 5
#define LINK_NET 6
// Skipping 2 for PATH_NVB
#define LINK_PCI 3
// Skipping 4 for PATH_PXB
// Skipping 5 for PATH_PHB
#define LINK_SYS 6
#define LINK_NET 7
extern const char* topoLinkTypeStr[];
#define PATH_LOC 0
#define PATH_NVL 1
#define PATH_PIX 2
#define PATH_PXB 3
#define PATH_PHB 4
#define PATH_SYS 5
#define PATH_NET 6
#define PATH_NVB 2
#define PATH_PIX 3
#define PATH_PXB 4
#define PATH_PHB 5
#define PATH_SYS 6
extern const char* topoPathTypeStr[];
struct ncclTopoNode;
@@ -125,6 +130,7 @@ struct ncclTopoNodeSet {
struct ncclTopoSystem {
struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
float maxWidth;
float totalWidth;
int type;
};
@@ -141,6 +147,8 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels);
ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml);
ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax);
static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) {
*index = -1;
for (int i=0; i<system->nodes[type].count; i++) {
@@ -163,4 +171,13 @@ static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank,
return ncclInternalError;
}
// Returns NVLink speed in GB/s
static float ncclTopoNVLinkSpeed(int cudaCompCap) {
return
cudaCompCap == 86 ? SM86_NVLINK_WIDTH :
cudaCompCap >= 80 ? SM80_NVLINK_WIDTH :
cudaCompCap >= 70 ? SM70_NVLINK_WIDTH :
cudaCompCap >= 60 ? SM60_NVLINK_WIDTH :
SM80_NVLINK_WIDTH;
}
#endif
+28 -25
Voir le fichier
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -28,7 +28,7 @@
* / \ / \ / \ \
* 1 3 5 7 9 11 13
*/
ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) {
int up, down0, down1;
int bit;
for (bit=1; bit<nranks; bit<<=1) {
@@ -37,13 +37,16 @@ ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
if (rank == 0) {
*u = -1;
*d0 = nranks > 1 ? bit >> 1 : -1;
*d1 = -1;
*d0 = -1;
// Child rank is > 0 so it has to be our child 1, not 0.
*d1 = nranks > 1 ? bit >> 1 : -1;
return ncclSuccess;
}
up = (rank ^ bit) | (bit << 1);
// if smaller than the parent, we are his first child, otherwise we're his second
if (up >= nranks) up = (rank ^ bit);
*parentChildType = (rank < up) ? 0 : 1;
*u = up;
int lowbit = bit >> 1;
@@ -62,42 +65,42 @@ ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
}
/* Build a double binary tree. Take the previous tree for the first tree.
* For the second tree, we use a mirror tree (if nranks is odd)
* For the second tree, we use a mirror tree (if nranks is even)
*
* 8---------0---------5
* ______/ \______ _____/ \______
* 4 12 1 9
* / \ / \ / \
* 2 6 10 3 7 10
* / \ / \ / \ / \ / \ / \
* 1 3 5 7 9 11 2 4 6 8 11 12
* 0---------------8 3----------------11
* ______/ \ / \______
* 4 \ / 7
* / \ \ / / \
* 2 6 10 1 5 9
* / \ / \ / \ / \ / \ / \
* 1 3 5 7 9 11 0 2 4 6 8 10
*
* or shift it by one rank (if nranks is even)
* or shift it by one rank (if nranks is odd).
*
* 8---------0--------------9
* ______/ \ ______/ \
* 4 \ 5 \
* / \ \ / \ \
* 2 6 10 3 7 11
* / \ / \ / \ / \ / \ / \
* 1 3 5 7 9 11 2 4 6 8 10 1
* 0---------------8 1---------------9
* ______/ \______ ______/ \______
* 4 12 5 0
* / \ / / \ /
* 2 6 10 3 7 11
* / \ / \ / \ / \ / \ / \
* 1 3 5 7 9 11 2 4 6 8 10 12
*/
ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* s1, int* d1_0, int* d1_1) {
ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) {
// First tree ... use a btree
ncclGetBtree(nranks, rank, s0, d0_0, d0_1);
ncclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0);
// Second tree ... mirror or shift
if (nranks % 2 == 0) {
if (nranks % 2 == 1) {
// shift
int shiftrank = (rank-1+nranks) % nranks;
int u, d0, d1;
ncclGetBtree(nranks, shiftrank, &u, &d0, &d1);
ncclGetBtree(nranks, shiftrank, &u, &d0, &d1, parentChildType1);
*s1 = u == -1 ? -1 : (u+1) % nranks;
*d1_0 = d0 == -1 ? -1 : (d0+1) % nranks;
*d1_1 = d1 == -1 ? -1 : (d1+1) % nranks;
} else {
// mirror
int u, d0, d1;
ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1);
ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1, parentChildType1);
*s1 = u == -1 ? -1 : nranks-1-u;
*d1_0 = d0 == -1 ? -1 : nranks-1-d0;
*d1_1 = d1 == -1 ? -1 : nranks-1-d1;
+67 -21
Voir le fichier
@@ -71,45 +71,66 @@ static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
};
// LL128 max BW (per channel) for the different collectives
// ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce
static const double ll128MaxBwPerCh[NCCL_NUM_FUNCTIONS] = { 18.8, 12.0, 18.3, 15.2, 16.7 };
// ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce
static const double ll128MaxBwPerCh[NCCL_NUM_FUNCTIONS] = { 18.8, 12.0, 18.3, 15.2, 16.9 };
static const double llMaxBws[2][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0} };
static const double perChMaxTreeBws[2][3] = { /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 22.5, 16.0} };
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_MAX_NTHREADS;
int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
#else
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
#endif
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] =
getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
if (comm->nRanks <= 1) return ncclSuccess;
int nNodes = comm->nNodes;
int nRanks = comm->nRanks;
if (nRanks <= 1) return ncclSuccess;
int compCap80 = minCompCap == 80 && maxCompCap == 80 ? 1 : 0;
float ppn = (float)comm->nRanks / comm->nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
int cpuArch, cpuVendor, cpuModel;
NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
int index2 = nNodes <= 2 ? nNodes-1 : 2;
// LL: for single node, we look at GPU type; for multi-node, we look at CPU type
int index1 = nNodes == 1 ? compCap80 : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
double llMaxBw = llMaxBws[index1][index2];
double perChMaxTreeBw = perChMaxTreeBws[compCap80][index2];
float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph };
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = comm->nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
int nsteps = coll == ncclCollAllReduce ? 2*(comm->nRanks-1) :
coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nRanks-1 :
comm->nRanks;
int nInterSteps = coll == ncclCollAllReduce ? 2*(comm->nNodes-1) :
coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nNodes-1 :
comm->nNodes;
int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
nRanks;
int nInterSteps = coll == ncclFuncAllReduce ? 2*(nNodes-1) :
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 :
nNodes;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
if (coll != ncclCollAllReduce && a != NCCL_ALGO_RING) continue;
if (coll != ncclFuncAllReduce && a != NCCL_ALGO_RING) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
float speed = comm->nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
float speed = nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
float busBw = graphs[a]->nChannels * speed;
// Various model refinements
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/5.0;
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
double maxTreeBw = comm->nNodes > 2 ?
@@ -118,21 +139,29 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.27, comm->nNodes > 1 ? 70.0 : 90.0);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/2.3;
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (comm->nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels*7.0/9.0);
#else
if (compCap80) busBw = std::min(busBw, 235.0f);
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
#endif
if (a == NCCL_ALGO_COLLNET) busBw *= .9;
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/6.0; // Take into account that GDR read is disabled on both sides
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0; // CollNet does not support LL128
// Convert bus BW to algorithm BW
float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * comm->nRanks) / nsteps;
float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps;
comm->bandwidths[coll][a][p] = busBw * ratio;
comm->latencies[coll][a][p] = baseLat[a][p];
float intraLat = hwLat[intraHw[a]][a][p];
float interLat = hwLat[NCCL_HW_NET][a][p];
if (comm->nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
if (a == NCCL_ALGO_RING) {
float lat = hwLat[hw[a]][a][p];
if ((coll == ncclCollReduce || coll == ncclCollBroadcast)) {
if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
if (ringGraph->sameChannels) {
comm->latencies[coll][a][p] += lat;
} else {
@@ -144,10 +173,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
}
} else if (a == NCCL_ALGO_TREE) {
comm->latencies[coll][a][p] +=
2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat);
2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
} else {
comm->latencies[coll][a][p] +=
2 * (comm->nRanks/comm->nNodes-1) * intraLat + interLat;
2 * (nRanks/nNodes-1) * intraLat + interLat;
}
}
}
@@ -168,6 +197,15 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
}
// Disable CollNet if it is not supported
if (comm->collNetSupport == 0) {
algoEnable[NCCL_ALGO_COLLNET] = 0;
// If user has hard set NCCL_ALGO=COLLNET, ignore it
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0) {
algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1;
if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET");
}
}
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
int pEnable = protoEnable[p];
@@ -178,7 +216,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
}
if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
// Only disable algo for Allreduce since others only have one
if (c == ncclCollAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
if (c == ncclFuncAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
}
if (comm->rank == 0) {
@@ -214,7 +252,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD;
comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
}
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= comm->nRanks;
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks;
// Override defaults with user env
char* str = getenv("NCCL_THREAD_THRESHOLDS");
@@ -263,8 +301,16 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
*time = -1.0; return ncclSuccess;
}
int logSize = log2i(info->nBytes>>6);
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if (algorithm == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[protocol][logSize];
else if (algorithm == NCCL_ALGO_RING && logSize < 22) bw *= ringCorrectionFactor[protocol][logSize];
#else
if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
&& info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) lat *= 1.9; // Plateau effect of ring
#endif
*time = lat + (info->nBytes) / (1000 * bw);
return ncclSuccess;
}
+39 -17
Voir le fichier
@@ -71,7 +71,7 @@ ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) {
if (c == '=') {
ptr[o] = '\0';
if (value == NULL) {
WARN("XML Parse : Unexpected value with name %s\n", ptr);
WARN("XML Parse : Unexpected value with name %s", ptr);
return ncclInternalError;
}
return xmlGetValue(file, value, last);
@@ -137,7 +137,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
// Re-read the name, we got '/' in the first call
NCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
if (c != '>') {
WARN("XML Parse error : unexpected trailing %c in closing tag %s\n", c, node->name);
WARN("XML Parse error : unexpected trailing %c in closing tag %s", c, node->name);
return ncclInternalError;
}
return ncclSuccess;
@@ -150,7 +150,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
while (c == ' ') {
NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c));
if (a == MAX_ATTR_COUNT) {
INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)\n", MAX_ATTR_COUNT);
INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)", MAX_ATTR_COUNT);
// Actually we need to still consume the extra attributes so we have an extra one.
} else a++;
}
@@ -178,7 +178,7 @@ ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* hea
if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess;
while (1) {
if (xml->maxIndex == MAX_NODES) {
WARN("Error : XML parser is limited to 1024 nodes\n");
WARN("Error : XML parser is limited to 1024 nodes");
return ncclInternalError;
}
struct ncclXmlNode* node = xml->nodes+xml->maxIndex;
@@ -373,7 +373,7 @@ ncclResult_t ncclTopoSetAttrFromSys(struct ncclXmlNode* pciNode, const char* pat
char strValue[MAX_STR_LEN];
NCCLCHECK(ncclTopoGetStrFromSys(path, fileName, strValue));
if (strValue[0] != '\0') { NCCLCHECK(xmlSetAttr(pciNode, attrName, strValue)); }
TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s\n", path, fileName, attrName, strValue);
TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s", path, fileName, attrName, strValue);
return ncclSuccess;
}
@@ -579,7 +579,6 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
NCCLCHECK(xmlGetAttrIndex(gpuNode, "dev", &index));
if (index == -1) {
if (nvmlDev == NULL) {
//WARN("No NVML, trying to use CUDA instead");
const char* busId;
NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
if (busId == NULL || hipDeviceGetByPCIBusId(&dev, busId) != hipSuccess) dev = -1;
@@ -662,7 +661,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : 12;
if (maxNvLinks > 0 && nvmlDev == NULL) {
WARN("No NVML device handle. Skipping nvlink detection.\n");
WARN("No NVML device handle. Skipping nvlink detection.");
maxNvLinks = 0;
}
@@ -721,6 +720,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
char* path;
NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass"));
free(path);
}
}
}
@@ -732,10 +732,14 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl
struct ncclXmlNode* node;
NCCLCHECK(ncclTopoGetPciNode(xml, busId, &node));
NCCLCHECK(ncclTopoGetXmlFromSys(node, xml));
NCCLCHECK(wrapNvmlSymbols());
NCCLCHECK(wrapNvmlInit());
nvmlDevice_t nvmlDev;
if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
nvmlDevice_t nvmlDev = NULL;
static int nvmlInit = 0;
if (nvmlInit == 0) {
nvmlInit = (wrapNvmlSymbols() != ncclSuccess || wrapNvmlInit() != ncclSuccess) ? 2 : 1;
}
if (nvmlInit == 1) {
if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
}
NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode));
return ncclSuccess;
}
@@ -778,12 +782,8 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--);
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
strcpy(busId, pciSysPath+offset+1);
NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", busId));
if (parent == NULL) {
NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent));
NCCLCHECK(xmlSetAttr(parent, "busid", busId));
NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
}
NCCLCHECK(ncclTopoGetPciNode(xml, busId, &parent));
NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
} else {
// Virtual NIC, no PCI device, attach to first CPU
NCCLCHECK(xmlFindTag(xml, "cpu", &parent));
@@ -802,6 +802,28 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
return ncclSuccess;
}
ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node) {
const char* str;
NCCLCHECK(xmlGetAttr(node, "keep", &str));
if (str && strcmp(str, "1") == 0) {
NCCLCHECK(xmlUnsetAttr(node, "keep"));
} else {
// Copy nSubs and subs as they could change as we trim recursively.
struct ncclXmlNode* subs[MAX_SUBS];
int nSubs = node->nSubs;
memcpy(subs, node->subs, node->nSubs*sizeof(struct ncclXmlNode*));
for (int s=0; s<nSubs; s++) {
NCCLCHECK(ncclTopoTrimXmlRec(subs[s]));
}
if (node->nSubs == 0) NCCLCHECK(xmlRemoveNode(node));
}
return ncclSuccess;
}
ncclResult_t ncclTopoTrimXml(struct ncclXml* xml) {
NCCLCHECK(ncclTopoTrimXmlRec(xml->nodes));
return ncclSuccess;
}
/**************************************************/
/* Parser rules for the user-defined graph search */
/**************************************************/
+40 -5
Voir le fichier
@@ -8,7 +8,7 @@
#define XML_H_
// A few constraints to make the implementation easy
#define MAX_STR_LEN 256
#define MAX_STR_LEN 255
#define MAX_ATTR_COUNT 16
#define MAX_SUBS 32
#define MAX_NODES 1024
@@ -19,10 +19,10 @@
#define NODE_TYPE_SINGLE 3
struct ncclXmlNode {
char name[MAX_STR_LEN];
char name[MAX_STR_LEN+1];
struct {
char key[MAX_STR_LEN];
char value[MAX_STR_LEN];
char key[MAX_STR_LEN+1];
char value[MAX_STR_LEN+1];
} attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params
int nAttrs;
int type;
@@ -47,6 +47,9 @@ ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXm
ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode);
ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode);
/* Remove unneeded parts */
ncclResult_t ncclTopoTrimXml(struct ncclXml* xml);
/**************/
/* XML Struct */
/* Functions */
@@ -56,7 +59,7 @@ static ncclResult_t xmlGetAttrIndex(struct ncclXmlNode* node, const char* attrNa
*index = -1;
const int nAttrs = node->nAttrs;
for (int a=0; a<nAttrs; a++) {
if (strncmp(node->attrs[a].key, attrName, MAX_STR_LEN-1) == 0) {
if (strncmp(node->attrs[a].key, attrName, MAX_STR_LEN) == 0) {
*index = a;
return ncclSuccess;
}
@@ -127,8 +130,10 @@ static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, c
if (index == -1) {
index = node->nAttrs++;
strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
node->attrs[index].key[MAX_STR_LEN] = '\0';
}
strncpy(node->attrs[index].value, value, MAX_STR_LEN);
node->attrs[index].value[MAX_STR_LEN] = '\0';
return ncclSuccess;
}
@@ -138,8 +143,10 @@ static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* attrName
if (index == -1) {
index = node->nAttrs++;
strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
node->attrs[index].key[MAX_STR_LEN] = '\0';
}
snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
node->attrs[index].value[MAX_STR_LEN] = '\0';
return ncclSuccess;
}
@@ -149,8 +156,22 @@ static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrNa
if (index == -1) {
index = node->nAttrs++;
strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
node->attrs[index].key[MAX_STR_LEN] = '\0';
}
snprintf(node->attrs[index].value, MAX_STR_LEN, "%g", value);
node->attrs[index].value[MAX_STR_LEN] = '\0';
return ncclSuccess;
}
static ncclResult_t xmlUnsetAttr(struct ncclXmlNode* node, const char* attrName) {
int index;
NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
if (index == -1) return ncclSuccess;
for (int i=index+1; i<node->nAttrs; i++) {
strcpy(node->attrs[i-1].key, node->attrs[i].key);
strcpy(node->attrs[i-1].value, node->attrs[i].value);
}
node->nAttrs--;
return ncclSuccess;
}
@@ -199,6 +220,20 @@ static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent,
s->parent = parent;
if (parent) parent->subs[parent->nSubs++] = s;
strncpy(s->name, subName, MAX_STR_LEN);
s->name[MAX_STR_LEN] = '\0';
return ncclSuccess;
}
static ncclResult_t xmlRemoveNode(struct ncclXmlNode* node) {
node->type = NODE_TYPE_NONE;
struct ncclXmlNode* parent = node->parent;
if (parent == NULL) return ncclSuccess;
int shift = 0;
for (int s=0; s<parent->nSubs; s++) {
if (parent->subs[s] == node) shift = 1;
else if (shift) parent->subs[s-1] = parent->subs[s];
}
parent->nSubs--;
return ncclSuccess;
}
+120 -80
Voir le fichier
@@ -35,7 +35,6 @@ struct ncclInitArgs {
};
struct ncclCollArgs {
ncclComm_t comm;
int connect;
};
enum ncclAsyncFuncType {
@@ -110,6 +109,7 @@ ncclResult_t ncclAsyncColl(ncclComm_t comm) {
NCCL_API(ncclResult_t, ncclGroupStart);
ncclResult_t ncclGroupStart() {
NVTX3_FUNC_RANGE_IN(nccl_domain);
if (ncclGroupMode == 0) {
memset(ncclGroupArgs, 0, sizeof(struct ncclAsyncArgs)*MAX_ASYNC_OPS);
}
@@ -118,7 +118,7 @@ ncclResult_t ncclGroupStart() {
}
static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes, void* recvbuff, ssize_t sendbytes, const void* sendbuff) {
struct ncclInfo info = { ncclCollSendRecv, "SendRecv",
struct ncclInfo info = { ncclFuncSendRecv, "SendRecv",
sendbuff, recvbuff, (size_t)std::max<ssize_t>(sendbytes,recvbytes), ncclInt8, ncclSum, -1, comm, comm->userStream, /* Args */
1, 1 };
info.delta = delta;
@@ -126,26 +126,32 @@ static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int chann
info.sendbytes = sendbytes;
info.recvbytes = recvbytes;
if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage;
NCCLCHECK(ncclSaveKernel(&info));
NCCLCHECK(ncclSaveP2pKernel(&info));
return ncclSuccess;
}
void* ncclAsyncThreadPreconnect(void* args_) {
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
CUDACHECKTHREAD(hipSetDevice(args->coll.comm->cudaDev));
for (int c=0; c<args->coll.comm->p2pnChannels; c++) {
struct ncclComm* comm = args->coll.comm;
struct ncclChannel* channel = comm->channels+c;
struct ncclP2PConnect* connect = &comm->p2plist.connect;
NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, channel, connect->nrecv[c], connect->recv+c*comm->nRanks, connect->nsend[c], connect->send+c*comm->nRanks));
connect->nrecv[c] = 0;
connect->nsend[c] = 0;
}
struct ncclComm* comm = args->coll.comm;
CUDACHECKTHREAD(hipSetDevice(comm->cudaDev));
NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL));
return args;
}
static size_t getP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) {
size_t size = std::max(minSize, DIVUP(totalSize, minChannels));
int nChannels = minChannels;
while (size > maxSize && nChannels <= maxChannels/2) {
nChannels *= 2;
size = DIVUP(totalSize, nChannels);
}
ALIGN_SIZE(size, minSize);
return size;
}
NCCL_API(ncclResult_t, ncclGroupEnd);
ncclResult_t ncclGroupEnd() {
NVTX3_FUNC_RANGE_IN(nccl_domain);
if (ncclGroupMode == 0) {
WARN("ncclGroupEnd: not in a group call.");
return ncclInvalidUsage;
@@ -186,29 +192,21 @@ ncclResult_t ncclGroupEnd() {
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
struct ncclP2Plist* p2plist = &args->coll.comm->p2plist;
if (p2plist->count != 0) {
struct ncclComm* comm = args->coll.comm;
args->coll.connect = 0;
for (int c=0; c<comm->p2pnChannels; c++)
args->coll.connect += comm->p2plist.connect.nsend[c] + comm->p2plist.connect.nrecv[c];
if (args->coll.connect) {
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
}
}
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
}
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL && (args->coll.connect)) {
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
int err = pthread_join(ncclGroupThreads[i], NULL);
if (err != 0) {
WARN("Error waiting for pthread_join : %s\n", strerror(errno));
WARN("Error waiting for pthread_join : %s", strerror(errno));
return ncclSystemError;
}
NCCLCHECKGOTO(args->ret, ret, end);
args->coll.comm->connect = 0;
}
}
@@ -218,56 +216,102 @@ ncclResult_t ncclGroupEnd() {
struct ncclComm* comm = args->coll.comm;
int rank = comm->rank;
int nRanks = comm->nRanks;
struct ncclP2Plist* p2plist = &args->coll.comm->p2plist;
if (p2plist->count) {
for (int delta=0; delta<nRanks; delta++) {
struct ncclP2Plist* p2pSends = comm->p2pSends;
struct ncclP2Plist* p2pRecvs = comm->p2pRecvs;
// Compute how much to split operations
// Natural step size matching buffer steps.
ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
// Try to use all channels
int nChannelsMax = comm->p2pnChannelsPerPeer;
int nChannelsMin = nChannelsMax;
// Try to use all channels, but one channel per operation.
while (nChannelsMin*comm->nRanks > std::max(comm->nChannels, comm->p2pnChannels) && nChannelsMin > 1) nChannelsMin /= 2;
// Avoid overloading channels with 8+ operations as we loose the sync warp, hence a bit of bandwidth.
while (nChannelsMax*comm->nRanks > std::max(comm->nChannels, comm->p2pnChannels)*4 && nChannelsMax > 1) nChannelsMax /= 2;
while (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
// schedule delta 0, +1, -1, +2, -2, ...
// also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
for (int d=0; d<=nRanks/4; d++) {
int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, (nRanks-(nRanks/2-d))%nRanks };
int index = 0;
int delta = deltas[index];
sched_delta:
uint32_t from = (rank+nRanks-delta)%nRanks;
uint32_t to = (rank+delta)%nRanks;
struct ncclP2Pinfo* recv = p2pRecvs[from].head;
struct ncclP2Pinfo* send = p2pSends[to].head;
if (recv != NULL || send != NULL) {
ssize_t totRecvBytes = -1, totSendBytes = -1;
if (recv != NULL) totRecvBytes = recv->nbytes;
if (send != NULL) totSendBytes = send->nbytes;
ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
// Compute how much to split operations
// Natural step size matching buffer steps.
ssize_t stepSize = 4*comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
// Split each operation on p2pnChannelsPerPeer max.
ssize_t recvChunkSize = DIVUP(p2plist->peerlist[from].recvbytes, comm->p2pnChannelsPerPeer);
ssize_t sendChunkSize = DIVUP(p2plist->peerlist[to].sendbytes, comm->p2pnChannelsPerPeer);
recvChunkSize = std::max((ssize_t)1, DIVUP(recvChunkSize, stepSize)) * stepSize;
sendChunkSize = std::max((ssize_t)1, DIVUP(sendChunkSize, stepSize)) * stepSize;
ssize_t sendOffset = 0;
ssize_t recvOffset = 0;
int remaining = 1;
int chunk = 0;
while (remaining) {
int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
remaining = 0;
ssize_t recvbytes = p2plist->peerlist[from].recvbytes-recvOffset;
ssize_t sendbytes = p2plist->peerlist[to].sendbytes-sendOffset;
if (recvbytes > recvChunkSize) { remaining = 1; recvbytes = recvChunkSize; } else p2plist->peerlist[from].recvbytes = -1;
if (sendbytes > sendChunkSize) { remaining = 1; sendbytes = sendChunkSize; } else p2plist->peerlist[to].sendbytes = -1;
if (sendbytes >= 0 || recvbytes >= 0) {
NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
recvbytes, ((char*)(p2plist->peerlist[from].recvbuff)) + recvOffset,
sendbytes, ((const char*)(p2plist->peerlist[to].sendbuff)) + sendOffset), ret, end);
ssize_t sendOffset = 0;
ssize_t recvOffset = 0;
int sendRemaining = 1, recvRemaining = 1;
int chunk = 0;
do {
int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
ssize_t recvbytes = totRecvBytes-recvOffset;
ssize_t sendbytes = totSendBytes-sendOffset;
if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
// 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
// (total size == 0), otherwise set size to -1 so that the kernel skips the operation.
if (sendbytes == 0 && totSendBytes != 0) sendbytes = -1;
if (recvbytes == 0 && totRecvBytes != 0) recvbytes = -1;
if (sendbytes >= 0 || recvbytes >= 0) {
NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
sendbytes, send ? ((const char*)(send->buff)) + sendOffset : NULL), ret, group_cleanup);
}
recvOffset += recvChunkSize;
sendOffset += sendChunkSize;
chunk++;
} while (sendRemaining || recvRemaining);
if (recv) {
NCCLCHECKGOTO(dequeueP2pInfo(p2pRecvs+from), ret, group_cleanup);
comm->p2pRecvCount--;
}
recvOffset += recvChunkSize;
sendOffset += sendChunkSize;
chunk++;
if (send) {
NCCLCHECKGOTO(dequeueP2pInfo(p2pSends+to), ret, group_cleanup);
comm->p2pSendCount--;
}
}
index++;
if (index == 1 && deltas[1] == deltas[0]) index++;
if (index == 2 && deltas[2] == deltas[0]) index++;
if (index == 3 && deltas[3] == deltas[2]) index++;
if (index == 3 && deltas[3] == deltas[1]) index++;
if (index < 4) {
delta = deltas[index];
goto sched_delta;
}
}
p2plist->count = 0;
}
}
}
/* Collectives are done in three steps :
* 0. Save kernels previously enqueued. Compute channel, algo, proto, etc.
* 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative]
* 2. Barrier Wait. No CUDA call is permitted
* 3. Enqueue Events. CUDA event wait/enqueue.
* This is needed because step 2 cannot call any CUDA primitive, otherwise if
* hipFree happens between 1 and 3, it could block that CUDA call and
* cudaFree happens between 1 and 3, it could block that CUDA call and
* prevent some ranks from launching their network threads, which would
* prevent the NCCL call from completing, blocking the hipFree call.
* prevent the NCCL call from completing, blocking the cudaFree call.
*/
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
ncclComm_t comm = args->coll.comm;
NCCLCHECKGOTO(ncclSaveCommKernels(comm), ret, group_cleanup);
}
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
@@ -304,32 +348,28 @@ group_cleanup:
*args->init.newcomm = NULL;
} else {
struct ncclComm* comm = args->coll.comm;
for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) {
struct ncclChannel* channel = comm->channels+c;
for (int i=0; i<channel->collCount; i++) {
channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
// Reset aggregation counters
comm->asyncOpCount = 0;
comm->asyncTotalSize = 0;
// Dequeue p2p lists
if (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
struct ncclP2Plist* p2pSends = comm->p2pSends;
struct ncclP2Plist* p2pRecvs = comm->p2pRecvs;
for (int peer=0; peer<comm->nRanks; peer++) {
while (p2pSends[peer].head != NULL) dequeueP2pInfo(p2pSends+peer);
while (p2pRecvs[peer].head != NULL) dequeueP2pInfo(p2pRecvs+peer);
}
channel->collFifoTail = channel->collStart;
channel->collCount = 0;
comm->p2pSendCount = comm->p2pRecvCount = 0;
}
/* Cancel all proxy ops : mark them as ncclProxyOpNone and they should be freed later on */
/* Free all proxy ops in state->nextOps */
struct ncclProxyState* state = &comm->proxyState;
struct ncclProxyArgs *op, *start;
pthread_mutex_lock(&state->mutex);
op = start = state->ops;
while (op) {
if (op->opCount >= comm->lastOpCount) op->state = ncclProxyOpNone;
struct ncclProxyArgs* peerOp = op->nextPeer;
while (peerOp) {
if (peerOp->opCount >= comm->lastOpCount) peerOp->state = ncclProxyOpNone;
peerOp = peerOp->nextPeer;
}
op = op->next;
if (op == start) break;
pthread_mutex_lock(&state->poolMutex);
for (struct ncclProxyArgs *op = state->nextOps; op; op = op->next) {
op->next = state->pool;
state->pool = op;
}
comm->opCount = comm->lastOpCount;
pthread_cond_signal(&state->cond);
pthread_mutex_unlock(&state->mutex);
pthread_mutex_unlock(&state->poolMutex);
state->nextOps = NULL;
comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
comm->userStreamSet = false;
+3 -1
Voir le fichier
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -16,6 +16,8 @@ ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commSt
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr);
ncclResult_t bootstrapRemFree(int id, int rank, void* commState);
ncclResult_t bootstrapClose(void* commState);
ncclResult_t bootstrapAbort(void* commState);
#endif
+1 -1
Voir le fichier
@@ -24,7 +24,7 @@ static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, voi
static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; }
static ncclResult_t collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
static ncclResult_t collNetFlush(void* collComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclCollNet->flush(collComm, data, size, mhandle)); return ncclSuccess; }
static ncclResult_t collNetIflush(void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; }
static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; }
static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
+51 -48
Voir le fichier
@@ -8,63 +8,67 @@
#ifndef NCCL_COLLECTIVES_H_
#define NCCL_COLLECTIVES_H_
#define FUNC_INDEX_P2P (4+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps)
#define FUNC_INDEX(coll, redop, dtype, al, pr) ((coll >= NCCL_NUM_FUNCTIONS) \
? (coll-NCCL_NUM_FUNCTIONS+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps) \
: ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)))
#define FUNC_INDEX_P2P (NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps)
#define FUNC_INDEX_A2A (FUNC_INDEX_P2P+1)
#define FUNC_INDEX_A2AV (FUNC_INDEX_P2P+2)
#define NCCL_COLL_NAME(coll, op, dtype) \
coll##_##op##_##dtype
#define FUNC_INDEX(func, redop, ncclType, al, pr) ((func >= NCCL_NUM_FUNCTIONS) \
? (func-NCCL_NUM_FUNCTIONS+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps) \
: ((((((func)*ncclNumOps + (redop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)))
#define NCCL_KERN_NAME(coll, op, dtype) \
coll##Kernel_##op##_##dtype
#define NCCL_FUNC_NAME(func, algo, proto, redop, type) \
ncclFunction_##func##_##algo##_##proto##_##redop##_##type
#define NCCL_KERN_NAME(func, algo, proto, redop, type) \
ncclKernel_##func##_##algo##_##proto##_##redop##_##type
#define NCCL_IMPL_NAME(func, algo, proto) \
nccl##func##algo##proto
/* Declare all collective operations */
#define DECL_COLL5(coll, op, dtype) \
extern __device__ __attribute__((noinline)) void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclDevComm* comm); \
#define DECL5(func, algo, proto, redop, type) \
extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, redop, type)(struct ncclWorkElem* args); \
extern __global__ void NCCL_KERN_NAME(func, algo, proto, redop, type)(struct ncclWorkElem first); \
#define DECL_COLL4(coll, op, dtype) \
DECL_COLL5(coll, op, dtype) \
DECL_COLL5(coll##LL, op, dtype) \
DECL_COLL5(coll##LL128, op, dtype)
#define DECL4(func, algo, redop, type) \
DECL5(func, algo, SIMPLE, redop, type) \
DECL5(func, algo, LL, redop, type) \
DECL5(func, algo, LL128, redop, type)
#define DECL_COLL3(coll, op, dtype) \
DECL_COLL4(coll##Ring, op, dtype) \
DECL_COLL4(coll##Tree, op, dtype) \
DECL_COLL4(coll##CollNet, op, dtype)
#define DECL3(func, redop, type) \
DECL4(func, RING, redop, type) \
DECL4(func, TREE, redop, type) \
DECL4(func, COLLNET, redop, type)
#define DECL_COLL2(coll, op) \
DECL_COLL3(coll, op, i8) \
DECL_COLL3(coll, op, u8) \
DECL_COLL3(coll, op, i32) \
DECL_COLL3(coll, op, u32) \
DECL_COLL3(coll, op, i64) \
DECL_COLL3(coll, op, u64) \
DECL_COLL3(coll, op, f16) \
DECL_COLL3(coll, op, f32) \
DECL_COLL3(coll, op, f64) \
DECL_COLL3(coll, op, b16)
#define DECL2(func, redop) \
DECL3(func, redop, int8_t) \
DECL3(func, redop, uint8_t) \
DECL3(func, redop, int32_t) \
DECL3(func, redop, uint32_t) \
DECL3(func, redop, int64_t) \
DECL3(func, redop, uint64_t) \
DECL3(func, redop, half) \
DECL3(func, redop, float) \
DECL3(func, redop, double) \
DECL3(func, redop, rccl_bfloat16)
#define DECL_COLL(coll) \
DECL_COLL2(coll, sum) \
DECL_COLL2(coll, prod) \
DECL_COLL2(coll, min) \
DECL_COLL2(coll, max)
#define DECL(func) \
DECL2(func, Sum) \
DECL2(func, Prod) \
DECL2(func, Min) \
DECL2(func, Max)
#define DECL_ALL_COLLS \
DECL_COLL2(ncclBroadcast, copy) \
DECL_COLL(ncclReduce) \
DECL_COLL2(ncclAllGather, copy) \
DECL_COLL(ncclReduceScatter) \
DECL_COLL(ncclAllReduce) \
DECL_COLL5(ncclGather, copy, i8) \
DECL_COLL5(ncclScatter, copy, i8) \
DECL_COLL5(ncclAllToAll, copy, i8) \
DECL_COLL5(ncclAllToAllv, copy, i8) \
DECL_COLL5(ncclSendRecv, copy, i8) \
#define DECL_ALL \
DECL2(Broadcast, Sum) \
DECL(Reduce) \
DECL2(AllGather, Sum) \
DECL(ReduceScatter) \
DECL(AllReduce) \
DECL5(SendRecv, RING, SIMPLE, Sum, int8_t) \
DECL5(AllToAll, RING, SIMPLE, Sum, int8_t) \
DECL5(AllToAllv, RING, SIMPLE, Sum, int8_t) \
DECL_ALL_COLLS
DECL_ALL
// CHUNKSIZE must be a multiple of SLICESIZE
//#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
@@ -92,5 +96,4 @@ DECL_ALL_COLLS
#define ALLTOALL_CHUNKSTEPS 4
#define ALLTOALLV_SLICESTEPS 4
#define ALLTOALLV_CHUNKSTEPS 4
#endif
+17 -4
Voir le fichier
@@ -55,8 +55,8 @@ struct ncclRecvMem {
struct {
uint64_t tail;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
int sizesFifo[NCCL_STEPS];
void* ptrsFifo[NCCL_STEPS];
};
char pad4[MEM_ALIGN];
};
@@ -70,6 +70,10 @@ struct ncclComm {
struct ncclTopoSystem* topo;
void* bootstrap;
// Bitmasks for ncclTransportP2pSetup
int connect;
uint32_t* connectSend;
uint32_t* connectRecv;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
@@ -134,8 +138,8 @@ struct ncclComm {
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclDevComm* args;
struct ncclDevComm** argsptr;
struct ncclWorkElem args;
void* argsptr;
// Global proxy thread
pthread_t proxyThread;
@@ -143,8 +147,17 @@ struct ncclComm {
// Whether this communicator uses collNet
int collNetSupport;
// Store info of async operations
struct ncclInfo* asyncOps;
int asyncOpCount;
size_t asyncTotalSize;
//list of async p2p operation queued in a group semantics
struct ncclP2Plist p2plist;
struct ncclP2Plist* p2pSends;
struct ncclP2Plist* p2pRecvs;
int p2pSendCount;
int p2pRecvCount;
// [RCCL]
bool alltoallDisable; // RCCL AllToAll/Scatter/Gather API
+1
Voir le fichier
@@ -57,5 +57,6 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
#include "alloc.h"
#include "utils.h"
#include "param.h"
#include "nvtx_stub.h"
#endif // end include guard
+2 -2
Voir le fichier
@@ -19,7 +19,7 @@ static int hexToInt(char c) {
#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
uint32_t cpumasks[CPU_SET_N_U32];
int m = CPU_SET_N_U32-1;
cpumasks[m] = 0;
@@ -42,7 +42,7 @@ ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
return ncclSuccess;
}
ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
int c = 0;
uint8_t* m8 = (uint8_t*)mask;
for (int o=sizeof(cpu_set_t)-1; o>=0; o--) {
+1 -1
Voir le fichier
@@ -25,7 +25,7 @@ extern pthread_mutex_t ncclDebugOutputLock;
extern FILE *ncclDebugFile;
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
// Let code temporarily downgrade WARN into INFO
extern thread_local int ncclDebugNoWarn;
+62 -64
Voir le fichier
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -27,8 +27,8 @@
#define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollGather, ncclCollScatter, ncclCollAllToAll, ncclCollAllToAllv, ncclCollSendRecv} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+4];
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncAllToAll, ncclFuncAllToAllv } ncclFunc_t;
extern const char* ncclFuncStr[];
#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
#define NCCL_ALGO_TREE 0
@@ -64,6 +64,7 @@ union ncclLLFifoLine {
#define WARP_SIZE 64
#define MAXCHANNELS 32
#define NCCL_MAX_NTHREADS 256
#define NCCL_SIMPLE_MAX_NTHREADS NCCL_MAX_NTHREADS
#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
#define NCCL_LL_LINES_PER_THREAD 8
#ifdef TEST_LL_CLEANUP
@@ -77,7 +78,7 @@ union ncclLLFifoLine {
// Make sure the clean mask will last for at least NCCL_NSTEPS
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
#define NCCL_LL128_LINESIZE 64
#define NCCL_LL128_LINESIZE 128
#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)
@@ -88,15 +89,12 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
// to 3 dests. Use 70% for reduce and 30% for bcast.
#define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32)
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 2
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
#define NCCL_DIRECT_GPU 0x01
#define NCCL_DIRECT_NIC 0x10
#define MAXBARRIERS 2
#define MAXWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
struct ncclConnInfo {
// Regular comm mechanism
char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
@@ -104,9 +102,11 @@ struct ncclConnInfo {
uint64_t *head; // Local for send, remote for recv
int direct; // Direct communication
int shared; // Buffers are shared
void **ptrExchange; // Pointer exchange for direct communication
int *fifo; // Size fifo for proxy
int *sizesFifo; // Sizes fifo from GPU to proxy
void* *ptrsFifo; // Buffer fifo from proxy to GPU
uint64_t step; // Keep where we are
uint64_t llLastCleaning;
@@ -115,7 +115,7 @@ struct ncclConnInfo {
// allows software to explicitly initiate a flush read to HDP memory. See more
// descriptions in primitives.h.
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
uint32_t* curr_hdp_reg; // Current GPU's HDP register
};
struct ncclConnector {
@@ -156,79 +156,69 @@ struct ncclDevComm;
#pragma pack(push) /* push current alignment to stack */
#pragma pack(4) /* set alignment to 4 bytes boundary */
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclColl. */
struct CollectiveArgs {
struct ncclDevComm* comm;
uint64_t opCount;
#define NCCL_MAX_WORK_ELEMENTS 2
#define NCCL_MAX_GROUPS (NCCL_MAX_WORK_ELEMENTS*2)
/* ncclWork is to be a power of two, currently 8x64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclWorkElem. */
struct ncclWorkElem {
// Header
struct ncclDevComm* comm;
uint16_t nThreads;
uint16_t funcIndex;
uint16_t index;
uint16_t active;
// local and remote input, output, and buffer
const void * sendbuff;
void * recvbuff;
// Op-specific fields. Make sure the common part stays the
// same on all structs of the union
uint64_t opCount;
// Op-specific fields.
union {
struct {
uint16_t nThreads;
} common;
struct {
uint16_t nThreads;
uint8_t bid;
uint8_t nChannels;
uint32_t root;
size_t count;
size_t lastChunkSize;
uint32_t root;
uint8_t bid;
uint8_t nChannels;
} coll;
struct {
uint16_t nThreads;
uint16_t unused;
int32_t delta;
size_t sendCount;
size_t recvCount;
int32_t delta;
uint16_t nThreads;
} p2p;
struct {
uint16_t nThreads;
size_t count;
uint8_t bid;
uint8_t nChannels;
size_t count;
size_t* extra;
} a2av;
// [RCCL] Clique-based arguments
// NOTE: Follows same field structure as coll
// because nChannels is accessed from "coll" struct.
struct {
uint16_t nThreads;
size_t count;
cliqueDevicePtrs_t* ptrs;
uint32_t unused;
uint8_t bid;
uint8_t nChannels;
size_t count;
int verbose;
cliqueDevicePtrs_t* ptrs;
} clique;
// [/RCCL]
uint64_t align[3];
};
};
struct ncclColl {
union {
struct {
struct CollectiveArgs args;
uint16_t funcIndex;
uint16_t nextIndex;
uint8_t active;
};
int data[0x10];
};
struct ncclWork {
struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
};
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
static_assert(sizeof(struct ncclWorkElem) == (0x10*sizeof(int)), "ncclWorkElem must have a pow2 size");
struct ncclChannel {
union {
struct {
struct ncclRing ring;
struct ncclTree treeUp;
struct ncclTree treeDn;
struct ncclTree collTreeUp;
struct ncclTree collTreeDn;
struct ncclTree tree;
struct ncclTree collTree;
int id;
@@ -237,16 +227,11 @@ struct ncclChannel {
struct ncclPeer* devPeers;
// Operation list for aggregation
struct ncclColl* collectives;
size_t* collectivesExtra;
int collStart;
int collCount;
int collFifoHead; // Only used by GPU
int collFifoTail; // Only used by CPU
struct ncclWork* workFifo;
int workCount;
uint64_t workFifoTail; // Only used by CPU
size_t* a2avParams;
uint32_t* sync;
uint64_t* barrier;
uint64_t* barrier_next;
#ifdef ENABLE_PROFILING
struct timeval tvs;
uint64_t sizes;
@@ -304,9 +289,11 @@ struct ncclProf {
#ifdef ENABLE_COLLTRACE
typedef enum {
ncclCollTraceNotReady,
ncclCollTraceKernelLaunchType,
ncclCollTraceCollEndType,
ncclCollTraceAbortType
ncclCollTraceAbortType,
ncclCollTraceDataType
} ncclCollTraceDataType_t;
struct ncclCollTrace {
@@ -316,11 +303,22 @@ struct ncclCollTrace {
uint32_t data_0;
uint64_t timeStamp;
uint64_t opCount;
uint64_t data_1;
union {
uint64_t data_1;
struct {
uint16_t nThreads;
uint8_t bid;
uint8_t nChannels;
} coll;
struct {
uint16_t nThreads;
uint16_t delta;
} p2p;
};
};
static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must have a pow2 size");
#define COLLTRACE_NUM_ITEMS 1024
#define COLLTRACE_NUM_ITEMS 8192
#endif
struct ncclDevComm {
+2
Voir le fichier
@@ -19,5 +19,7 @@ ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm);
ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm);
ncclResult_t ncclEnqueueEvents(struct ncclComm* comm);
ncclResult_t ncclSaveKernel(struct ncclInfo* info);
ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info);
ncclResult_t ncclSaveCommKernels(struct ncclComm* comm);
#endif // End include guard
+11 -9
Voir le fichier
@@ -29,7 +29,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
// Query topology
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* net);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
// Set CPU affinity
@@ -40,20 +40,23 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
#define NCCL_TOPO_CPU_ARCH_ARM 3
#define NCCL_TOPO_CPU_VENDOR_INTEL 1
#define NCCL_TOPO_CPU_VENDOR_AMD 2
#define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3
#define NCCL_TOPO_CPU_TYPE_BDW 1
#define NCCL_TOPO_CPU_TYPE_SKL 2
#define NCCL_TOPO_CPU_TYPE_ZEN 3
#define NCCL_TOPO_CPU_TYPE_ROME 4
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
#define NCCL_TOPO_MAX_NODES 256
// Init search. Needs to be done before calling ncclTopoCompute
ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
#define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Split tree (send/recv from different ranks) flowing in both directions
#define NCCL_TOPO_PATTERN_TREE 3 // Simple tree (send/recv from same rank) flowing in both directions
#define NCCL_TOPO_PATTERN_BALANCED_TREE 1 // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU)
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU)
#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU
#define NCCL_TOPO_PATTERN_RING 4 // Ring
struct ncclTopoGraph {
// Input / output
@@ -84,17 +87,16 @@ struct ncclTopoRanks {
int ringSend[MAXCHANNELS];
int ringPrev[MAXCHANNELS];
int ringNext[MAXCHANNELS];
int treeUpRecv[MAXCHANNELS];
int treeUpSend[MAXCHANNELS];
int treeDnRecv[MAXCHANNELS];
int treeDnSend[MAXCHANNELS];
int treeToParent[MAXCHANNELS];
int treeToChild0[MAXCHANNELS];
int treeToChild1[MAXCHANNELS];
};
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets);
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
+15 -12
Voir le fichier
@@ -15,6 +15,9 @@
#define NCCL_PTR_HOST 0x1
#define NCCL_PTR_CUDA 0x2
// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 8
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALL=~0} ncclDebugLogSubSys;
@@ -29,9 +32,9 @@ typedef struct {
int speed; // Port speed in Mbps.
int port; // Port number.
int maxComms; // Maximum number of comms we can create
}ncclNetProperties_v3_t;
}ncclNetProperties_v4_t;
typedef ncclNetProperties_v3_t ncclNetProperties_t;
typedef ncclNetProperties_v4_t ncclNetProperties_t;
typedef struct {
// Name of the network (mainly for logs)
@@ -41,7 +44,7 @@ typedef struct {
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
@@ -62,7 +65,7 @@ typedef struct {
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
@@ -70,11 +73,11 @@ typedef struct {
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v3_t;
} ncclNet_v4_t;
typedef ncclNet_v3_t ncclNet_t;
typedef ncclNet_v4_t ncclNet_t;
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v3
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v4
typedef struct {
// Name of the collective network (mainly for logs)
@@ -85,7 +88,7 @@ typedef struct {
// If ndev returns 0, all other functions might be set to NULL.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create connections.
@@ -105,17 +108,17 @@ typedef struct {
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*flush)(void* collComm, void* data, int size, void* mhandle);
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free collective comm objects
ncclResult_t (*closeColl)(void* collComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v3_t;
} ncclCollNet_v4_t;
typedef ncclCollNet_v3_t ncclCollNet_t;
typedef ncclCollNet_v4_t ncclCollNet_t;
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v3
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v4
#endif // end include guard
+2 -2
Voir le fichier
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
@@ -25,7 +25,7 @@ static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, voi
static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; }
static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; }
static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclNet->flush(recvComm, data, size, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetIflush(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, data, size, mhandle, request)); return ncclSuccess; }
static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
+1 -15
Voir le fichier
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -45,14 +45,6 @@ static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index)
NVMLCHECK(nvmlDeviceGetIndex(device, index));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetHandleByPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
NVMLCHECK(nvmlDeviceGetPciInfo(device, pci));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
NVMLCHECK(nvmlDeviceGetNvLinkState(device, link, isActive));
return ncclSuccess;
@@ -66,10 +58,6 @@ static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsig
NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
return ncclSuccess;
}
static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor));
return ncclSuccess;
@@ -150,12 +138,10 @@ ncclResult_t wrapNvmlShutdown(void);
ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult);
ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
#endif // NVML_DIRECT
+14
Voir le fichier
@@ -0,0 +1,14 @@
/*************************************************************************
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_NVTX_H_
#define NCCL_NVTX_H_
#include "nvtx3.hpp"
struct nccl_domain{static constexpr char const* name{"NCCL"};};
#endif
+2268
Voir le fichier
Fichier diff supprimé car celui-ci est trop grand Voir la Diff
+1470
Voir le fichier
Fichier diff supprimé car celui-ci est trop grand Voir la Diff
+141
Voir le fichier
@@ -0,0 +1,141 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "nvToolsExt.h"
#include "cuda.h"
#ifndef NVTOOLSEXT_CUDA_V3
#define NVTOOLSEXT_CUDA_V3
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* ========================================================================= */
/** \name Functions for CUDA Resource Naming
*/
/** \addtogroup RESOURCE_NAMING
* \section RESOURCE_NAMING_CUDA CUDA Resource Naming
*
* This section covers the API functions that allow to annotate CUDA resources
* with user-provided names.
*
* @{
*/
/* ------------------------------------------------------------------------- */
/* \cond SHOW_HIDDEN
* \brief Used to build a non-colliding value for resource types separated class
* \version \NVTX_VERSION_2
*/
#define NVTX_RESOURCE_CLASS_CUDA 4
/** \endcond */
/* ------------------------------------------------------------------------- */
/** \brief Resource types for CUDA
*/
typedef enum nvtxResourceCUDAType_t
{
NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */
} nvtxResourceCUDAType_t;
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA device.
*
* Allows the user to associate a CUDA device with a user-provided name.
*
* \param device - The handle of the CUDA device to name.
* \param name - The name of the CUDA device.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA context.
*
* Allows the user to associate a CUDA context with a user-provided name.
*
* \param context - The handle of the CUDA context to name.
* \param name - The name of the CUDA context.
*
* \par Example:
* \code
* CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
* if ( CUDA_SUCCESS != status )
* goto Error;
* nvtxNameCuContext(cuContext, "CTX_NAME");
* \endcode
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA stream.
*
* Allows the user to associate a CUDA stream with a user-provided name.
*
* \param stream - The handle of the CUDA stream to name.
* \param name - The name of the CUDA stream.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA event.
*
* Allows the user to associate a CUDA event with a user-provided name.
*
* \param event - The handle of the CUDA event to name.
* \param name - The name of the CUDA event.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
/** @} */
/** @} */ /* END RESOURCE_NAMING */
/* ========================================================================= */
#ifdef UNICODE
#define nvtxNameCuDevice nvtxNameCuDeviceW
#define nvtxNameCuContext nvtxNameCuContextW
#define nvtxNameCuStream nvtxNameCuStreamW
#define nvtxNameCuEvent nvtxNameCuEventW
#else
#define nvtxNameCuDevice nvtxNameCuDeviceA
#define nvtxNameCuContext nvtxNameCuContextA
#define nvtxNameCuStream nvtxNameCuStreamA
#define nvtxNameCuEvent nvtxNameCuEventA
#endif
#ifdef __cplusplus
}
#endif /* __cplusplus */
#ifndef NVTX_NO_IMPL
#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot included directly */
#include "nvtxDetail/nvtxImplCuda_v3.h"
#undef NVTX_IMPL_GUARD_CUDA
#endif /*NVTX_NO_IMPL*/
#endif /* NVTOOLSEXT_CUDA_V3 */
+117
Voir le fichier
@@ -0,0 +1,117 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "nvToolsExt.h"
#include "cuda.h"
#include "driver_types.h"
#ifndef NVTOOLSEXT_CUDART_V3
#define NVTOOLSEXT_CUDART_V3
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* ========================================================================= */
/** \name Functions for CUDA Resource Naming
*/
/** \addtogroup RESOURCE_NAMING
* \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming
*
* This section covers the API functions that allow to annotate CUDA resources
* with user-provided names.
*
* @{
*/
/* ------------------------------------------------------------------------- */
/* \cond SHOW_HIDDEN
* \brief Used to build a non-colliding value for resource types separated class
* \version \NVTX_VERSION_2
*/
#define NVTX_RESOURCE_CLASS_CUDART 5
/** \endcond */
/* ------------------------------------------------------------------------- */
/** \brief Resource types for CUDART
*/
typedef enum nvtxResourceCUDARTType_t
{
NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */
NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */
NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */
} nvtxResourceCUDARTType_t;
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA device.
*
* Allows the user to associate a CUDA device with a user-provided name.
*
* \param device - The id of the CUDA device to name.
* \param name - The name of the CUDA device.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA stream.
*
* Allows the user to associate a CUDA stream with a user-provided name.
*
* \param stream - The handle of the CUDA stream to name.
* \param name - The name of the CUDA stream.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA event.
*
* Allows the user to associate a CUDA event with a user-provided name.
*
* \param event - The handle of the CUDA event to name.
* \param name - The name of the CUDA event.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
/** @} */
/** @} */ /* END RESOURCE_NAMING */
/* ========================================================================= */
#ifdef UNICODE
#define nvtxNameCudaDevice nvtxNameCudaDeviceW
#define nvtxNameCudaStream nvtxNameCudaStreamW
#define nvtxNameCudaEvent nvtxNameCudaEventW
#else
#define nvtxNameCudaDevice nvtxNameCudaDeviceA
#define nvtxNameCudaStream nvtxNameCudaStreamA
#define nvtxNameCudaEvent nvtxNameCudaEventA
#endif
#ifdef __cplusplus
}
#endif /* __cplusplus */
#ifndef NVTX_NO_IMPL
#define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot included directly */
#include "nvtxDetail/nvtxImplCudaRt_v3.h"
#undef NVTX_IMPL_GUARD_CUDART
#endif /*NVTX_NO_IMPL*/
#endif /* NVTOOLSEXT_CUDART_V3 */
+191
Voir le fichier
@@ -0,0 +1,191 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "nvToolsExt.h"
#include <CL/cl.h>
#ifndef NVTOOLSEXT_OPENCL_V3
#define NVTOOLSEXT_OPENCL_V3
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* ========================================================================= */
/** \name Functions for OpenCL Resource Naming
*/
/** \addtogroup RESOURCE_NAMING
* \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
*
* This section covers the API functions that allow to annotate OpenCL resources
* with user-provided names.
*
* @{
*/
/* ------------------------------------------------------------------------- */
/* \cond SHOW_HIDDEN
* \brief Used to build a non-colliding value for resource types separated class
* \version \NVTX_VERSION_2
*/
#define NVTX_RESOURCE_CLASS_OPENCL 6
/** \endcond */
/* ------------------------------------------------------------------------- */
/** \brief Resource types for OpenCL
*/
typedef enum nvtxResourceOpenCLType_t
{
NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1),
NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2),
NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3),
NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4),
NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5),
NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6),
NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7),
} nvtxResourceOpenCLType_t;
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL device.
*
* Allows to associate an OpenCL device with a user-provided name.
*
* \param device - The handle of the OpenCL device to name.
* \param name - The name of the OpenCL device.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL context.
*
* Allows to associate an OpenCL context with a user-provided name.
*
* \param context - The handle of the OpenCL context to name.
* \param name - The name of the OpenCL context.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL command queue.
*
* Allows to associate an OpenCL command queue with a user-provided name.
*
* \param command_queue - The handle of the OpenCL command queue to name.
* \param name - The name of the OpenCL command queue.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL memory object.
*
* Allows to associate an OpenCL memory object with a user-provided name.
*
* \param memobj - The handle of the OpenCL memory object to name.
* \param name - The name of the OpenCL memory object.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL sampler.
*
* Allows to associate an OpenCL sampler with a user-provided name.
*
* \param sampler - The handle of the OpenCL sampler to name.
* \param name - The name of the OpenCL sampler.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL program.
*
* Allows to associate an OpenCL program with a user-provided name.
*
* \param program - The handle of the OpenCL program to name.
* \param name - The name of the OpenCL program.
*
* \code
* cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
* (const char **) &cSourceCL, &program_length, &ciErrNum);
* shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
* nvtxNameClProgram(cpProgram, L"PROGRAM_NAME");
* \endcode
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates an OpenCL event.
*
* Allows to associate an OpenCL event with a user-provided name.
*
* \param evnt - The handle of the OpenCL event to name.
* \param name - The name of the OpenCL event.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
/** @} */
/** @} */ /* END RESOURCE_NAMING */
/* ========================================================================= */
#ifdef UNICODE
#define nvtxNameClDevice nvtxNameClDeviceW
#define nvtxNameClContext nvtxNameClContextW
#define nvtxNameClCommandQueue nvtxNameClCommandQueueW
#define nvtxNameClMemObject nvtxNameClMemObjectW
#define nvtxNameClSampler nvtxNameClSamplerW
#define nvtxNameClProgram nvtxNameClProgramW
#define nvtxNameClEvent nvtxNameClEventW
#else
#define nvtxNameClDevice nvtxNameClDeviceA
#define nvtxNameClContext nvtxNameClContextA
#define nvtxNameClCommandQueue nvtxNameClCommandQueueA
#define nvtxNameClMemObject nvtxNameClMemObjectA
#define nvtxNameClSampler nvtxNameClSamplerA
#define nvtxNameClProgram nvtxNameClProgramA
#define nvtxNameClEvent nvtxNameClEventA
#endif
#ifdef __cplusplus
}
#endif /* __cplusplus */
#ifndef NVTX_NO_IMPL
#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot included directly */
#include "nvtxDetail/nvtxImplOpenCL_v3.h"
#undef NVTX_IMPL_GUARD_OPENCL
#endif /*NVTX_NO_IMPL*/
#endif /* NVTOOLSEXT_OPENCL_V3 */
+382
Voir le fichier
@@ -0,0 +1,382 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "nvToolsExt.h"
#ifndef NVTOOLSEXT_SYNC_V3
#define NVTOOLSEXT_SYNC_V3
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* \cond SHOW_HIDDEN
* \version \NVTX_VERSION_2
*/
#define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxSyncUserAttributes_v0) ) )
/** \endcond */
/**
* \page PAGE_SYNCHRONIZATION Synchronization
*
* This section covers a subset of the API that allow users to track additional
* synchronization details of their application. Naming OS synchronization primitives
* may allow users to better understand the data collected by traced synchronization
* APIs. Additionally, a user defined synchronization object can allow the users to
* to tell the tools when the user is building their own synchronization system
* that do not rely on the OS to provide behaviors and instead use techniques like
* atomic operations and spinlocks.
*
* See module \ref SYNCHRONIZATION for details.
*
* \par Example:
* \code
* class MyMutex
* {
* volatile long bLocked;
* nvtxSyncUser_t hSync;
* public:
* MyMutex(const char* name, nvtxDomainHandle_t d){
* bLocked = 0;
*
* nvtxSyncUserAttributes_t attribs = { 0 };
* attribs.version = NVTX_VERSION;
* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
* attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
* attribs.message.ascii = name;
* hSync = nvtxDomainSyncUserCreate(d, &attribs);
* }
*
* ~MyMutex() {
* nvtxDomainSyncUserDestroy(hSync);
* }
*
* bool Lock() {
* nvtxDomainSyncUserAcquireStart(hSync);
* bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic
* if (acquired) {
* nvtxDomainSyncUserAcquireSuccess(hSync);
* }
* else {
* nvtxDomainSyncUserAcquireFailed(hSync);
* }
* return acquired;
* }
* void Unlock() {
* nvtxDomainSyncUserReleasing(hSync);
* bLocked = false;
* }
* };
* \endcode
*
* \version \NVTX_VERSION_2
*/
/* ------------------------------------------------------------------------- */
/* \cond SHOW_HIDDEN
* \brief Used to build a non-colliding value for resource types separated class
* \version \NVTX_VERSION_2
*/
#define NVTX_RESOURCE_CLASS_SYNC_OS 2 /**< Synchronization objects that are OS specific. */
#define NVTX_RESOURCE_CLASS_SYNC_PTHREAD 3 /**< Synchronization objects that are from the POSIX Threads API (pthread)*/
/** \endcond */
/* ------------------------------------------------------------------------- */
/** \defgroup SYNCHRONIZATION Synchronization
* See page \ref PAGE_SYNCHRONIZATION.
* @{
*/
/** \brief Resource type values for OSs with POSIX Thread API support
*/
typedef enum nvtxResourceSyncPosixThreadType_t
{
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 1), /* pthread_mutex_t */
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_CONDITION = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 2), /* pthread_cond_t */
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_RWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 3), /* pthread_rwlock_t */
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_BARRIER = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 4), /* pthread_barrier_t */
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 5), /* pthread_spinlock_t */
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_ONCE = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 6) /* pthread_once_t */
} nvtxResourceSyncPosixThreadType_t;
/** \brief Resource type values for Windows OSs
*/
typedef enum nvtxResourceSyncWindowsType_t
{
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_EVENT = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_CRITICAL_SECTION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SRWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5)
} nvtxResourceSyncWindowsType_t;
/** \brief Resource type values for Linux and Linux derived OSs such as Android
* \sa
* ::nvtxResourceSyncPosixThreadType_t
*/
typedef enum nvtxResourceSyncLinuxType_t
{
NVTX_RESOURCE_TYPE_SYNC_LINUX_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
NVTX_RESOURCE_TYPE_SYNC_LINUX_FUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
NVTX_RESOURCE_TYPE_SYNC_LINUX_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
NVTX_RESOURCE_TYPE_SYNC_LINUX_COMPLETION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
NVTX_RESOURCE_TYPE_SYNC_LINUX_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5),
NVTX_RESOURCE_TYPE_SYNC_LINUX_SEQLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 6),
NVTX_RESOURCE_TYPE_SYNC_LINUX_RCU = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 7)
} nvtxResourceSyncLinuxType_t;
/** \brief Resource type values for Android come from Linux.
* \sa
* ::nvtxResourceSyncLinuxType_t
* ::nvtxResourceSyncPosixThreadType_t
*/
typedef enum nvtxResourceSyncLinuxType_t nvtxResourceSyncAndroidType_t;
/** \brief User Defined Synchronization Object Handle .
* \anchor SYNCUSER_HANDLE_STRUCTURE
*
* This structure is opaque to the user and is used as a handle to reference
* a user defined syncrhonization object. The tools will return a pointer through the API for the application
* to hold on it's behalf to reference the string in the future.
*
*/
typedef struct nvtxSyncUser* nvtxSyncUser_t;
/** \brief User Defined Synchronization Object Attributes Structure.
* \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE
*
* This structure is used to describe the attributes of a user defined synchronization
* object. The layout of the structure is defined by a specific version of the tools
* extension library and can change between different versions of the Tools Extension
* library.
*
* \par Initializing the Attributes
*
* The caller should always perform the following three tasks when using
* attributes:
* <ul>
* <li>Zero the structure
* <li>Set the version field
* <li>Set the size field
* </ul>
*
* Zeroing the structure sets all the event attributes types and values
* to the default value.
*
* The version and size field are used by the Tools Extension
* implementation to handle multiple versions of the attributes structure.
*
* It is recommended that the caller use one of the following to methods
* to initialize the event attributes structure:
*
* \par Method 1: Initializing nvtxEventAttributes for future compatibility
* \code
* nvtxSyncUserAttributes_t attribs = {0};
* attribs.version = NVTX_VERSION;
* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
* \endcode
*
* \par Method 2: Initializing nvtxSyncUserAttributes_t for a specific version
* \code
* nvtxSyncUserAttributes_t attribs = {0};
* attribs.version = 1;
* attribs.size = (uint16_t)(sizeof(nvtxSyncUserAttributes_t));
* \endcode
*
* If the caller uses Method 1 it is critical that the entire binary
* layout of the structure be configured to 0 so that all fields
* are initialized to the default value.
*
* The caller should either use both NVTX_VERSION and
* NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
* and a versioned type (Method 2). Using a mix of the two methods
* will likely cause either source level incompatibility or binary
* incompatibility in the future.
*
* \par Settings Attribute Types and Values
*
*
* \par Example:
* \code
* // Initialize
* nvtxSyncUserAttributes_t attribs = {0};
* attribs.version = NVTX_VERSION;
* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
*
* // Configure the Attributes
* attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
* attribs.message.ascii = "Example";
* \endcode
*
* \sa
* ::nvtxDomainSyncUserCreate
*/
typedef struct nvtxSyncUserAttributes_v0
{
/**
* \brief Version flag of the structure.
*
* Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
* supported in this header file. This can optionally be overridden to
* another version of the tools extension library.
*/
uint16_t version;
/**
* \brief Size of the structure.
*
* Needs to be set to the size in bytes of the event attribute
* structure used to specify the event.
*/
uint16_t size;
/** \brief Message type specified in this attribute structure.
*
* Defines the message format of the attribute structure's \ref nvtxSyncUserAttributes_v0::message
* "message" field.
*
* Default Value is NVTX_MESSAGE_UNKNOWN
*/
int32_t messageType; /* nvtxMessageType_t */
/** \brief Message assigned to this attribute structure.
*
* The text message that is attached to an event.
*/
nvtxMessageValue_t message;
} nvtxSyncUserAttributes_v0;
typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
/* ------------------------------------------------------------------------- */
/** \brief Create a user defined synchronization object
* This is used to track non-OS synchronization working with spinlocks and atomics
*
* \param domain - Domain to own the resource
* \param attribs - A structure to assign multiple attributes to the object.
*
* \return A handle that represents the newly created user defined synchronization object.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/
NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
/* ------------------------------------------------------------------------- */
/** \brief Destroy a user defined synchronization object
* This is used to track non-OS synchronization working with spinlocks and atomics
*
* \param handle - A handle to the object to operate on.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle);
/* ------------------------------------------------------------------------- */
/** \brief Signal to tools that an attempt to acquire a user defined synchronization object
*
* \param handle - A handle to the object to operate on.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle);
/* ------------------------------------------------------------------------- */
/** \brief Signal to tools of failure in acquiring a user defined synchronization object
* This should be called after \ref nvtxDomainSyncUserAcquireStart
*
* \param handle - A handle to the object to operate on.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle);
/* ------------------------------------------------------------------------- */
/** \brief Signal to tools of success in acquiring a user defined synchronization object
* This should be called after \ref nvtxDomainSyncUserAcquireStart.
*
* \param handle - A handle to the object to operate on.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle);
/* ------------------------------------------------------------------------- */
/** \brief Signal to tools of releasing a reservation on user defined synchronization object
* This should be called after \ref nvtxDomainSyncUserAcquireSuccess.
*
* \param handle - A handle to the object to operate on.
*
* \sa
* ::nvtxDomainSyncUserCreate
* ::nvtxDomainSyncUserDestroy
* ::nvtxDomainSyncUserAcquireStart
* ::nvtxDomainSyncUserAcquireFailed
* ::nvtxDomainSyncUserAcquireSuccess
* ::nvtxDomainSyncUserReleasing
*
* \version \NVTX_VERSION_2
*/
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle);
/** @} */ /*END defgroup*/
#ifdef __cplusplus
}
#endif /* __cplusplus */
#ifndef NVTX_NO_IMPL
#define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot included directly */
#include "nvtxDetail/nvtxImplSync_v3.h"
#undef NVTX_IMPL_GUARD_SYNC
#endif /*NVTX_NO_IMPL*/
#endif /* NVTOOLSEXT_SYNC_V3 */
+438
Voir le fichier
@@ -0,0 +1,438 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
#endif
/* ---- Include required platform headers ---- */
#if defined(_WIN32)
#include <Windows.h>
#else
#include <unistd.h>
#if defined(__ANDROID__)
#include <android/api-level.h>
#endif
#if defined(__linux__) || defined(__CYGWIN__)
#include <sched.h>
#endif
#include <limits.h>
#include <dlfcn.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include <pthread.h>
#include <stdlib.h>
#include <wchar.h>
#endif
/* ---- Define macros used in this file ---- */
#define NVTX_INIT_STATE_FRESH 0
#define NVTX_INIT_STATE_STARTED 1
#define NVTX_INIT_STATE_COMPLETE 2
#ifdef NVTX_DEBUG_PRINT
#ifdef __ANDROID__
#include <android/log.h>
#define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__);
#define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__);
#else
#include <stdio.h>
#define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
#define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
#endif
#else /* !defined(NVTX_DEBUG_PRINT) */
#define NVTX_ERR(...)
#define NVTX_INFO(...)
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#ifdef __GNUC__
#pragma GCC visibility push(hidden)
#endif
/* ---- Forward declare all functions referenced in globals ---- */
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
NvtxCallbackModule module,
NvtxFunctionTable* out_table,
unsigned int* out_size);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(
uint32_t version);
NVTX_LINKONCE_FWDDECL_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(
uint32_t exportTableId);
#include "nvtxInitDecls.h"
/* ---- Define all globals ---- */
typedef struct nvtxGlobals_t
{
volatile unsigned int initState;
NvtxExportTableCallbacks etblCallbacks;
NvtxExportTableVersionInfo etblVersionInfo;
/* Implementation function pointers */
nvtxMarkEx_impl_fntype nvtxMarkEx_impl_fnptr;
nvtxMarkA_impl_fntype nvtxMarkA_impl_fnptr;
nvtxMarkW_impl_fntype nvtxMarkW_impl_fnptr;
nvtxRangeStartEx_impl_fntype nvtxRangeStartEx_impl_fnptr;
nvtxRangeStartA_impl_fntype nvtxRangeStartA_impl_fnptr;
nvtxRangeStartW_impl_fntype nvtxRangeStartW_impl_fnptr;
nvtxRangeEnd_impl_fntype nvtxRangeEnd_impl_fnptr;
nvtxRangePushEx_impl_fntype nvtxRangePushEx_impl_fnptr;
nvtxRangePushA_impl_fntype nvtxRangePushA_impl_fnptr;
nvtxRangePushW_impl_fntype nvtxRangePushW_impl_fnptr;
nvtxRangePop_impl_fntype nvtxRangePop_impl_fnptr;
nvtxNameCategoryA_impl_fntype nvtxNameCategoryA_impl_fnptr;
nvtxNameCategoryW_impl_fntype nvtxNameCategoryW_impl_fnptr;
nvtxNameOsThreadA_impl_fntype nvtxNameOsThreadA_impl_fnptr;
nvtxNameOsThreadW_impl_fntype nvtxNameOsThreadW_impl_fnptr;
nvtxNameCuDeviceA_fakeimpl_fntype nvtxNameCuDeviceA_impl_fnptr;
nvtxNameCuDeviceW_fakeimpl_fntype nvtxNameCuDeviceW_impl_fnptr;
nvtxNameCuContextA_fakeimpl_fntype nvtxNameCuContextA_impl_fnptr;
nvtxNameCuContextW_fakeimpl_fntype nvtxNameCuContextW_impl_fnptr;
nvtxNameCuStreamA_fakeimpl_fntype nvtxNameCuStreamA_impl_fnptr;
nvtxNameCuStreamW_fakeimpl_fntype nvtxNameCuStreamW_impl_fnptr;
nvtxNameCuEventA_fakeimpl_fntype nvtxNameCuEventA_impl_fnptr;
nvtxNameCuEventW_fakeimpl_fntype nvtxNameCuEventW_impl_fnptr;
nvtxNameClDeviceA_fakeimpl_fntype nvtxNameClDeviceA_impl_fnptr;
nvtxNameClDeviceW_fakeimpl_fntype nvtxNameClDeviceW_impl_fnptr;
nvtxNameClContextA_fakeimpl_fntype nvtxNameClContextA_impl_fnptr;
nvtxNameClContextW_fakeimpl_fntype nvtxNameClContextW_impl_fnptr;
nvtxNameClCommandQueueA_fakeimpl_fntype nvtxNameClCommandQueueA_impl_fnptr;
nvtxNameClCommandQueueW_fakeimpl_fntype nvtxNameClCommandQueueW_impl_fnptr;
nvtxNameClMemObjectA_fakeimpl_fntype nvtxNameClMemObjectA_impl_fnptr;
nvtxNameClMemObjectW_fakeimpl_fntype nvtxNameClMemObjectW_impl_fnptr;
nvtxNameClSamplerA_fakeimpl_fntype nvtxNameClSamplerA_impl_fnptr;
nvtxNameClSamplerW_fakeimpl_fntype nvtxNameClSamplerW_impl_fnptr;
nvtxNameClProgramA_fakeimpl_fntype nvtxNameClProgramA_impl_fnptr;
nvtxNameClProgramW_fakeimpl_fntype nvtxNameClProgramW_impl_fnptr;
nvtxNameClEventA_fakeimpl_fntype nvtxNameClEventA_impl_fnptr;
nvtxNameClEventW_fakeimpl_fntype nvtxNameClEventW_impl_fnptr;
nvtxNameCudaDeviceA_impl_fntype nvtxNameCudaDeviceA_impl_fnptr;
nvtxNameCudaDeviceW_impl_fntype nvtxNameCudaDeviceW_impl_fnptr;
nvtxNameCudaStreamA_fakeimpl_fntype nvtxNameCudaStreamA_impl_fnptr;
nvtxNameCudaStreamW_fakeimpl_fntype nvtxNameCudaStreamW_impl_fnptr;
nvtxNameCudaEventA_fakeimpl_fntype nvtxNameCudaEventA_impl_fnptr;
nvtxNameCudaEventW_fakeimpl_fntype nvtxNameCudaEventW_impl_fnptr;
nvtxDomainMarkEx_impl_fntype nvtxDomainMarkEx_impl_fnptr;
nvtxDomainRangeStartEx_impl_fntype nvtxDomainRangeStartEx_impl_fnptr;
nvtxDomainRangeEnd_impl_fntype nvtxDomainRangeEnd_impl_fnptr;
nvtxDomainRangePushEx_impl_fntype nvtxDomainRangePushEx_impl_fnptr;
nvtxDomainRangePop_impl_fntype nvtxDomainRangePop_impl_fnptr;
nvtxDomainResourceCreate_impl_fntype nvtxDomainResourceCreate_impl_fnptr;
nvtxDomainResourceDestroy_impl_fntype nvtxDomainResourceDestroy_impl_fnptr;
nvtxDomainNameCategoryA_impl_fntype nvtxDomainNameCategoryA_impl_fnptr;
nvtxDomainNameCategoryW_impl_fntype nvtxDomainNameCategoryW_impl_fnptr;
nvtxDomainRegisterStringA_impl_fntype nvtxDomainRegisterStringA_impl_fnptr;
nvtxDomainRegisterStringW_impl_fntype nvtxDomainRegisterStringW_impl_fnptr;
nvtxDomainCreateA_impl_fntype nvtxDomainCreateA_impl_fnptr;
nvtxDomainCreateW_impl_fntype nvtxDomainCreateW_impl_fnptr;
nvtxDomainDestroy_impl_fntype nvtxDomainDestroy_impl_fnptr;
nvtxInitialize_impl_fntype nvtxInitialize_impl_fnptr;
nvtxDomainSyncUserCreate_impl_fntype nvtxDomainSyncUserCreate_impl_fnptr;
nvtxDomainSyncUserDestroy_impl_fntype nvtxDomainSyncUserDestroy_impl_fnptr;
nvtxDomainSyncUserAcquireStart_impl_fntype nvtxDomainSyncUserAcquireStart_impl_fnptr;
nvtxDomainSyncUserAcquireFailed_impl_fntype nvtxDomainSyncUserAcquireFailed_impl_fnptr;
nvtxDomainSyncUserAcquireSuccess_impl_fntype nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
nvtxDomainSyncUserReleasing_impl_fntype nvtxDomainSyncUserReleasing_impl_fnptr;
/* Tables of function pointers -- Extra null added to the end to ensure
* a crash instead of silent corruption if a tool reads off the end. */
NvtxFunctionPointer* functionTable_CORE [NVTX_CBID_CORE_SIZE + 1];
NvtxFunctionPointer* functionTable_CUDA [NVTX_CBID_CUDA_SIZE + 1];
NvtxFunctionPointer* functionTable_OPENCL[NVTX_CBID_OPENCL_SIZE + 1];
NvtxFunctionPointer* functionTable_CUDART[NVTX_CBID_CUDART_SIZE + 1];
NvtxFunctionPointer* functionTable_CORE2 [NVTX_CBID_CORE2_SIZE + 1];
NvtxFunctionPointer* functionTable_SYNC [NVTX_CBID_SYNC_SIZE + 1];
} nvtxGlobals_t;
NVTX_LINKONCE_DEFINE_GLOBAL nvtxGlobals_t NVTX_VERSIONED_IDENTIFIER(nvtxGlobals) =
{
NVTX_INIT_STATE_FRESH,
{
sizeof(NvtxExportTableCallbacks),
NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)
},
{
sizeof(NvtxExportTableVersionInfo),
NVTX_VERSION,
0,
NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)
},
/* Implementation function pointers */
NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init),
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init),
/* Tables of function pointers */
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr,
0
},
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr,
0
},
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr,
0
},
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr,
0
},
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr,
0
},
{
0,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr,
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr,
0
}
};
/* ---- Define static inline implementations of core API functions ---- */
#include "nvtxImplCore.h"
/* ---- Define implementations of export table functions ---- */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
NvtxCallbackModule module,
NvtxFunctionTable* out_table,
unsigned int* out_size)
{
unsigned int bytes = 0;
NvtxFunctionTable table = (NvtxFunctionTable)0;
switch (module)
{
case NVTX_CB_MODULE_CORE:
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE;
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE);
break;
case NVTX_CB_MODULE_CUDA:
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA;
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA);
break;
case NVTX_CB_MODULE_OPENCL:
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL;
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL);
break;
case NVTX_CB_MODULE_CUDART:
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART;
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART);
break;
case NVTX_CB_MODULE_CORE2:
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2;
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2);
break;
case NVTX_CB_MODULE_SYNC:
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC;
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC);
break;
default: return 0;
}
if (out_size)
*out_size = (bytes / (unsigned int)sizeof(NvtxFunctionPointer*)) - 1;
if (out_table)
*out_table = table;
return 1;
}
NVTX_LINKONCE_DEFINE_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(uint32_t exportTableId)
{
switch (exportTableId)
{
case NVTX_ETID_CALLBACKS: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblCallbacks;
case NVTX_ETID_VERSIONINFO: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblVersionInfo;
default: return 0;
}
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(uint32_t version)
{
/* Reserved for custom implementations to resolve problems with tools */
(void)version;
}
/* ---- Define implementations of init versions of all API functions ---- */
#include "nvtxInitDefs.h"
/* ---- Define implementations of initialization functions ---- */
#include "nvtxInit.h"
#ifdef __GNUC__
#pragma GCC visibility pop
#endif
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
+307
Voir le fichier
@@ -0,0 +1,307 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
NVTX_DECLSPEC void NVTX_API nvtxMarkEx(const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
nvtxMarkEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr;
if(local!=0)
(*local)(eventAttrib);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxMarkA(const char* message)
{
#ifndef NVTX_DISABLE
nvtxMarkA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr;
if(local!=0)
(*local)(message);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxMarkW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
nvtxMarkW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr;
if(local!=0)
(*local)(message);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartEx(const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
nvtxRangeStartEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr;
if(local!=0)
return (*local)(eventAttrib);
else
#endif /*NVTX_DISABLE*/
return (nvtxRangeId_t)0;
}
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartA(const char* message)
{
#ifndef NVTX_DISABLE
nvtxRangeStartA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr;
if(local!=0)
return (*local)(message);
else
#endif /*NVTX_DISABLE*/
return (nvtxRangeId_t)0;
}
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
nvtxRangeStartW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr;
if(local!=0)
return (*local)(message);
else
#endif /*NVTX_DISABLE*/
return (nvtxRangeId_t)0;
}
NVTX_DECLSPEC void NVTX_API nvtxRangeEnd(nvtxRangeId_t id)
{
#ifndef NVTX_DISABLE
nvtxRangeEnd_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr;
if(local!=0)
(*local)(id);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC int NVTX_API nvtxRangePushEx(const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
nvtxRangePushEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr;
if(local!=0)
return (*local)(eventAttrib);
else
#endif /*NVTX_DISABLE*/
return (int)NVTX_NO_PUSH_POP_TRACKING;
}
NVTX_DECLSPEC int NVTX_API nvtxRangePushA(const char* message)
{
#ifndef NVTX_DISABLE
nvtxRangePushA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr;
if(local!=0)
return (*local)(message);
else
#endif /*NVTX_DISABLE*/
return (int)NVTX_NO_PUSH_POP_TRACKING;
}
NVTX_DECLSPEC int NVTX_API nvtxRangePushW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
nvtxRangePushW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr;
if(local!=0)
return (*local)(message);
else
#endif /*NVTX_DISABLE*/
return (int)NVTX_NO_PUSH_POP_TRACKING;
}
NVTX_DECLSPEC int NVTX_API nvtxRangePop(void)
{
#ifndef NVTX_DISABLE
nvtxRangePop_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr;
if(local!=0)
return (*local)();
else
#endif /*NVTX_DISABLE*/
return (int)NVTX_NO_PUSH_POP_TRACKING;
}
NVTX_DECLSPEC void NVTX_API nvtxNameCategoryA(uint32_t category, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCategoryA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr;
if(local!=0)
(*local)(category, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCategoryW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr;
if(local!=0)
(*local)(category, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadA(uint32_t threadId, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameOsThreadA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr;
if(local!=0)
(*local)(threadId, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadW(uint32_t threadId, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameOsThreadW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr;
if(local!=0)
(*local)(threadId, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
nvtxDomainMarkEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr;
if(local!=0)
(*local)(domain, eventAttrib);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxDomainRangeStartEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
nvtxDomainRangeStartEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr;
if(local!=0)
return (*local)(domain, eventAttrib);
else
#endif /*NVTX_DISABLE*/
return (nvtxRangeId_t)0;
}
NVTX_DECLSPEC void NVTX_API nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id)
{
#ifndef NVTX_DISABLE
nvtxDomainRangeEnd_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr;
if(local!=0)
(*local)(domain, id);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC int NVTX_API nvtxDomainRangePushEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
nvtxDomainRangePushEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr;
if(local!=0)
return (*local)(domain, eventAttrib);
else
#endif /*NVTX_DISABLE*/
return (int)NVTX_NO_PUSH_POP_TRACKING;
}
NVTX_DECLSPEC int NVTX_API nvtxDomainRangePop(nvtxDomainHandle_t domain)
{
#ifndef NVTX_DISABLE
nvtxDomainRangePop_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr;
if(local!=0)
return (*local)(domain);
else
#endif /*NVTX_DISABLE*/
return (int)NVTX_NO_PUSH_POP_TRACKING;
}
NVTX_DECLSPEC nvtxResourceHandle_t NVTX_API nvtxDomainResourceCreate(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs)
{
#ifndef NVTX_DISABLE
nvtxDomainResourceCreate_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr;
if(local!=0)
return (*local)(domain, attribs);
else
#endif /*NVTX_DISABLE*/
return (nvtxResourceHandle_t)0;
}
NVTX_DECLSPEC void NVTX_API nvtxDomainResourceDestroy(nvtxResourceHandle_t resource)
{
#ifndef NVTX_DISABLE
nvtxDomainResourceDestroy_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr;
if(local!=0)
(*local)(resource);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category, const char* name)
{
#ifndef NVTX_DISABLE
nvtxDomainNameCategoryA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr;
if(local!=0)
(*local)(domain, category, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxDomainNameCategoryW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr;
if(local!=0)
(*local)(domain, category, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringA(nvtxDomainHandle_t domain, const char* string)
{
#ifndef NVTX_DISABLE
nvtxDomainRegisterStringA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr;
if(local!=0)
return (*local)(domain, string);
else
#endif /*NVTX_DISABLE*/
return (nvtxStringHandle_t)0;
}
NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringW(nvtxDomainHandle_t domain, const wchar_t* string)
{
#ifndef NVTX_DISABLE
nvtxDomainRegisterStringW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr;
if(local!=0)
return (*local)(domain, string);
else
#endif /*NVTX_DISABLE*/
return (nvtxStringHandle_t)0;
}
NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateA(const char* message)
{
#ifndef NVTX_DISABLE
nvtxDomainCreateA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr;
if(local!=0)
return (*local)(message);
else
#endif /*NVTX_DISABLE*/
return (nvtxDomainHandle_t)0;
}
NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
nvtxDomainCreateW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr;
if(local!=0)
return (*local)(message);
else
#endif /*NVTX_DISABLE*/
return (nvtxDomainHandle_t)0;
}
NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain)
{
#ifndef NVTX_DISABLE
nvtxDomainDestroy_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr;
if(local!=0)
(*local)(domain);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved)
{
#ifndef NVTX_DISABLE
nvtxInitialize_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr;
if(local!=0)
(*local)(reserved);
#endif /*NVTX_DISABLE*/
}
+81
Voir le fichier
@@ -0,0 +1,81 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD_CUDART
#error Never include this file directly -- it is automatically included by nvToolsExtCudaRt.h (except when NVTX_NO_IMPL is defined).
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name);
typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name);
typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaDeviceA_impl_fntype local = (nvtxNameCudaDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr;
if(local!=0)
(*local)(device, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaDeviceW_impl_fntype local = (nvtxNameCudaDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr;
if(local!=0)
(*local)(device, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaStreamA_impl_fntype local = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
if(local!=0)
(*local)(stream, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaStreamW_impl_fntype local = (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
if(local!=0)
(*local)(stream, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaEventA_impl_fntype local = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
if(local!=0)
(*local)(event, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaEventW_impl_fntype local = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
if(local!=0)
(*local)(event, name);
#endif /*NVTX_DISABLE*/
}
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
+102
Voir le fichier
@@ -0,0 +1,102 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD_CUDA
#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name);
typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name);
typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name);
typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name);
typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuDeviceA_impl_fntype local = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
if(local!=0)
(*local)(device, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuDeviceW_impl_fntype local = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
if(local!=0)
(*local)(device, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuContextA_impl_fntype local = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
if(local!=0)
(*local)(context, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuContextW_impl_fntype local = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
if(local!=0)
(*local)(context, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuStreamA_impl_fntype local = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
if(local!=0)
(*local)(stream, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuStreamW_impl_fntype local = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
if(local!=0)
(*local)(stream, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuEventA_impl_fntype local = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
if(local!=0)
(*local)(event, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuEventW_impl_fntype local = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
if(local!=0)
(*local)(event, name);
#endif /*NVTX_DISABLE*/
}
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
+161
Voir le fichier
@@ -0,0 +1,161 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD_OPENCL
#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
typedef void (NVTX_API * nvtxNameClDeviceA_impl_fntype)(cl_device_id device, const char* name);
typedef void (NVTX_API * nvtxNameClDeviceW_impl_fntype)(cl_device_id device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClContextA_impl_fntype)(cl_context context, const char* name);
typedef void (NVTX_API * nvtxNameClContextW_impl_fntype)(cl_context context, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClCommandQueueA_impl_fntype)(cl_command_queue command_queue, const char* name);
typedef void (NVTX_API * nvtxNameClCommandQueueW_impl_fntype)(cl_command_queue command_queue, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClMemObjectA_impl_fntype)(cl_mem memobj, const char* name);
typedef void (NVTX_API * nvtxNameClMemObjectW_impl_fntype)(cl_mem memobj, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClSamplerA_impl_fntype)(cl_sampler sampler, const char* name);
typedef void (NVTX_API * nvtxNameClSamplerW_impl_fntype)(cl_sampler sampler, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClProgramA_impl_fntype)(cl_program program, const char* name);
typedef void (NVTX_API * nvtxNameClProgramW_impl_fntype)(cl_program program, const wchar_t* name);
typedef void (NVTX_API * nvtxNameClEventA_impl_fntype)(cl_event evnt, const char* name);
typedef void (NVTX_API * nvtxNameClEventW_impl_fntype)(cl_event evnt, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameClDeviceA_impl_fntype local = (nvtxNameClDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr;
if(local!=0)
(*local)(device, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameClDeviceW_impl_fntype local = (nvtxNameClDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr;
if(local!=0)
(*local)(device, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameClContextA_impl_fntype local = (nvtxNameClContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr;
if(local!=0)
(*local)(context, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameClContextW_impl_fntype local = (nvtxNameClContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr;
if(local!=0)
(*local)(context, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameClCommandQueueA_impl_fntype local = (nvtxNameClCommandQueueA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr;
if(local!=0)
(*local)(command_queue, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameClCommandQueueW_impl_fntype local = (nvtxNameClCommandQueueW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr;
if(local!=0)
(*local)(command_queue, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameClMemObjectA_impl_fntype local = (nvtxNameClMemObjectA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr;
if(local!=0)
(*local)(memobj, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameClMemObjectW_impl_fntype local = (nvtxNameClMemObjectW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr;
if(local!=0)
(*local)(memobj, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameClSamplerA_impl_fntype local = (nvtxNameClSamplerA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr;
if(local!=0)
(*local)(sampler, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameClSamplerW_impl_fntype local = (nvtxNameClSamplerW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr;
if(local!=0)
(*local)(sampler, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameClProgramA_impl_fntype local = (nvtxNameClProgramA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr;
if(local!=0)
(*local)(program, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameClProgramW_impl_fntype local = (nvtxNameClProgramW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr;
if(local!=0)
(*local)(program, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameClEventA_impl_fntype local = (nvtxNameClEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr;
if(local!=0)
(*local)(evnt, name);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameClEventW_impl_fntype local = (nvtxNameClEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr;
if(local!=0)
(*local)(evnt, name);
#endif /*NVTX_DISABLE*/
}
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
+83
Voir le fichier
@@ -0,0 +1,83 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD_SYNC
#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle);
typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle);
NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs)
{
#ifndef NVTX_DISABLE
nvtxDomainSyncUserCreate_impl_fntype local = (nvtxDomainSyncUserCreate_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
if(local!=0)
return (*local)(domain, attribs);
else
#endif /*NVTX_DISABLE*/
return (nvtxSyncUser_t)0;
}
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
nvtxDomainSyncUserDestroy_impl_fntype local = (nvtxDomainSyncUserDestroy_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
if(local!=0)
(*local)(handle);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
nvtxDomainSyncUserAcquireStart_impl_fntype local = (nvtxDomainSyncUserAcquireStart_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;
if(local!=0)
(*local)(handle);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
nvtxDomainSyncUserAcquireFailed_impl_fntype local = (nvtxDomainSyncUserAcquireFailed_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;
if(local!=0)
(*local)(handle);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
nvtxDomainSyncUserAcquireSuccess_impl_fntype local = (nvtxDomainSyncUserAcquireSuccess_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
if(local!=0)
(*local)(handle);
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
nvtxDomainSyncUserReleasing_impl_fntype local = (nvtxDomainSyncUserReleasing_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;
if(local!=0)
(*local)(handle);
#endif /*NVTX_DISABLE*/
}
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
+312
Voir le fichier
@@ -0,0 +1,312 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
#endif
/* ---- Platform-independent helper definitions and functions ---- */
/* Prefer macros over inline functions to reduce symbol resolution at link time */
#if defined(_WIN32)
#define NVTX_PATHCHAR wchar_t
#define NVTX_STR(x) L##x
#define NVTX_GETENV _wgetenv
#define NVTX_BUFSIZE MAX_PATH
#define NVTX_DLLHANDLE HMODULE
#define NVTX_DLLOPEN(x) LoadLibraryW(x)
#define NVTX_DLLFUNC GetProcAddress
#define NVTX_DLLCLOSE FreeLibrary
#define NVTX_YIELD() SwitchToThread()
#define NVTX_MEMBAR() MemoryBarrier()
#define NVTX_ATOMIC_WRITE_32(address, value) InterlockedExchange((volatile LONG*)address, value)
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) old = InterlockedCompareExchange((volatile LONG*)address, exchange, comparand)
#elif defined(__GNUC__)
#define NVTX_PATHCHAR char
#define NVTX_STR(x) x
#define NVTX_GETENV getenv
#define NVTX_BUFSIZE PATH_MAX
#define NVTX_DLLHANDLE void*
#define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
#define NVTX_DLLFUNC dlsym
#define NVTX_DLLCLOSE dlclose
#define NVTX_YIELD() sched_yield()
#define NVTX_MEMBAR() __sync_synchronize()
/* Ensure full memory barrier for atomics, to match Windows functions */
#define NVTX_ATOMIC_WRITE_32(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value)
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand)
#else
#error The library does not support your configuration!
#endif
/* Define this to 1 for platforms that where pre-injected libraries can be discovered. */
#if defined(_WIN32)
/* TODO */
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
#else
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
#endif
/* Define this to 1 for platforms that support environment variables */
/* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */
/* Try: #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */
#define NVTX_SUPPORT_ENV_VARS 1
/* Define this to 1 for platforms that support dynamic/shared libraries */
#define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1
/* Injection libraries implementing InitializeInjectionNvtx2 may be statically linked,
* and this will override any dynamic injection. Useful for platforms where dynamic
* injection is not available. Since weak symbols not explicitly marked extern are
* guaranteed to be initialized to zero if no definitions are found by the linker, the
* dynamic injection process proceeds normally if pfnInitializeInjectionNvtx2 is 0. */
#if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__)
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1
/* To statically inject an NVTX library, define InitializeInjectionNvtx2_fnptr as a normal
* symbol (not weak) pointing to the implementation of InitializeInjectionNvtx2 (which
* does not need to be named "InitializeInjectionNvtx2" as is necessary in a dynamic
* injection library. */
__attribute__((weak)) NvtxInitializeInjectionNvtxFunc_t InitializeInjectionNvtx2_fnptr;
#else
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0
#endif
/* This function tries to find or load an NVTX injection library and get the
* address of its InitializeInjection2 function. If such a function pointer
* is found, it is called, and passed the address of this NVTX instance's
* nvtxGetExportTable function, so the injection can attach to this instance.
* If the initialization fails for any reason, any dynamic library loaded will
* be freed, and all NVTX implementation functions will be set to no-ops. If
* initialization succeeds, NVTX functions not attached to the tool will be set
* to no-ops. This is implemented as one function instead of several small
* functions to minimize the number of weak symbols the linker must resolve.
* Order of search is:
* - Pre-injected library exporting InitializeInjectionNvtx2
* - Loadable library exporting InitializeInjectionNvtx2
* - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64)
* - On Android, libNvtxInjection??.so within the package (?? is 32 or 64)
* - Statically-linked injection library defining InitializeInjectionNvtx2_fnptr
*/
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void);
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void)
{
const char* const initFuncName = "InitializeInjectionNvtx2";
NvtxInitializeInjectionNvtxFunc_t init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)0;
NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0;
int entryPointStatus = 0;
#if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY
/* Use POSIX global symbol chain to query for init function from any module */
init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(0, initFuncName);
#endif
#if NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY
/* Try discovering dynamic injection library to load */
if (!init_fnptr)
{
#if NVTX_SUPPORT_ENV_VARS
/* If env var NVTX_INJECTION64_PATH is set, it should contain the path
* to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */
const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4)
? NVTX_STR("NVTX_INJECTION32_PATH")
: NVTX_STR("NVTX_INJECTION64_PATH");
#endif /* NVTX_SUPPORT_ENV_VARS */
NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE];
const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0;
/* Refer to this variable explicitly in case all references to it are #if'ed out */
(void)injectionLibraryPathBuf;
#if NVTX_SUPPORT_ENV_VARS
/* Disable the warning for getenv & _wgetenv -- this usage is safe because
* these functions are not called again before using the returned value. */
#if defined(_MSC_VER)
#pragma warning( push )
#pragma warning( disable : 4996 )
#endif
injectionLibraryPath = NVTX_GETENV(nvtxEnvVarName);
#if defined(_MSC_VER)
#pragma warning( pop )
#endif
#endif
#if defined(__ANDROID__)
if (!injectionLibraryPath)
{
const char *bits = (sizeof(void*) == 4) ? "32" : "64";
char cmdlineBuf[32];
char pkgName[PATH_MAX];
int count;
int pid;
FILE *fp;
size_t bytesRead;
size_t pos;
pid = (int)getpid();
count = snprintf(cmdlineBuf, sizeof(cmdlineBuf), "/proc/%d/cmdline", pid);
if (count <= 0 || count >= (int)sizeof(cmdlineBuf))
{
NVTX_ERR("Path buffer too small for: /proc/%d/cmdline\n", pid);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
fp = fopen(cmdlineBuf, "r");
if (!fp)
{
NVTX_ERR("File couldn't be opened: %s\n", cmdlineBuf);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
bytesRead = fread(pkgName, 1, sizeof(pkgName) - 1, fp);
fclose(fp);
if (bytesRead == 0)
{
NVTX_ERR("Package name couldn't be read from file: %s\n", cmdlineBuf);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
pkgName[bytesRead] = 0;
/* String can contain colon as a process separator. In this case the package name is before the colon. */
pos = 0;
while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0')
{
++pos;
}
pkgName[pos] = 0;
count = snprintf(injectionLibraryPathBuf, NVTX_BUFSIZE, "/data/data/%s/files/libNvtxInjection%s.so", pkgName, bits);
if (count <= 0 || count >= NVTX_BUFSIZE)
{
NVTX_ERR("Path buffer too small for: /data/data/%s/files/libNvtxInjection%s.so\n", pkgName, bits);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
/* On Android, verify path is accessible due to aggressive file access restrictions. */
/* For dlopen, if the filename contains a leading slash, then it is interpreted as a */
/* relative or absolute pathname; otherwise it will follow the rules in ld.so. */
if (injectionLibraryPathBuf[0] == '/')
{
#if (__ANDROID_API__ < 21)
int access_err = access(injectionLibraryPathBuf, F_OK | R_OK);
#else
int access_err = faccessat(AT_FDCWD, injectionLibraryPathBuf, F_OK | R_OK, 0);
#endif
if (access_err != 0)
{
NVTX_ERR("Injection library path wasn't accessible [code=%s] [path=%s]\n", strerror(errno), injectionLibraryPathBuf);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
}
injectionLibraryPath = injectionLibraryPathBuf;
}
#endif
/* At this point, injectionLibraryPath is specified if a dynamic
* injection library was specified by a tool. */
if (injectionLibraryPath)
{
/* Load the injection library */
injectionLibraryHandle = NVTX_DLLOPEN(injectionLibraryPath);
if (!injectionLibraryHandle)
{
NVTX_ERR("Failed to load injection library\n");
return NVTX_ERR_INIT_LOAD_LIBRARY;
}
else
{
/* Attempt to get the injection library's entry-point */
init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName);
if (!init_fnptr)
{
NVTX_DLLCLOSE(injectionLibraryHandle);
NVTX_ERR("Failed to get address of function InitializeInjectionNvtx2 from injection library\n");
return NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT;
}
}
}
}
#endif
#if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY
if (!init_fnptr)
{
/* Check weakly-defined function pointer. A statically-linked injection can define this as
* a normal symbol and it will take precedence over a dynamic injection. */
if (InitializeInjectionNvtx2_fnptr)
{
init_fnptr = InitializeInjectionNvtx2_fnptr;
}
}
#endif
/* At this point, if init_fnptr is not set, then no tool has specified
* an NVTX injection library -- return non-success result so all NVTX
* API functions will be set to no-ops. */
if (!init_fnptr)
{
return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE;
}
/* Invoke injection library's initialization function. If it returns
* 0 (failure) and a dynamic injection was loaded, unload it. */
entryPointStatus = init_fnptr(NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable));
if (entryPointStatus == 0)
{
NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n");
if (injectionLibraryHandle)
{
NVTX_DLLCLOSE(injectionLibraryHandle);
}
return NVTX_ERR_INIT_FAILED_LIBRARY_ENTRY_POINT;
}
return NVTX_SUCCESS;
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void)
{
unsigned int old;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState == NVTX_INIT_STATE_COMPLETE)
{
return;
}
NVTX_ATOMIC_CAS_32(
old,
&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState,
NVTX_INIT_STATE_STARTED,
NVTX_INIT_STATE_FRESH);
if (old == NVTX_INIT_STATE_FRESH)
{
int result;
int forceAllToNoops;
/* Load & initialize injection library -- it will assign the function pointers */
result = NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)();
/* Set all pointers not assigned by the injection to null */
forceAllToNoops = result != NVTX_SUCCESS; /* Set all to null if injection init failed */
NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(forceAllToNoops);
/* Signal that initialization has finished, so now the assigned function pointers will be used */
NVTX_ATOMIC_WRITE_32(
&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState,
NVTX_INIT_STATE_COMPLETE);
}
else /* Spin-wait until initialization has finished */
{
NVTX_MEMBAR();
while (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState != NVTX_INIT_STATE_COMPLETE)
{
NVTX_YIELD();
NVTX_MEMBAR();
}
}
}
+81
Voir le fichier
@@ -0,0 +1,81 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
#endif
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxResourceHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved);
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxSyncUser_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle);
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle);
+573
Voir le fichier
@@ -0,0 +1,573 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_IMPL_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
#endif
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxMarkEx(eventAttrib);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxMarkA(message);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxMarkW(message);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxRangeStartEx(eventAttrib);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxRangeStartA(message);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxRangeStartW(message);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxRangeEnd(id);
}
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxRangePushEx(eventAttrib);
}
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxRangePushA(message);
}
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxRangePushW(message);
}
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxRangePop();
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxNameCategoryA(category, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxNameCategoryW(category, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxNameOsThreadA(threadId, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxNameOsThreadW(threadId, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxDomainMarkEx(domain, eventAttrib);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxDomainRangeStartEx(domain, eventAttrib);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxDomainRangeEnd(domain, id);
}
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxDomainRangePushEx(domain, eventAttrib);
}
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxDomainRangePop(domain);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxResourceHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxDomainResourceCreate(domain, attribs);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxDomainResourceDestroy(resource);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxDomainNameCategoryA(domain, category, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxDomainNameCategoryW(domain, category, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxDomainRegisterStringA(domain, string);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxDomainRegisterStringW(domain, string);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxDomainCreateA(message);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
return nvtxDomainCreateW(message);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxDomainDestroy(domain);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved){
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
nvtxInitialize(reserved);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name){
nvtxNameCuDeviceA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
if (local)
local(device, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name){
nvtxNameCuDeviceW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
if (local)
local(device, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name){
nvtxNameCuContextA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
if (local)
local(context, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name){
nvtxNameCuContextW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
if (local)
local(context, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name){
nvtxNameCuStreamA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
if (local)
local(stream, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name){
nvtxNameCuStreamW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
if (local)
local(stream, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name){
nvtxNameCuEventA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
if (local)
local(event, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name){
nvtxNameCuEventW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
if (local)
local(event, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name){
nvtxNameCudaDeviceA_impl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr;
if (local)
local(device, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name){
nvtxNameCudaDeviceW_impl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr;
if (local)
local(device, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name){
nvtxNameCudaStreamA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
if (local)
local(stream, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name){
nvtxNameCudaStreamW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
if (local)
local(stream, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name){
nvtxNameCudaEventA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
if (local)
local(event, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name){
nvtxNameCudaEventW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
if (local)
local(event, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name){
nvtxNameClDeviceA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr;
if (local)
local(device, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name){
nvtxNameClDeviceW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr;
if (local)
local(device, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name){
nvtxNameClContextA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr;
if (local)
local(context, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name){
nvtxNameClContextW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr;
if (local)
local(context, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name){
nvtxNameClCommandQueueA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr;
if (local)
local(command_queue, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name){
nvtxNameClCommandQueueW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr;
if (local)
local(command_queue, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name){
nvtxNameClMemObjectA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr;
if (local)
local(memobj, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name){
nvtxNameClMemObjectW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr;
if (local)
local(memobj, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name){
nvtxNameClSamplerA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr;
if (local)
local(sampler, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name){
nvtxNameClSamplerW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr;
if (local)
local(sampler, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name){
nvtxNameClProgramA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr;
if (local)
local(program, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name){
nvtxNameClProgramW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr;
if (local)
local(program, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name){
nvtxNameClEventA_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr;
if (local)
local(evnt, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name){
nvtxNameClEventW_fakeimpl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr;
if (local)
local(evnt, name);
}
NVTX_LINKONCE_DEFINE_FUNCTION nvtxSyncUser_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs){
nvtxDomainSyncUserCreate_impl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
if (local) {
return local(domain, attribs);
}
return (nvtxSyncUser_t)0;
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle){
nvtxDomainSyncUserDestroy_impl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
if (local)
local(handle);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle){
nvtxDomainSyncUserAcquireStart_impl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;
if (local)
local(handle);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle){
nvtxDomainSyncUserAcquireFailed_impl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;
if (local)
local(handle);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle){
nvtxDomainSyncUserAcquireSuccess_impl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
if (local)
local(handle);
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle){
nvtxDomainSyncUserReleasing_impl_fntype local;
NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;
if (local)
local(handle);
}
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops);
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops)
{
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr = NULL;
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init) || forceAllToNoops)
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr = NULL;
}

Certains fichiers ne sont pas affichés car ce diff contient trop de modifications Voir plus