@@ -21,7 +21,8 @@ def runTestCommand (platform, project)
|
||||
def command = """#!/usr/bin/env bash
|
||||
set -x
|
||||
cd ${project.paths.project_build_prefix}/build/release/test
|
||||
${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_output=xml --gtest_color=yes
|
||||
${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter="BroadcastCorrectnessSweep*:*float32*" --gtest_output=xml --gtest_color=yes
|
||||
${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTestsMultiProcess --gtest_filter="BroadcastMultiProcessCorrectnessSweep*:*float32*" --gtest_output=xml --gtest_color=yes
|
||||
"""
|
||||
|
||||
platform.runCommand(this, command)
|
||||
|
||||
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env groovy
|
||||
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
|
||||
@Library('rocJenkins@pong') _
|
||||
|
||||
// This file is for internal AMD use.
|
||||
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
|
||||
|
||||
import com.amd.project.*
|
||||
import com.amd.docker.*
|
||||
import java.nio.file.Path
|
||||
|
||||
/**
 * Builds the project documentation on the given platform and publishes the
 * generated HTML as a Jenkins report.
 *
 * @param platform  node wrapper whose runCommand() executes the shell script
 * @param project   project descriptor; its paths object supplies the build prefix
 * @param jobName   name of the calling job (not used by this step)
 * @param debug     accepted for signature compatibility (not used by this step)
 */
def runCompileCommand(platform, project, jobName, boolean debug=false)
{
    project.paths.construct_build_prefix()

    def buildPrefix = project.paths.project_build_prefix

    // Shell script that runs the documentation build with command tracing on.
    def command = """#!/usr/bin/env bash
set -x
${buildPrefix}/docs/run_doc.sh
"""

    // Any exception raised by the command propagates to the caller unchanged.
    platform.runCommand(this, command)

    // Describe the generated HTML report, then hand it to the publisher step.
    def report = [allowMissing: false,
                  alwaysLinkToLastBuild: false,
                  keepAll: false,
                  reportDir: "${buildPrefix}/docs/source/_build/html",
                  reportFiles: "index.html",
                  reportName: "Documentation",
                  reportTitles: "Documentation"]
    publishHTML(report)
}
|
||||
|
||||
// Pipeline body: creates the project and its docker nodes, then calls
// buildProject with only a compile stage (the two following stage
// arguments are passed as null).
def runCI =
{
    nodeDetails, jobName->

    def docsProject = new rocProject('rccl-internal', 'StaticAnalysis')

    // Define test architectures, optional rocm version argument is available
    def buildNodes = new dockerNodes(nodeDetails, jobName, docsProject)

    boolean formatCheck = false
    boolean staticAnalysis = true

    // Compile stage: forwards to runCompileCommand with debug disabled.
    def compileCommand = { platform, project ->
        runCompileCommand(platform, project, jobName, false)
    }

    buildProject(docsProject, formatCheck, buildNodes.dockerArray, compileCommand, null, null, staticAnalysis)
}
|
||||
|
||||
// Entry point: registers the cron trigger and runs the pipeline in a stage
// named after the top-level job.
ci: {
    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)

    // Cron schedule '0 1 * * 6': minute 0, hour 1, day-of-week 6.
    def triggers = [pipelineTriggers([cron('0 1 * * 6')])]
    properties(auxiliary.addCommonProperties(triggers))

    stage(urlJobName) {
        runCI([ubuntu18:['any']], urlJobName)
    }
}
|
||||
@@ -103,8 +103,6 @@ set(CU_SOURCES
|
||||
src/collectives/device/broadcast.cu
|
||||
src/collectives/device/reduce_scatter.cu
|
||||
src/collectives/device/sendrecv.cu
|
||||
src/collectives/device/gather.cu
|
||||
src/collectives/device/scatter.cu
|
||||
src/collectives/device/all_to_all.cu
|
||||
src/collectives/device/all_to_allv.cu
|
||||
src/collectives/device/functions.cu)
|
||||
|
||||
+9
-1
@@ -1,5 +1,5 @@
|
||||
|
||||
Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -30,3 +30,11 @@
|
||||
The U.S. Department of Energy funded the development of this software
|
||||
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
|
||||
|
||||
|
||||
This code also includes files from the NVIDIA Tools Extension SDK project.
|
||||
|
||||
See:
|
||||
|
||||
https://github.com/NVIDIA/NVTX
|
||||
|
||||
for more information and license details.
|
||||
|
||||
+5
-4
@@ -67,7 +67,8 @@ There are unit tests implemented with the Googletest framework in RCCL, which ar
|
||||
To invoke the unit tests, go to the build folder, then the test subfolder, and execute the appropriate unit test executable(s).
|
||||
|
||||
Unit test names are now of the format:
|
||||
[CollectiveCall]CorrectnessSweep/[CollectiveCall]CorrectnessTest.[Type of test]/[ncclRedOp_t]_[datatype]_[number of elements]_[number of devices]_[in place/out of place]_[environment variables]
|
||||
|
||||
[CollectiveCall]CorrectnessSweep/[CollectiveCall]CorrectnessTest.[Type of test]/[ncclRedOp_t]_[datatype]_[number of elements]_[number of devices]_[in place/out of place]_[environment variables]
|
||||
|
||||
This allows filtering of unit tests being run by their parameter values by passing the --gtest_filter command line flag, for example:
|
||||
|
||||
@@ -82,10 +83,10 @@ See the rccl-tests README for more information on how to build and run those tes
|
||||
|
||||
## Library and API Documentation
|
||||
|
||||
Please refer to the [Library documentation](http://rccl.readthedocs.io/) for current documentation.
|
||||
Please refer to the [Library documentation](https://rccl.readthedocs.io/) for current documentation.
|
||||
|
||||
## Copyright
|
||||
|
||||
All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
All source code and accompanying documentation is copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
All modifications are copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
All modifications are copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
+8
-10
@@ -1,17 +1,15 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
if [ -d docBin ]; then
|
||||
rm -rf docBin
|
||||
fi
|
||||
set -eu
|
||||
|
||||
sed -e 's/ROCFFT_EXPORT //g' ../library/include/rocfft.h > rocfft.h
|
||||
doxygen Doxyfile
|
||||
# Make this directory the PWD
|
||||
cd "$(dirname "${BASH_SOURCE[0]}")"
|
||||
|
||||
# Build doxygen info
|
||||
./run_doxygen.sh
|
||||
|
||||
# Build sphinx docs
|
||||
cd source
|
||||
make clean
|
||||
make html
|
||||
cd ..
|
||||
|
||||
rm rocfft.h
|
||||
|
||||
|
||||
@@ -1,13 +1,17 @@
|
||||
#!/bin/bash
|
||||
# # Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# # Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
if [ -d docBin ]; then
|
||||
rm -rf docBin
|
||||
fi
|
||||
set -eu
|
||||
|
||||
rm nccl.h
|
||||
# Make this directory the PWD
|
||||
cd "$(dirname "${BASH_SOURCE[0]}")"
|
||||
|
||||
sed -e 's/ROCFFT_EXPORT //g' ../src/nccl.h.in > nccl.h
|
||||
# Rename our input file
|
||||
cp ../src/nccl.h.in nccl.h
|
||||
|
||||
# Build the doxygen info
|
||||
rm -rf docBin
|
||||
doxygen Doxyfile
|
||||
#rm nccl.h
|
||||
|
||||
# Cleanup
|
||||
rm nccl.h
|
||||
|
||||
@@ -56,8 +56,8 @@ master_doc = 'index'
|
||||
|
||||
# General information about the project.
|
||||
project = u'RCCL'
|
||||
copyright = u'2015-2018, NVIDIA CORPORATION; Modifications Copyright 2019-2020 Advanced Mirco Devices'
|
||||
author = u'Advanced Mirco Devices'
|
||||
copyright = u'2015-2018, NVIDIA CORPORATION; Modifications Copyright 2019-2021 Advanced Micro Devices'
|
||||
author = u'Advanced Micro Devices'
|
||||
|
||||
# The version info for the project you're documenting, acts as replacement for
|
||||
# |version| and |release|, also used in various other places throughout the
|
||||
@@ -156,7 +156,7 @@ latex_elements = {
|
||||
# author, documentclass [howto, manual, or own class]).
|
||||
latex_documents = [
|
||||
(master_doc, 'RCCL.tex', u'RCCL Documentation',
|
||||
u'Advanced Mirco Devices', 'manual'),
|
||||
u'Advanced Micro Devices', 'manual'),
|
||||
]
|
||||
|
||||
|
||||
|
||||
+2
-2
@@ -199,10 +199,10 @@ if ($run_tests); then
|
||||
if (test -f "./test/UnitTests"); then
|
||||
if ($run_tests_all); then
|
||||
./test/UnitTests
|
||||
NCCL_COMM_ID=$HOSTNAME:55512 ./test/UnitTestsMultiProcess
|
||||
./test/UnitTestsMultiProcess
|
||||
else
|
||||
./test/UnitTests --gtest_filter="BroadcastCorrectnessSweep*:*float32*"
|
||||
NCCL_COMM_ID=$HOSTNAME:55512 ./test/UnitTestsMultiProcess --gtest_filter="BroadcastMultiProcessCorrectnessSweep*:*float32*"
|
||||
./test/UnitTestsMultiProcess --gtest_filter="BroadcastMultiProcessCorrectnessSweep*:*float32*"
|
||||
fi
|
||||
else
|
||||
echo "Unit tests have not been built yet; please re-run script with -t to build unit tests."
|
||||
|
||||
@@ -11,6 +11,7 @@ KEEP ?= 0
|
||||
DEBUG ?= 0
|
||||
TRACE ?= 0
|
||||
PROFAPI ?= 0
|
||||
NVTX ?= 1
|
||||
|
||||
NVCC = $(CUDA_HOME)/bin/nvcc
|
||||
|
||||
@@ -87,6 +88,10 @@ ifneq ($(TRACE), 0)
|
||||
CXXFLAGS += -DENABLE_TRACE
|
||||
endif
|
||||
|
||||
ifeq ($(NVTX), 0)
|
||||
CXXFLAGS += -DNVTX_DISABLE
|
||||
endif
|
||||
|
||||
ifneq ($(KEEP), 0)
|
||||
NVCUFLAGS += -keep
|
||||
endif
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 7
|
||||
NCCL_PATCH := 8
|
||||
NCCL_MINOR := 8
|
||||
NCCL_PATCH := 4
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
@@ -9,7 +9,7 @@ Package: libnccl${nccl:Major}
|
||||
Section: libs
|
||||
Architecture: ${pkg:Arch}
|
||||
Depends: ${misc:Depends}, ${shlibs:Depends}
|
||||
Description: NVIDIA Collectives Communication Library (NCCL) Runtime
|
||||
Description: NVIDIA Collective Communication Library (NCCL) Runtime
|
||||
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
|
||||
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
|
||||
broadcast, and reduce-scatter.
|
||||
@@ -21,7 +21,7 @@ Package: libnccl-dev
|
||||
Section: libdevel
|
||||
Architecture: ${pkg:Arch}
|
||||
Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
|
||||
Description: NVIDIA Collectives Communication Library (NCCL) Development Files
|
||||
Description: NVIDIA Collective Communication Library (NCCL) Development Files
|
||||
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
|
||||
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
|
||||
broadcast, and reduce-scatter.
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
Name: libnccl
|
||||
Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}
|
||||
Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}
|
||||
Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
|
||||
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
|
||||
|
||||
Group: Development/Libraries
|
||||
License: BSD
|
||||
@@ -18,13 +18,13 @@ NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
|
||||
sockets.
|
||||
|
||||
%package devel
|
||||
Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
|
||||
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
|
||||
Group: Development/Libraries
|
||||
%description devel
|
||||
NCCL development files
|
||||
|
||||
%package static
|
||||
Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
|
||||
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
|
||||
Group: Development/Libraries
|
||||
%description static
|
||||
NCCL static library
|
||||
|
||||
+272
-207
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -18,144 +19,77 @@
|
||||
#include "clique/Hash.h"
|
||||
// [/RCCL]
|
||||
|
||||
struct bootstrapNetComm {
|
||||
int fd;
|
||||
};
|
||||
|
||||
/* Init functions */
|
||||
static char bootstrapNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
|
||||
static union socketAddress bootstrapNetIfAddrs[MAX_IFS];
|
||||
static int bootstrapNetIfs = -1;
|
||||
static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
|
||||
static union socketAddress bootstrapNetIfAddr;
|
||||
static int bootstrapNetInitDone = 0;
|
||||
pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
ncclResult_t bootstrapNetInit() {
|
||||
if (bootstrapNetIfs == -1) {
|
||||
if (bootstrapNetInitDone == 0) {
|
||||
pthread_mutex_lock(&bootstrapNetLock);
|
||||
if (bootstrapNetIfs == -1) {
|
||||
bootstrapNetIfs = findInterfaces(bootstrapNetIfNames, bootstrapNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
|
||||
if (bootstrapNetIfs <= 0) {
|
||||
WARN("Bootstrap : no socket interface found");
|
||||
return ncclInternalError;
|
||||
} else {
|
||||
char line[1024];
|
||||
char addrline[1024];
|
||||
line[0] = '\0';
|
||||
for (int i=0; i<bootstrapNetIfs; i++) {
|
||||
snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, bootstrapNetIfNames+i*MAX_IF_NAME_SIZE,
|
||||
socketToString(&bootstrapNetIfAddrs[i].sa, addrline));
|
||||
if (bootstrapNetInitDone == 0) {
|
||||
char* env = getenv("NCCL_COMM_ID");
|
||||
if (env) {
|
||||
union socketAddress remoteAddr;
|
||||
if (GetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) {
|
||||
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
if (findInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
|
||||
WARN("NET/Socket : No usable listening interface found");
|
||||
return ncclSystemError;
|
||||
}
|
||||
} else {
|
||||
int nIfs = findInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
|
||||
if (nIfs <= 0) {
|
||||
WARN("Bootstrap : no socket interface found");
|
||||
return ncclInternalError;
|
||||
}
|
||||
line[1023] = '\0';
|
||||
INFO(NCCL_INIT, "Bootstrap : Using%s", line);
|
||||
}
|
||||
char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2];
|
||||
sprintf(line, " %s:", bootstrapNetIfName);
|
||||
socketToString(&bootstrapNetIfAddr.sa, line+strlen(line));
|
||||
INFO(NCCL_INIT, "Bootstrap : Using%s", line);
|
||||
bootstrapNetInitDone = 1;
|
||||
}
|
||||
pthread_mutex_unlock(&bootstrapNetLock);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t bootstrapNetNewComm(struct bootstrapNetComm** comm) {
|
||||
NCCLCHECK(ncclCalloc(comm, 1));
|
||||
(*comm)->fd = -1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr) {
|
||||
if (dev >= bootstrapNetIfs) return ncclInternalError;
|
||||
memcpy(addr, bootstrapNetIfAddrs+dev, sizeof(*addr));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Socket Interface Selection type */
|
||||
enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
|
||||
|
||||
static ncclResult_t bootstrapNetListen(int dev, ncclNetHandle_t* netHandle, void** listenComm) {
|
||||
union socketAddress* connectAddr = (union socketAddress*) netHandle;
|
||||
static_assert(sizeof(union socketAddress) < NCCL_NET_HANDLE_MAXSIZE, "union socketAddress size is too large");
|
||||
// if dev >= 0, listen based on dev
|
||||
if (dev >= 0) {
|
||||
NCCLCHECK(bootstrapNetGetSocketAddr(dev, connectAddr));
|
||||
} else if (dev == findSubnetIf) {
|
||||
// handle stores a remote address
|
||||
// need to find a local addr that is in the same network as the remote addr
|
||||
union socketAddress localAddr;
|
||||
char ifName[MAX_IF_NAME_SIZE];
|
||||
if (findInterfaceMatchSubnet(ifName, &localAddr, connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
|
||||
WARN("NET/Socket : No usable listening interface found");
|
||||
return ncclSystemError;
|
||||
}
|
||||
// pass the local address back
|
||||
memcpy(connectAddr, &localAddr, sizeof(localAddr));
|
||||
} // Otherwise, handle stores a local address
|
||||
struct bootstrapNetComm* comm;
|
||||
NCCLCHECK(bootstrapNetNewComm(&comm));
|
||||
NCCLCHECK(createListenSocket(&comm->fd, connectAddr));
|
||||
*listenComm = comm;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t bootstrapNetConnect(int dev, ncclNetHandle_t* netHandle, void** sendComm) {
|
||||
union socketAddress* connectAddr = (union socketAddress*) netHandle;
|
||||
struct bootstrapNetComm* comm;
|
||||
NCCLCHECK(bootstrapNetNewComm(&comm));
|
||||
NCCLCHECK(connectAddress(&comm->fd, connectAddr));
|
||||
*sendComm = comm;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) {
|
||||
struct bootstrapNetComm* lComm = (struct bootstrapNetComm*)listenComm;
|
||||
struct bootstrapNetComm* rComm;
|
||||
NCCLCHECK(bootstrapNetNewComm(&rComm));
|
||||
static ncclResult_t bootstrapNetAccept(int listenFd, int* recvFd) {
|
||||
struct sockaddr_in sockaddr;
|
||||
socklen_t socklen = sizeof(struct sockaddr_in);
|
||||
SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
|
||||
*recvComm = rComm;
|
||||
SYSCHECKVAL(accept(listenFd, (struct sockaddr*)&sockaddr, &socklen), "accept", *recvFd);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t bootstrapNetClose(void* opaqueComm) {
|
||||
struct bootstrapNetComm* comm = (struct bootstrapNetComm*)opaqueComm;
|
||||
if (comm) {
|
||||
close(comm->fd);
|
||||
free(comm);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(bootstrapNetClose(sendComm)); return ncclSuccess; }
|
||||
static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(bootstrapNetClose(recvComm)); return ncclSuccess; }
|
||||
static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(bootstrapNetClose(listenComm)); return ncclSuccess; }
|
||||
|
||||
// Additional sync functions
|
||||
static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) {
|
||||
struct bootstrapNetComm* comm = (struct bootstrapNetComm*)sendComm;
|
||||
NCCLCHECK(socketSend(comm->fd, &size, sizeof(int)));
|
||||
NCCLCHECK(socketSend(comm->fd, data, size));
|
||||
static ncclResult_t bootstrapNetSend(int fd, void* data, int size) {
|
||||
NCCLCHECK(socketSend(fd, &size, sizeof(int)));
|
||||
NCCLCHECK(socketSend(fd, data, size));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
|
||||
struct bootstrapNetComm* comm = (struct bootstrapNetComm*)recvComm;
|
||||
static ncclResult_t bootstrapNetRecv(int fd, void* data, int size) {
|
||||
int recvSize;
|
||||
NCCLCHECK(socketReceive(comm->fd, &recvSize, sizeof(int)));
|
||||
NCCLCHECK(socketRecv(fd, &recvSize, sizeof(int)));
|
||||
if (recvSize > size) {
|
||||
WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size);
|
||||
WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
|
||||
return ncclInternalError;
|
||||
}
|
||||
NCCLCHECK(socketReceive(comm->fd, data, std::min(recvSize, size)));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapNetCreateHandle(ncclNetHandle_t* netHandle, const char* str) {
|
||||
union socketAddress* connectAddr = (union socketAddress*) netHandle;
|
||||
NCCLCHECK(GetSocketAddrFromString(connectAddr, str));
|
||||
NCCLCHECK(socketRecv(fd, data, std::min(recvSize, size)));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct extInfo {
|
||||
int rank;
|
||||
int nranks;
|
||||
ncclNetHandle_t extHandleListenRoot;
|
||||
ncclNetHandle_t extHandleListen;
|
||||
union socketAddress extAddressListenRoot;
|
||||
union socketAddress extAddressListen;
|
||||
};
|
||||
|
||||
#include <sys/resource.h>
|
||||
@@ -169,33 +103,37 @@ static ncclResult_t setFilesLimit() {
|
||||
}
|
||||
|
||||
static void *bootstrapRoot(void* bootstrapRootStruct) { // [RCCL] Modified to include hash argument)
|
||||
|
||||
// [RCCL] Unpack bootstrapRootStruct
|
||||
struct bootstrapRootStruct* rootStruct = (struct bootstrapRootStruct*) bootstrapRootStruct;
|
||||
void* listenComm = rootStruct->listenComm;
|
||||
unsigned long hash = rootStruct->hash;
|
||||
struct bootstrapRootStruct rootStruct = *(struct bootstrapRootStruct*)bootstrapRootStruct;
|
||||
int listenFd = rootStruct.listenFd;
|
||||
unsigned long hash = rootStruct.hash;
|
||||
int pid = getpid(); // sharing PID to other ranks for creating shared memory files for CliqueManager
|
||||
free(bootstrapRootStruct);
|
||||
// [/RCCL]
|
||||
|
||||
ncclResult_t res = ncclSuccess;
|
||||
int nranks = 0, c = 0;
|
||||
|
||||
struct extInfo info;
|
||||
ncclNetHandle_t *rankHandles = NULL;
|
||||
ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
|
||||
ncclNetHandle_t zero = { 0 }; // for sanity checking
|
||||
void* tmpComm;
|
||||
ncclResult_t res;
|
||||
union socketAddress *rankAddresses = NULL;
|
||||
union socketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange
|
||||
union socketAddress *zero = NULL;
|
||||
NCCLCHECKGOTO(ncclCalloc(&zero, 1), res, out);
|
||||
setFilesLimit();
|
||||
|
||||
TRACE(NCCL_INIT, "BEGIN");
|
||||
/* Receive addresses from all ranks */
|
||||
int nranks = 0, c = 0;
|
||||
do {
|
||||
NCCLCHECKGOTO(bootstrapNetAccept(listenComm, &tmpComm), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);
|
||||
int tmpFd;
|
||||
NCCLCHECKGOTO(bootstrapNetAccept(listenFd, &tmpFd), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetRecv(tmpFd, &info, sizeof(info)), res, out);
|
||||
close(tmpFd);
|
||||
|
||||
if (c == 0) {
|
||||
nranks = info.nranks;
|
||||
NCCLCHECKGOTO(ncclCalloc(&rankHandles, nranks), res, out);
|
||||
NCCLCHECKGOTO(ncclCalloc(&rankHandlesRoot, nranks), res, out);
|
||||
NCCLCHECKGOTO(ncclCalloc(&rankAddresses, nranks), res, out);
|
||||
NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nranks), res, out);
|
||||
}
|
||||
|
||||
if (nranks != info.nranks) {
|
||||
@@ -203,14 +141,14 @@ static void *bootstrapRoot(void* bootstrapRootStruct) { // [RCCL] Modified to in
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (memcmp(&zero, &rankHandlesRoot[info.rank], sizeof(ncclNetHandle_t)) != 0) {
|
||||
if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union socketAddress)) != 0) {
|
||||
WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
|
||||
goto out;
|
||||
}
|
||||
|
||||
// Save the connection handle for that rank
|
||||
memcpy(rankHandlesRoot+info.rank, info.extHandleListenRoot, sizeof(ncclNetHandle_t));
|
||||
memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));
|
||||
memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union socketAddress));
|
||||
memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union socketAddress));
|
||||
|
||||
++c;
|
||||
TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks);
|
||||
@@ -224,35 +162,37 @@ static void *bootstrapRoot(void* bootstrapRootStruct) { // [RCCL] Modified to in
|
||||
// Send the connect handle for the next rank in the AllGather ring
|
||||
for (int r=0; r<nranks; ++r) {
|
||||
int next = (r+1) % nranks;
|
||||
void *tmpSendComm;
|
||||
NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot+r, &tmpSendComm), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
|
||||
|
||||
int tmpSendFd;
|
||||
NCCLCHECKGOTO(connectAddress(&tmpSendFd, rankAddressesRoot+r), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetSend(tmpSendFd, rankAddresses+next, sizeof(union socketAddress)), res, out);
|
||||
{ // [RCCL] Send the root pid for shared file naming
|
||||
NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, &pid, sizeof(int)), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetSend(tmpSendFd, &pid, sizeof(int)), res, out);
|
||||
} // [/RCCL]
|
||||
NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
|
||||
close(tmpSendFd);
|
||||
}
|
||||
TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
|
||||
|
||||
out:
|
||||
bootstrapNetCloseListen(listenComm);
|
||||
if (rankHandles) free(rankHandles);
|
||||
if (rankHandlesRoot) free(rankHandlesRoot);
|
||||
close(listenFd);
|
||||
if (rankAddresses) free(rankAddresses);
|
||||
if (rankAddressesRoot) free(rankAddressesRoot);
|
||||
if (zero) free(zero);
|
||||
|
||||
TRACE(NCCL_INIT, "DONE");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
|
||||
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
|
||||
void* listenComm;
|
||||
NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, netHandle, &listenComm));
|
||||
union socketAddress* connectAddr = (union socketAddress*) id;
|
||||
int listenFd;
|
||||
NCCLCHECK(createListenSocket(&listenFd, connectAddr));
|
||||
pthread_t thread;
|
||||
|
||||
// [RCCL] Use the ncclUniqueId to get a hash for bootstrap
|
||||
struct bootstrapRootStruct* rootStruct = new bootstrapRootStruct;
|
||||
struct bootstrapRootStruct* rootStruct = new struct bootstrapRootStruct;
|
||||
rootStruct->hash = djb2Hash(id->internal);
|
||||
rootStruct->listenComm = listenComm;
|
||||
rootStruct->listenFd = listenFd;
|
||||
pthread_create(&thread, NULL, bootstrapRoot, (void *)rootStruct);
|
||||
// [/RCCL]
|
||||
|
||||
@@ -260,18 +200,19 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
|
||||
static_assert(sizeof(ncclNetHandle_t) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
|
||||
static_assert(sizeof(union socketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
|
||||
memset(id, 0, sizeof(ncclUniqueId));
|
||||
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
|
||||
union socketAddress* connectAddr = (union socketAddress*) id;
|
||||
|
||||
char* env = getenv("NCCL_COMM_ID");
|
||||
if (env) {
|
||||
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
|
||||
if (bootstrapNetCreateHandle(netHandle, env) != 0) {
|
||||
if (GetSocketAddrFromString(connectAddr, env) != ncclSuccess) {
|
||||
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
} else {
|
||||
memcpy(id, &bootstrapNetIfAddr, sizeof(union socketAddress));
|
||||
NCCLCHECK(bootstrapCreateRoot(id, false));
|
||||
}
|
||||
|
||||
@@ -280,25 +221,135 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
|
||||
|
||||
struct unexConn {
|
||||
int peer;
|
||||
void* comm;
|
||||
int fd;
|
||||
struct unexConn* next;
|
||||
};
|
||||
|
||||
struct extState {
|
||||
void* extBstrapListenComm;
|
||||
void* extBstrapRingRecvComm;
|
||||
void* extBstrapRingSendComm;
|
||||
ncclNetHandle_t* peerBstrapHandles;
|
||||
struct unexConn* unexpectedConnections;
|
||||
int rank;
|
||||
int nranks;
|
||||
int dev;
|
||||
int rootPid; // [RCCL] PID of root
|
||||
// Remote allocator state
|
||||
struct remAllocState {
|
||||
int cudaDev;
|
||||
int listenFd;
|
||||
int stop;
|
||||
};
|
||||
|
||||
struct extState {
|
||||
int extListenFd;
|
||||
int extRingRecvFd;
|
||||
int extRingSendFd;
|
||||
union socketAddress* peerCommAddresses;
|
||||
union socketAddress* peerAllocAddresses;
|
||||
struct unexConn* unexpectedConnections;
|
||||
int cudaDev;
|
||||
int rank;
|
||||
int nranks;
|
||||
|
||||
// Intermediate memory allocation service
|
||||
struct remAllocState* allocState;
|
||||
pthread_t allocThread;
|
||||
};
|
||||
|
||||
#define MAX_SEGMENTS 128
|
||||
|
||||
// Serve one remote allocation request arriving on socket `fd`.
//
// Protocol, as implemented below:
//   1. receive the requested size (size_t) from the peer,
//   2. allocate that many bytes of device memory into *ptr,
//   3. send back the IPC handle of the allocation,
//   4. send back the raw device pointer value.
//
// On success *ptr holds the allocation and ncclSuccess is returned.
// On any failure after the allocation succeeded, the memory is released and
// *ptr is reset to NULL so the caller never keeps a dangling or orphaned
// pointer in its segment table.
static ncclResult_t remoteAlloc(void** ptr, int fd) {
  ncclResult_t ret = ncclSuccess;
  size_t size;
  hipIpcMemHandle_t devIpc;
  hipError_t res;

  NCCLCHECK(socketRecv(fd, &size, sizeof(size_t)));
  NCCLCHECK(ncclCudaCalloc((char**)ptr, size, true));

  res = hipIpcGetMemHandle(&devIpc, *ptr);
  if (res != hipSuccess) {
    WARN("[Rem Allocator] hipIpcGetMemHandle failed : %s", hipGetErrorString(res));
    hipFree(*ptr);
    *ptr = NULL;  // do not leave a freed pointer behind for the caller
    CUDACHECK(res);
  }

  // The IPC handle
  NCCLCHECKGOTO(socketSend(fd, &devIpc, sizeof(hipIpcMemHandle_t)), ret, fail);
  // And the direct pointer
  NCCLCHECKGOTO(socketSend(fd, ptr, sizeof(void*)), ret, fail);
  return ncclSuccess;

fail:
  // The peer never received a complete reply: release the buffer now instead
  // of leaving it allocated with no connection tracking it.
  hipFree(*ptr);
  *ptr = NULL;
  return ret;
}
|
||||
|
||||
#include <poll.h>
|
||||
|
||||
// Service thread to allocate memory for other GPUs, used as intermediate step.
|
||||
void* ncclRemoteMemAllocationService(void* args) {
|
||||
struct remAllocState* state = (struct remAllocState *) args;
|
||||
if (hipSetDevice(state->cudaDev) != hipSuccess) {
|
||||
WARN("[Rem Allocator] Failed to set CUDA device %d", state->cudaDev);
|
||||
}
|
||||
|
||||
// Prepare poll descriptor
|
||||
void* segments[MAX_SEGMENTS];
|
||||
struct pollfd pollfds[MAX_SEGMENTS+1];
|
||||
for (int s=0; s<MAX_SEGMENTS; s++) segments[s] = NULL;
|
||||
for (int s=0; s<MAX_SEGMENTS; s++) {
|
||||
pollfds[s].fd = -1;
|
||||
pollfds[s].events = POLLHUP;
|
||||
}
|
||||
pollfds[MAX_SEGMENTS].fd = state->listenFd;
|
||||
pollfds[MAX_SEGMENTS].events = POLLIN;
|
||||
|
||||
int nbuffers = 0;
|
||||
while (state->stop == 0 || (state->stop == 1 && nbuffers > 0)) {
|
||||
if (int error = poll(pollfds, MAX_SEGMENTS+1, 100/*ms*/) < 0) {
|
||||
WARN("[Rem Allocator] Poll failed with error %d", error);
|
||||
return NULL;
|
||||
}
|
||||
if (pollfds[MAX_SEGMENTS].revents) {
|
||||
int s = 0;
|
||||
while (segments[s] != NULL && s < MAX_SEGMENTS) s++;
|
||||
if (bootstrapNetAccept(pollfds[MAX_SEGMENTS].fd, &pollfds[s].fd) != ncclSuccess) {
|
||||
pollfds[s].fd = -1;
|
||||
} else {
|
||||
if (s == MAX_SEGMENTS || (remoteAlloc(segments+s, pollfds[s].fd) != ncclSuccess)) {
|
||||
WARN("[Rem Allocator] Allocation failed (segment %d, fd %d)", s, pollfds[s].fd);
|
||||
close(pollfds[s].fd);
|
||||
pollfds[s].fd = -1;
|
||||
} else {
|
||||
nbuffers++;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int s=0; s<MAX_SEGMENTS; s++) {
|
||||
if (pollfds[s].revents & POLLHUP) {
|
||||
if (hipFree(segments[s]) != hipSuccess) {
|
||||
WARN("[Rem Allocator] hipFree %p failed", segments[s]);
|
||||
}
|
||||
segments[s] = NULL;
|
||||
close(pollfds[s].fd);
|
||||
pollfds[s].fd = -1;
|
||||
nbuffers--;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int s=0; s<MAX_SEGMENTS; s++) {
|
||||
if (segments[s]) hipFree(segments[s]);
|
||||
close(pollfds[s].fd);
|
||||
}
|
||||
close(state->listenFd);
|
||||
free(state);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Ask the allocation service of `rank` for `size` bytes of device memory.
//
// Connects to state->peerAllocAddresses[rank], sends the size, then receives
// the IPC handle into *ipc and the remote pointer value into *ptr.
//
// On success *id is set to the connected socket. The socket is left open for
// the caller to close later.
// On failure *id stays -1, the socket (if it was opened) is closed here, and
// the error code of the failing step is returned.
ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr) {
  struct extState* state = (struct extState*)commState;
  int fd;
  ncclResult_t res;
  *id = -1;
  NCCLCHECK(connectAddress(&fd, state->peerAllocAddresses+rank));
  NCCLCHECKGOTO(socketSend(fd, &size, sizeof(size_t)), res, fail);
  NCCLCHECKGOTO(socketRecv(fd, ipc, sizeof(hipIpcMemHandle_t)), res, fail);
  NCCLCHECKGOTO(socketRecv(fd, ptr, sizeof(void*)), res, fail);
  *id = fd;
  return ncclSuccess;

fail:
  // The exchange did not complete: do not leak the connected socket.
  close(fd);
  return res;
}
|
||||
|
||||
// Close the descriptor `id`. `rank` and `commState` are accepted but not
// used by this function.
ncclResult_t bootstrapRemFree(int id, int rank, void* commState) {
  (void)rank;
  (void)commState;
  SYSCHECK(close(id), "close");
  return ncclSuccess;
}
|
||||
|
||||
ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState, int* rootPid) { // [RCCL] Adding rootPid
|
||||
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
|
||||
bool idFromEnv = getenv("NCCL_COMM_ID") != NULL;
|
||||
struct extState* state;
|
||||
NCCLCHECK(ncclCalloc(&state, 1));
|
||||
state->rank = rank;
|
||||
@@ -310,19 +361,15 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
|
||||
struct extInfo info = { 0 };
|
||||
info.rank = rank;
|
||||
info.nranks = nranks;
|
||||
void *tmpSendComm, *tmpRecvComm;
|
||||
// Pass the remote address to listen via info
|
||||
if (idFromEnv) {
|
||||
memcpy(&info.extHandleListen, netHandle, sizeof(ncclNetHandle_t));
|
||||
memcpy(&info.extHandleListenRoot, netHandle, sizeof(ncclNetHandle_t));
|
||||
}
|
||||
// listen will return the local address via info (specify interface type 'findSubnetIf')
|
||||
state->dev = idFromEnv ? findSubnetIf : 0;
|
||||
void* extBstrapListenCommRoot;
|
||||
NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListen, &state->extBstrapListenComm));
|
||||
NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListenRoot, &extBstrapListenCommRoot));
|
||||
int tmpSendFd, tmpRecvFd;
|
||||
|
||||
// stagger connection times to avoid an overload of the root at very high rank counts
|
||||
int extListenFdRoot;
|
||||
memcpy(&info.extAddressListen, &bootstrapNetIfAddr, sizeof(union socketAddress));
|
||||
memcpy(&info.extAddressListenRoot, &bootstrapNetIfAddr, sizeof(union socketAddress));
|
||||
NCCLCHECK(createListenSocket(&state->extListenFd, &info.extAddressListen));
|
||||
NCCLCHECK(createListenSocket(&extListenFdRoot, &info.extAddressListenRoot));
|
||||
|
||||
// stagger connection times to avoid an overload of the root
|
||||
if (nranks > 128) {
|
||||
long msec = rank;
|
||||
struct timespec tv;
|
||||
@@ -333,28 +380,38 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
|
||||
}
|
||||
|
||||
// send info on my listening socket to root
|
||||
NCCLCHECK(bootstrapNetConnect(state->dev, netHandle, &tmpSendComm));
|
||||
NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
|
||||
NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
|
||||
union socketAddress* rootAddr = (union socketAddress*)id;
|
||||
NCCLCHECK(connectAddress(&tmpSendFd, rootAddr));
|
||||
NCCLCHECK(bootstrapNetSend(tmpSendFd, &info, sizeof(info)));
|
||||
close(tmpSendFd);
|
||||
|
||||
// get info on my "next" rank in the bootstrap ring from root
|
||||
ncclNetHandle_t extHandleNext;
|
||||
NCCLCHECK(bootstrapNetAccept(extBstrapListenCommRoot, &tmpRecvComm));
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
|
||||
union socketAddress extAddressNext;
|
||||
NCCLCHECK(bootstrapNetAccept(extListenFdRoot, &tmpRecvFd));
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &extAddressNext, sizeof(extAddressNext)));
|
||||
{ // [RCCL] Receive PID from root
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, rootPid, sizeof(int)));
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, rootPid, sizeof(int)));
|
||||
} // [/RCCL]
|
||||
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
|
||||
NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));
|
||||
close(tmpRecvFd);
|
||||
close(extListenFdRoot);
|
||||
|
||||
NCCLCHECK(bootstrapNetConnect(state->dev, &extHandleNext, &state->extBstrapRingSendComm));
|
||||
NCCLCHECK(connectAddress(&state->extRingSendFd, &extAddressNext));
|
||||
// Accept the connect request from the previous rank in the AllGather ring
|
||||
NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));
|
||||
NCCLCHECK(bootstrapNetAccept(state->extListenFd, &state->extRingRecvFd));
|
||||
|
||||
// AllGather all listen handlers
|
||||
NCCLCHECK(ncclCalloc(&state->peerBstrapHandles, nranks));
|
||||
memcpy(state->peerBstrapHandles+rank, info.extHandleListen, sizeof(ncclNetHandle_t));
|
||||
NCCLCHECK(bootstrapAllGather(state, state->peerBstrapHandles, sizeof(ncclNetHandle_t)));
|
||||
NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks));
|
||||
memcpy(state->peerCommAddresses+rank, &info.extAddressListen, sizeof(union socketAddress));
|
||||
NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union socketAddress)));
|
||||
|
||||
// Create the memory allocation service
|
||||
NCCLCHECK(ncclCalloc(&state->peerAllocAddresses, nranks));
|
||||
memcpy(state->peerAllocAddresses+rank, &bootstrapNetIfAddr, sizeof(union socketAddress));
|
||||
NCCLCHECK(ncclCalloc(&state->allocState, 1));
|
||||
CUDACHECK(hipGetDevice(&state->allocState->cudaDev));
|
||||
NCCLCHECK(createListenSocket(&state->allocState->listenFd, state->peerAllocAddresses+rank));
|
||||
pthread_create(&state->allocThread, NULL, ncclRemoteMemAllocationService, state->allocState);
|
||||
NCCLCHECK(bootstrapAllGather(state, state->peerAllocAddresses, sizeof(union socketAddress)));
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
|
||||
|
||||
@@ -378,9 +435,9 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
|
||||
size_t sslice = (rank - i + nranks) % nranks;
|
||||
|
||||
// Send slice to the right
|
||||
NCCLCHECK(bootstrapNetSend(state->extBstrapRingSendComm, data+sslice*size, size));
|
||||
NCCLCHECK(bootstrapNetSend(state->extRingSendFd, data+sslice*size, size));
|
||||
// Recv slice from the left
|
||||
NCCLCHECK(bootstrapNetRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
|
||||
NCCLCHECK(bootstrapNetRecv(state->extRingRecvFd, data+rslice*size, size));
|
||||
}
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
|
||||
@@ -389,20 +446,20 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
|
||||
|
||||
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
|
||||
struct extState* state = (struct extState*)commState;
|
||||
void* tmpSendComm;
|
||||
NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles+peer, &tmpSendComm));
|
||||
NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)));
|
||||
NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size));
|
||||
NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
|
||||
int tmpSendFd;
|
||||
NCCLCHECK(connectAddress(&tmpSendFd, state->peerCommAddresses+peer));
|
||||
NCCLCHECK(bootstrapNetSend(tmpSendFd, &state->rank, sizeof(int)));
|
||||
NCCLCHECK(bootstrapNetSend(tmpSendFd, data, size));
|
||||
close(tmpSendFd);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
|
||||
ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int fd) {
|
||||
// New unex
|
||||
struct unexConn* unex;
|
||||
NCCLCHECK(ncclCalloc(&unex, 1));
|
||||
unex->peer = peer;
|
||||
unex->comm = comm;
|
||||
unex->fd = fd;
|
||||
|
||||
// Enqueue
|
||||
struct unexConn* list = state->unexpectedConnections;
|
||||
@@ -415,7 +472,7 @@ ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
void* unexpectedDequeue(struct extState* state, int peer) {
|
||||
int unexpectedDequeue(struct extState* state, int peer) {
|
||||
struct unexConn* elem = state->unexpectedConnections;
|
||||
struct unexConn* prev = NULL;
|
||||
while (elem) {
|
||||
@@ -425,55 +482,61 @@ void* unexpectedDequeue(struct extState* state, int peer) {
|
||||
} else {
|
||||
prev->next = elem->next;
|
||||
}
|
||||
void* comm = elem->comm;
|
||||
int fd = elem->fd;
|
||||
free(elem);
|
||||
return comm;
|
||||
return fd;
|
||||
}
|
||||
prev = elem;
|
||||
elem = elem->next;
|
||||
}
|
||||
return NULL;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// We can't know who we'll receive from, so we need to receive everything at once
|
||||
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
|
||||
struct extState* state = (struct extState*)commState;
|
||||
|
||||
void* tmpRecvComm;
|
||||
int tmpRecvFd;
|
||||
|
||||
// Search unexpected connections first
|
||||
if ((tmpRecvComm = unexpectedDequeue(state, peer)) != NULL) {
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
|
||||
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
|
||||
if ((tmpRecvFd = unexpectedDequeue(state, peer)) != -1) {
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, ((char*)data), size));
|
||||
close(tmpRecvFd);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Then look for new connections
|
||||
while (1) {
|
||||
NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &tmpRecvComm));
|
||||
NCCLCHECK(bootstrapNetAccept(state->extListenFd, &tmpRecvFd));
|
||||
int newPeer;
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &newPeer, sizeof(int)));
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &newPeer, sizeof(int)));
|
||||
if (newPeer == peer) {
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
|
||||
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvFd, ((char*)data), size));
|
||||
close(tmpRecvFd);
|
||||
return ncclSuccess;
|
||||
}
|
||||
// Unexpected connection. Save for later.
|
||||
NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvComm));
|
||||
NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvFd));
|
||||
}
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapClose(void* commState) {
|
||||
struct extState* state = (struct extState*)commState;
|
||||
if (state->unexpectedConnections != NULL) {
|
||||
WARN("Unexpected connections are not empty.\n");
|
||||
WARN("Unexpected connections are not empty");
|
||||
return ncclInternalError;
|
||||
}
|
||||
NCCLCHECK(bootstrapNetCloseListen(state->extBstrapListenComm));
|
||||
NCCLCHECK(bootstrapNetCloseSend(state->extBstrapRingSendComm));
|
||||
NCCLCHECK(bootstrapNetCloseRecv(state->extBstrapRingRecvComm));
|
||||
close(state->extListenFd);
|
||||
close(state->extRingSendFd);
|
||||
close(state->extRingRecvFd);
|
||||
|
||||
free(state->peerBstrapHandles);
|
||||
state->allocState->stop = 1;
|
||||
|
||||
// Join the allocThread so we catch resource leaks as being hung here
|
||||
// pthread_join(state->allocThread, nullptr);
|
||||
|
||||
free(state->peerCommAddresses);
|
||||
free(state->peerAllocAddresses);
|
||||
free(state);
|
||||
|
||||
return ncclSuccess;
|
||||
@@ -481,10 +544,12 @@ ncclResult_t bootstrapClose(void* commState) {
|
||||
|
||||
ncclResult_t bootstrapAbort(void* commState) {
|
||||
struct extState* state = (struct extState*)commState;
|
||||
bootstrapNetCloseListen(state->extBstrapListenComm);
|
||||
bootstrapNetCloseSend(state->extBstrapRingSendComm);
|
||||
bootstrapNetCloseRecv(state->extBstrapRingRecvComm);
|
||||
free(state->peerBstrapHandles);
|
||||
close(state->extListenFd);
|
||||
close(state->extRingSendFd);
|
||||
close(state->extRingRecvFd);
|
||||
state->allocState->stop = 2;
|
||||
free(state->peerCommAddresses);
|
||||
free(state->peerAllocAddresses);
|
||||
free(state);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -26,16 +26,16 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
|
||||
}
|
||||
|
||||
// Per-channel operation list.
|
||||
NCCLCHECK(ncclCudaHostCalloc(&channel->collectives, NCCL_MAX_OPS));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&channel->collectivesExtra, comm->nRanks*NCCL_MAX_OPS*4));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&channel->a2avParams, comm->nRanks*NCCL_MAX_OPS*4));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
|
||||
if (channel->id == -1) return ncclSuccess;
|
||||
// Operation list
|
||||
NCCLCHECK(ncclCudaHostFree(channel->collectivesExtra));
|
||||
NCCLCHECK(ncclCudaHostFree(channel->collectives));
|
||||
NCCLCHECK(ncclCudaHostFree(channel->a2avParams));
|
||||
NCCLCHECK(ncclCudaHostFree(channel->workFifo));
|
||||
|
||||
// Free Ring index to rank tables
|
||||
free(channel->ring.userRanks);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -29,14 +29,13 @@ THE SOFTWARE.
|
||||
#include "common_kernel.h"
|
||||
|
||||
template <class FUNC, typename T, int NUM_RANKS>
|
||||
__device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args)
|
||||
__device__ void AllReduceCliqueSplitKernel(struct ncclWorkElem* args)
|
||||
{
|
||||
// Clique-specific kernel arguments
|
||||
cliqueDevicePtrs_t* cliquePtrs = args->clique.ptrs; // Collection of all input/output pointers across ranks in clique
|
||||
size_t const N = args->clique.count; // Total number of elements to reduce
|
||||
int const nBlocks = args->clique.nChannels; // Total number of blocks assigned to this kernel (may be different than gridDim.x)
|
||||
int const blockId = args->clique.bid; // 0-indexed blockIdx for this threadblock (may be different than blockIdx.x)
|
||||
int const verbose = args->clique.verbose; // For debug purposes
|
||||
int const rank = args->comm->rank; // Current rank
|
||||
|
||||
// Each threadblock works independently of others on a subsection of the input
|
||||
@@ -47,10 +46,6 @@ __device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args)
|
||||
size_t const currBlockStop = min(currBlockStart + perBlockN, N);
|
||||
size_t const blockN = currBlockStop - currBlockStart;
|
||||
|
||||
if (verbose && threadIdx.x == 0)
|
||||
{
|
||||
printf("Rank %d block %d of %d %lu -> %lu [%lu]\n", rank, blockId, nBlocks, currBlockStart, currBlockStop, blockN);
|
||||
}
|
||||
if (blockN > 0)
|
||||
{
|
||||
// Prepare input / output subarrays
|
||||
@@ -74,12 +69,7 @@ __device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args)
|
||||
|
||||
// Even if there was nothing for this GPU to do, it must participate in a barrier
|
||||
// because other GPUs may still be modifying this GPU's output buffer
|
||||
if (blockId == 0)
|
||||
{
|
||||
if (verbose && threadIdx.x == 0) printf("Rank %d enters GPU barrier\n", rank);
|
||||
WaitForBarrier<NUM_RANKS>(cliquePtrs->barrier, rank, verbose);
|
||||
if (verbose && threadIdx.x == 0) printf("Rank %d exits GPU barrier\n", rank);
|
||||
}
|
||||
if (blockId == 0) WaitForBarrier<NUM_RANKS>(cliquePtrs->barrier);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -62,7 +62,7 @@ typedef struct
|
||||
|
||||
// Multi-GPU (on same node) barrier. One thread per grid per GPU updates barrier / waits
|
||||
template <int NUM_RANKS>
|
||||
__forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier, int const rank, int const verbose)
|
||||
__forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier)
|
||||
{
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
@@ -71,7 +71,6 @@ __forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier, int
|
||||
int localSense = *barrier.localSense;
|
||||
|
||||
int val = __atomic_add_fetch(barrier.globalCount, 1, __ATOMIC_SEQ_CST);
|
||||
if (verbose) printf("Rank %d arrived at GPU barrier %d\n", rank, val);
|
||||
if (val == NUM_RANKS)
|
||||
{
|
||||
// Last arrival resets barrier
|
||||
@@ -81,18 +80,7 @@ __forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier, int
|
||||
else
|
||||
{
|
||||
// Wait for all ranks to reach barrier
|
||||
int counter = 0;
|
||||
while (__atomic_load_n(barrier.globalSense, __ATOMIC_SEQ_CST) != localSense)
|
||||
{
|
||||
if (verbose)
|
||||
{
|
||||
counter++;
|
||||
if (counter == 100000000)
|
||||
{
|
||||
printf("Rank %d waiting on GPU barrier: (%d != %d)", rank, *barrier.globalSense, localSense);
|
||||
}
|
||||
}
|
||||
}
|
||||
while (__atomic_load_n(barrier.globalSense, __ATOMIC_SEQ_CST) != localSense);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -49,7 +49,6 @@ int* CliqueManager::m_staticGpuBarrierMem = NULL;
|
||||
RCCL_PARAM(EnableClique, "ENABLE_CLIQUE", 0); // Opt-in environment variable for clique-based kernels
|
||||
RCCL_PARAM(AllReduceCliqueByteLimit, "CLIQUE_ALLREDUCE_BYTE_LIMIT", 16777216); // Max number of bytes to use clique-based kernels for all reduce
|
||||
RCCL_PARAM(AllReduceNumChannels, "CLIQUE_ALLREDUCE_NCHANNELS", 0); // Number of channels to use for all-reduce. (0 for auto-select)
|
||||
RCCL_PARAM(CliqueDebug, "CLIQUE_DEBUG", 0); // Emit debug messages
|
||||
|
||||
CliqueManager::CliqueManager(int const rank,
|
||||
int const numRanks,
|
||||
@@ -83,11 +82,7 @@ void CliqueManager::CleanUp()
|
||||
{
|
||||
// Release caches
|
||||
if (m_ipcHandleSendCache) delete m_ipcHandleSendCache;
|
||||
if (m_ipcHandleRecvCache)
|
||||
{
|
||||
m_ipcHandleRecvCache->close();
|
||||
delete m_ipcHandleRecvCache;
|
||||
}
|
||||
if (m_ipcHandleRecvCache) delete m_ipcHandleRecvCache;
|
||||
|
||||
// Close shared memory
|
||||
m_shmHandles.Close();
|
||||
@@ -220,7 +215,6 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
|
||||
if (m_rank == 0)
|
||||
{
|
||||
NCCLCHECKGOTO(ncclCudaCalloc(&m_staticGpuBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), true), res, dropback);
|
||||
|
||||
// Prepare all barriers
|
||||
for (int opIndex = 0; opIndex < NCCL_MAX_OPS; opIndex++)
|
||||
{
|
||||
@@ -253,7 +247,7 @@ bool CliqueManager::IsSupported(ncclFunc_t const coll,
|
||||
|
||||
// Filter based on total input size for each collective type
|
||||
size_t totalBytes = count * ncclTypeSize(datatype);
|
||||
if (coll == ncclCollAllReduce && (totalBytes <= rcclParamAllReduceCliqueByteLimit())) return true;
|
||||
if (coll == ncclFuncAllReduce && (totalBytes <= rcclParamAllReduceCliqueByteLimit())) return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
@@ -321,7 +315,7 @@ ncclResult_t CliqueManager::GetNumChannelsToUse(ncclFunc_t const coll,
|
||||
size_t const totalBytes = count * ncclTypeSize(datatype);
|
||||
*numChannelstoUse = 1;
|
||||
|
||||
if (coll == ncclCollAllReduce) {
|
||||
if (coll == ncclFuncAllReduce) {
|
||||
if (rcclParamAllReduceNumChannels() == 0)
|
||||
{
|
||||
// NOTE: These are currently based on collected data and not necessarily ideal for all hardware
|
||||
@@ -345,7 +339,7 @@ ncclResult_t CliqueManager::GetNumChannelsToUse(ncclFunc_t const coll,
|
||||
|
||||
|
||||
|
||||
ncclResult_t CliqueManager::SetCliqueCollectiveArgs(CollectiveArgs* args)
|
||||
ncclResult_t CliqueManager::SetCliqueArgs(ncclWorkElem* args)
|
||||
{
|
||||
// Do nothing if disabled
|
||||
if (m_cliqueMode == CLIQUE_DISABLED) return ncclSuccess;
|
||||
@@ -358,7 +352,6 @@ ncclResult_t CliqueManager::SetCliqueCollectiveArgs(CollectiveArgs* args)
|
||||
// Prepare clique arguments (NOTE: clique pointers are not ready yet)
|
||||
int opIndex = args->opCount % NCCL_MAX_OPS;
|
||||
args->clique.ptrs = &m_pinnedCliquePtrs[opIndex];
|
||||
args->clique.verbose = rcclParamCliqueDebug();
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -495,33 +488,16 @@ ncclResult_t CliqueManager::CheckCacheForHandle(std::pair<hipIpcMemHandle_t, siz
|
||||
|
||||
void CliqueManager::WaitForBarrier()
|
||||
{
|
||||
int const verbose = rcclParamCliqueDebug();
|
||||
|
||||
// Sense inversion barrier
|
||||
m_cpuBarrierLocalSense = 1 - m_cpuBarrierLocalSense;
|
||||
|
||||
int val = __sync_add_and_fetch(m_cpuBarrierGlobalCount, 1);
|
||||
if (verbose) INFO(NCCL_INIT, "Rank %d reaches barrier at %d", m_rank, val);
|
||||
|
||||
if (val == m_numRanks)
|
||||
if (__sync_add_and_fetch(m_cpuBarrierGlobalCount, 1) == m_numRanks)
|
||||
{
|
||||
// Reset the barrier
|
||||
STORE(m_cpuBarrierGlobalCount, 0);
|
||||
STORE(m_cpuBarrierGlobalSense, m_cpuBarrierLocalSense);
|
||||
} else {
|
||||
size_t counter = 0;
|
||||
while (LOAD(m_cpuBarrierGlobalSense) != m_cpuBarrierLocalSense)
|
||||
{
|
||||
if (verbose)
|
||||
{
|
||||
counter++;
|
||||
if (counter == 100000000)
|
||||
{
|
||||
WARN("Rank %d waiting in CPU barrier: (%d != %d)", m_rank, *m_cpuBarrierGlobalSense, m_cpuBarrierLocalSense);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (verbose) INFO(NCCL_INIT, "Rank %d leaves CPU barrier", m_rank);
|
||||
while (LOAD(m_cpuBarrierGlobalSense) != m_cpuBarrierLocalSense);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -72,7 +72,7 @@ public:
|
||||
|
||||
// Set pointers for where clique-related arguments will be found
|
||||
// This sets pointers to device-accessible memory where the arguments will eventually reside
|
||||
ncclResult_t SetCliqueCollectiveArgs(CollectiveArgs* args);
|
||||
ncclResult_t SetCliqueArgs(ncclWorkElem* args);
|
||||
|
||||
// Blocking call that only returns after all out-standing clique pointers are ready
|
||||
ncclResult_t WaitForPointers();
|
||||
@@ -121,8 +121,8 @@ protected:
|
||||
|
||||
// For use in bootstrapping code
|
||||
struct bootstrapRootStruct {
|
||||
void* listenComm;
|
||||
unsigned long hash;
|
||||
int listenFd;
|
||||
unsigned long hash;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -103,16 +103,14 @@ public:
|
||||
|
||||
return std::pair<iterator, bool>(it, inserted);
|
||||
}
|
||||
ncclResult_t close();
|
||||
|
||||
private:
|
||||
// tag for dispatch
|
||||
template<class U>
|
||||
struct CloseTag{};
|
||||
|
||||
hipError_t CloseIfPointer(CloseTag<hipIpcMemHandle_t> tag, iterator it);
|
||||
hipError_t CloseIfPointer(CloseTag<void*> tag, iterator it);
|
||||
|
||||
void pop();
|
||||
void pop()
|
||||
{
|
||||
typename LRUCache::iterator it = m_cache.find(m_lruHistory.front());
|
||||
m_cache.erase(it);
|
||||
m_lruHistory.pop_front();
|
||||
}
|
||||
|
||||
void updateHistory(const iterator& it)
|
||||
{
|
||||
@@ -135,60 +133,8 @@ auto hipIpcMemHandleEqual = [](const hipIpcMemHandle_t& l, const hipIpcMemHandle
|
||||
return memcmp(l.reserved, r.reserved, sizeof(l.reserved)) == 0;
|
||||
};
|
||||
|
||||
// Visit every entry of m_cache and call the CloseIfPointer overload selected
// by CloseTag<Value> on it. Entries are not erased here.
// Returns ncclSuccess after the loop completes; a non-success hipError_t from
// CloseIfPointer is handled by CUDACHECK (macro defined elsewhere, presumably
// an early error return -- confirm).
template <
|
||||
class Key,
|
||||
class Value,
|
||||
class Hash,
|
||||
class KeyEqual,
|
||||
class Allocator
|
||||
>
|
||||
ncclResult_t NcclIpcHandleCache<Key, Value, Hash, KeyEqual, Allocator>::close()
|
||||
{
|
||||
for (auto it = m_cache.begin(); it != m_cache.end(); it ++)
|
||||
{
|
||||
CUDACHECK(CloseIfPointer(CloseTag<Value>{}, it));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Remove the entry keyed by m_lruHistory.front(): run CloseIfPointer on it
// (its hipError_t result is ignored), erase it from m_cache, and drop the key
// from the history list.
// NOTE(review): the find() result is not compared against m_cache.end() and
// the history is not checked for emptiness -- callers must guarantee both.
template <
|
||||
class Key,
|
||||
class Value,
|
||||
class Hash,
|
||||
class KeyEqual,
|
||||
class Allocator
|
||||
>
|
||||
void NcclIpcHandleCache<Key, Value, Hash, KeyEqual, Allocator>::pop()
|
||||
{
|
||||
typename LRUCache::iterator it = m_cache.find(m_lruHistory.front());
|
||||
CloseIfPointer(CloseTag<Value>{}, it);
|
||||
m_cache.erase(it);
|
||||
m_lruHistory.pop_front();
|
||||
}
|
||||
|
||||
// Tag-dispatch overload chosen when the cache's Value type is void*.
// Passes the stored pointer (it->second.first) to hipIpcCloseMemHandle and
// returns that call's status unchanged. The tag argument carries no data.
template <
|
||||
class Key,
|
||||
class Value,
|
||||
class Hash,
|
||||
class KeyEqual,
|
||||
class Allocator
|
||||
>
|
||||
hipError_t NcclIpcHandleCache<Key, Value, Hash, KeyEqual, Allocator>::CloseIfPointer(CloseTag<void*> tag, iterator it)
|
||||
{
|
||||
return hipIpcCloseMemHandle(it->second.first);
|
||||
}
|
||||
|
||||
// Tag-dispatch overload chosen when the cache's Value type is
// hipIpcMemHandle_t. Neither argument is used: nothing is closed and
// hipSuccess is returned unconditionally.
template <
|
||||
class Key,
|
||||
class Value,
|
||||
class Hash,
|
||||
class KeyEqual,
|
||||
class Allocator
|
||||
>
|
||||
hipError_t NcclIpcHandleCache<Key, Value, Hash, KeyEqual, Allocator>::CloseIfPointer(CloseTag<hipIpcMemHandle_t> tag, iterator it)
|
||||
{
|
||||
return hipSuccess;
|
||||
}
|
||||
//typedef llvm::DenseMap<uint64_t, hipIpcMemHandle_t> SendCache;
|
||||
//typedef llvm::DenseMap<hipIpcMemHandle_t, void*, decltype(&HandleHash), decltype(HandleEqual)> RecvCache;
|
||||
|
||||
typedef NcclIpcHandleCache<uint64_t, hipIpcMemHandle_t, std::hash<uint64_t>, std::equal_to<uint64_t>, std::allocator< std::pair<const uint64_t, std::pair<hipIpcMemHandle_t, std::list<uint64_t>::iterator>>>> NcclIpcHandleSendCache;
|
||||
typedef NcclIpcHandleCache<hipIpcMemHandle_t, void*, decltype(&hipIpcMemHandleHash), decltype(hipIpcMemHandleEqual), std::allocator< std::pair<const hipIpcMemHandle_t, std::pair<void*, std::list<hipIpcMemHandle_t>::iterator>>>> NcclIpcHandleRecvCache;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollAllGather, "AllGather",
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
struct ncclInfo info = { ncclFuncAllGather, "AllGather",
|
||||
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
|
||||
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -11,7 +11,8 @@ NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
|
||||
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
|
||||
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
|
||||
ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
|
||||
@@ -25,7 +25,7 @@ ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, nc
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
return ncclSuccess;
|
||||
} else {
|
||||
struct ncclInfo info = { ncclCollAllToAll, "AllToAll",
|
||||
struct ncclInfo info = { ncclFuncAllToAll, "AllToAll",
|
||||
sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
|
||||
ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
|
||||
@@ -37,7 +37,7 @@ ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[], cons
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
return ncclSuccess;
|
||||
} else {
|
||||
struct ncclInfo info = { ncclCollAllToAllv, "AllToAllv",
|
||||
struct ncclInfo info = { ncclFuncAllToAllv, "AllToAllv",
|
||||
sendbuff, recvbuff, 0, datatype, ncclSum, 0, comm, stream, /* Args */
|
||||
ALLTOALLV_CHUNKSTEPS, ALLTOALLV_SLICESTEPS, sendcounts, sdispls, recvcounts, rdispls };
|
||||
return ncclEnqueueCheck(&info);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollBroadcast, "Broadcast",
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
|
||||
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
|
||||
BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,4 +8,4 @@
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL_C(ncclAllGather, ncclCollAllGather);
|
||||
IMPL_COLL_C(AllGather);
|
||||
|
||||
@@ -9,206 +9,201 @@
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm);
|
||||
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
|
||||
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
if (thisInput + chunkOffset == thisOutput + offset) { // In place
|
||||
prims.directSend(thisInput+chunkOffset, offset, nelem);
|
||||
} else {
|
||||
prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
|
||||
if (thisInput + chunkOffset == thisOutput + offset) { // In place
|
||||
prims.directSend(thisInput+chunkOffset, offset, nelem);
|
||||
} else {
|
||||
prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
|
||||
}
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
|
||||
}
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
rankDest = ring->devUserRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
// Final wait/copy.
|
||||
prims.directRecv(thisOutput+offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
}
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
if (thisInput + chunkOffset == thisOutput + offset) { // In place
|
||||
LLprims.send(thisInput+chunkOffset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: final store
|
||||
rankDest = ring->devUserRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recv(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
rankDest = ring->devUserRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
// Final wait/copy.
|
||||
prims.directRecv(thisOutput+offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
// AllGather device function named for the tree algorithm.
// The body is empty: invoking it performs no work and ignores `args`.
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
// AllGather device function named for the CollNet algorithm.
// The body is empty: invoking it performs no work and ignores `args`.
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherCollNetKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
// Ring AllGather built on ncclLLPrimitives (buffer sized by NCCL_PROTO_LL).
//
// args->sendbuff is read as `const T*`, args->recvbuff is written as `T*`.
// The output is addressed as consecutive slots of `size` elements, one slot
// per rank index (offset = chunkOffset + rankDest * size).
// The first template parameter (UNUSED) is not referenced in the body.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
// bid (from args) positions this block's chunk; blockIdx.x selects the channel below.
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
// Number of ncclLLFifoLine entries per step: LL buffer bytes split across NCCL_STEPS.
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
// Elements of T per step, counting sizeof(uint64_t) bytes per line.
// NOTE(review): presumably the payload bytes carried by one FIFO line — confirm.
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const int nranks = comm->nRanks;
|
||||
// Elements covered by all channels in one pass of the outer loop.
// Computed once from the initial chunkSize and never updated afterwards.
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
// Primitives are constructed with &ring->prev and &ring->next as the two peers.
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Fewer than loopSize elements remain, so this is the final pass
// (gridOffset + loopSize will exceed size); switch to the precomputed last chunk size.
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
}
|
||||
// Start of this block's chunk within one rank's `size`-element slot.
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
ssize_t offset;
|
||||
// Clamp to what is left of the slot; the ssize_t result is narrowed to int.
// NOTE(review): can be <= 0 when chunkOffset >= size; handling is left to LLprims — confirm.
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
// NOTE(review): devUserRanks[0] is presumably this rank's own user rank — confirm.
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
// If the input already aliases its destination slot in the output, only send;
// otherwise copy the input into the output slot while sending.
if (thisInput + chunkOffset == thisOutput + offset) { // In place
|
||||
LLprims.send(thisInput+chunkOffset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
// j runs 1..nranks-2, reading devUserRanks[nranks-1] down to devUserRanks[2].
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: final store
|
||||
// Last slot, indexed by devUserRanks[1], is received without being forwarded.
rankDest = ring->devUserRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recv(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
// AllGather device function named for the tree algorithm with the LL protocol.
// The body is empty: invoking it performs no work and ignores `args`.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
// AllGather device function named for the CollNet algorithm with the LL protocol.
// The body is empty: invoking it performs no work and ignores `args`.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherCollNetLLKernel(struct CollectiveArgs* args) { }
|
||||
};
|
||||
|
||||
#include "prims_ll128.h"
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
if (thisInput + chunkOffset == thisOutput + offset) { // In place
|
||||
LLprims.send(thisInput+chunkOffset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
|
||||
if (thisInput + chunkOffset == thisOutput + offset) { // In place
|
||||
LLprims.send(thisInput+chunkOffset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: final store
|
||||
rankDest = ring->devUserRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recv(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
// Partial specialization of ncclFunction for AllGather with NCCL_ALGO_TREE,
// covering every PROTO value. run() has an empty body, so it does nothing
// with the work element it is given.
template<int PROTO, class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_TREE, PROTO, FUNC, T, UNROLL> {
|
||||
public:
|
||||
// Intentionally empty body; `args` is unused.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
// Partial specialization of ncclFunction for AllGather with NCCL_ALGO_COLLNET,
// covering every PROTO value. run() has an empty body, so it does nothing
// with the work element it is given.
template<int PROTO, class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_COLLNET, PROTO, FUNC, T, UNROLL> {
|
||||
public:
|
||||
// Intentionally empty body; `args` is unused.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
// step k-1: final store
|
||||
rankDest = ring->devUserRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recv(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
// AllGather device function named for the tree algorithm with the LL128 protocol.
// The body is empty: invoking it performs no work and ignores `args`.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { }
|
||||
|
||||
// AllGather device function named for the CollNet algorithm with the LL128 protocol.
// The body is empty: invoking it performs no work and ignores `args`.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherCollNetLL128Kernel(struct CollectiveArgs* args) { }
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
@@ -8,4 +8,7 @@
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce);
|
||||
// [RCCL]
|
||||
// IMPL_COLL_R(AllReduce);
|
||||
IMPL_COLL_CLIQUE(AllReduce);
|
||||
// [/RCCL]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -10,478 +10,460 @@
|
||||
#include "collectives.h"
|
||||
#include "clique/AllReduceCliqueKernel.h" // [RCCL] AllReduce Clique-based kernel support
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
#ifdef ENABLE_PROFILING
|
||||
auto devProf = comm->devProf;
|
||||
uint64_t clk, t0 = 0ULL, ws;
|
||||
if (tid == 0) clk = __rtc64();
|
||||
auto devProf = comm->devProf;
|
||||
uint64_t clk, t0 = 0ULL, ws;
|
||||
if (tid == 0) clk = __builtin_amdgcn_s_memrealtime();
|
||||
#endif
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm);
|
||||
ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
|
||||
ssize_t realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
|
||||
ssize_t realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;
|
||||
|
||||
/////////////// begin AllReduce steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem;
|
||||
int chunk;
|
||||
/////////////// begin AllReduce steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem;
|
||||
int chunk;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
chunk = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + chunk * realChunkSize;
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
|
||||
INIT_COUNTER;
|
||||
prims.send(thisInput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(send);
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
chunk = ring->devUserRanks[nranks-j];
|
||||
// step 0: push data to next GPU
|
||||
chunk = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + chunk * realChunkSize;
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
|
||||
INIT_COUNTER;
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(recvReduceSend);
|
||||
}
|
||||
prims.send(thisInput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(send);
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data and push to the next GPU
|
||||
chunk = ring->devUserRanks[0];
|
||||
offset = chunkOffset + chunk * realChunkSize;
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
chunk = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + chunk * realChunkSize;
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
|
||||
INIT_COUNTER;
|
||||
prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
|
||||
ACCUMULATE_COUNTER(directRecvReduceCopySend);
|
||||
INIT_COUNTER;
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(recvReduceSend);
|
||||
}
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
chunk = ring->devUserRanks[nranks-j];
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data and push to the next GPU
|
||||
chunk = ring->devUserRanks[0];
|
||||
offset = chunkOffset + chunk * realChunkSize;
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
|
||||
INIT_COUNTER;
|
||||
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
|
||||
ACCUMULATE_COUNTER(directRecvCopySend);
|
||||
}
|
||||
prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
|
||||
ACCUMULATE_COUNTER(directRecvReduceCopySend);
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
chunk = ring->devUserRanks[1];
|
||||
offset = chunkOffset + chunk * realChunkSize;
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
chunk = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + chunk * realChunkSize;
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
|
||||
INIT_COUNTER;
|
||||
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
|
||||
ACCUMULATE_COUNTER(directRecvCopySend);
|
||||
}
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
chunk = ring->devUserRanks[1];
|
||||
offset = chunkOffset + chunk * realChunkSize;
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
|
||||
// Final wait/copy.
|
||||
INIT_COUNTER;
|
||||
prims.directRecv(thisOutput+offset, offset, nelem);
|
||||
ACCUMULATE_COUNTER(directRecv);
|
||||
}
|
||||
INIT_COUNTER;
|
||||
prims.directRecv(thisOutput+offset, offset, nelem);
|
||||
ACCUMULATE_COUNTER(directRecv);
|
||||
}
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), __rtc64() - clk, __ATOMIC_SEQ_CST);
|
||||
if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), __builtin_amdgcn_s_memrealtime() - clk, __ATOMIC_SEQ_CST);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
int chunkSize = args->coll.lastChunkSize;
|
||||
const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
if (loopSize > size) {
|
||||
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
|
||||
}
|
||||
};
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclTree* tree = &channel->tree;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
int chunkSize = args->coll.lastChunkSize;
|
||||
const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
do {
|
||||
struct ncclTree* tree = &channel->treeUp;
|
||||
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
ncclPrimitives<UNROLL/2, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, 0, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Up
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (tree->up == -1) {
|
||||
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else if (tree->down[0] == -1) {
|
||||
prims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
} while(0);
|
||||
|
||||
do {
|
||||
struct ncclTree* tree = &channel->treeDn;
|
||||
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
ncclPrimitives<UNROLL/2, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, &tree->up, tree->down, thisOutput, stepSize, channel, comm);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Down
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (tree->up == -1) {
|
||||
prims.directSend(thisOutput+offset, offset, nelem);
|
||||
} else if (tree->down[0] == -1) {
|
||||
prims.directRecv(thisOutput+offset, offset, nelem);
|
||||
} else {
|
||||
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
} while(0);
|
||||
}
|
||||
|
||||
// AllReduce over the channel's collTreeUp / collTreeDn trees using ncclPrimitives
// (buffer sized by NCCL_PROTO_SIMPLE).
//
// Blocks with blockIdx.x < nChannels run the "Up" (reduce) phase reading
// args->sendbuff; blocks with blockIdx.x >= nChannels run the "Down" (broadcast)
// phase on args->recvbuff. Each block executes exactly one of the two phases.
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceCollNetKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
// bid (from args) positions this block's chunk; blockIdx.x selects the channel
// and decides between the reduce and broadcast phases below.
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
// Elements of T per step: SIMPLE buffer bytes split across NCCL_STEPS.
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
// Initial chunk size is taken from the lastChunkSize field of the work arguments.
int chunkSize = args->coll.lastChunkSize;
|
||||
// Granularity used when shrinking chunkSize for small inputs (see below).
const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
|
||||
// Computed from the initial chunkSize; it is const and is NOT recomputed
// when chunkSize is reduced in the branch below.
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
// Small input: one pass already covers everything, so shrink the per-channel
// chunk to size/nChannels rounded up to a multiple of minChunkSize.
// Because loopSize > size here, each loop below runs at most one iteration.
if (loopSize > size) {
|
||||
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
|
||||
}
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
if (blockIdx.x < nChannels) { // first half of the channels do reduce
|
||||
struct ncclTree* tree = &channel->collTreeUp;
|
||||
// Peers are passed as (tree->down, &tree->up); compare with the Down phase,
// which passes them in the opposite order.
ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Up
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
// Clamp to the remaining elements; the result is narrowed to int.
int nelem = min(chunkSize, size-offset);
|
||||
// NOTE(review): -1 presumably marks "no peer" (no parent / no first child) — confirm.
// up == -1: combine received data with local input and store into thisOutput.
if (tree->up == -1) {
|
||||
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
// down[0] == -1: nothing to receive, only send the local input.
} else if (tree->down[0] == -1) {
|
||||
prims.send(thisInput+offset, nelem);
|
||||
// Otherwise: receive, reduce with the local input, and send on.
} else {
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
|
||||
struct ncclTree* tree = &channel->collTreeDn;
|
||||
// Peers are passed as (&tree->up, tree->down), the reverse of the Up phase.
ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Down
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
// All three cases operate on thisOutput only; thisInput is not touched here.
// up == -1: send the data in thisOutput.
if (tree->up == -1) {
|
||||
prims.send(thisOutput+offset, nelem);
|
||||
// down[0] == -1: only receive into thisOutput.
} else if (tree->down[0] == -1) {
|
||||
prims.recv(thisOutput+offset, nelem);
|
||||
// Otherwise: receive into thisOutput and forward.
} else {
|
||||
prims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T);
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*nranks*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
|
||||
|
||||
/////////////// begin AllReduce steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem;
|
||||
int chunk;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
chunk = ring->devUserRanks[nranks-1];
|
||||
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
chunk = ring->devUserRanks[nranks-j];
|
||||
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
if (loopSize > size) {
|
||||
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data and push to the next GPU
|
||||
chunk = ring->devUserRanks[0];
|
||||
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
chunk = ring->devUserRanks[nranks-j];
|
||||
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
chunk = ring->devUserRanks[1];
|
||||
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
// Here we need to copy from buffer to this output.
|
||||
LLprims.recv(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
// AllReduce over the channel's tree pair using ncclLLPrimitives.
// The body runs two passes over the data, one after the other:
//   1. "Up"   : uses channel->treeUp, receiving from tree->down and sending to tree->up.
//   2. "Down" : uses channel->treeDn, receiving from tree->up and sending to tree->down.
// args supplies the thread count, channel id (bid), channel count, element count and
// the send/receive buffers. UNUSED is not referenced in the body.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
|
||||
  const int tid = threadIdx.x;
|
||||
  const int nthreads = args->coll.nThreads;
|
||||
  const int bid = args->coll.bid;
|
||||
  const int nChannels = args->coll.nChannels;
|
||||
  struct ncclDevComm* comm = args->comm;
|
||||
  struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
  // Chunk size in elements of T, derived from stepLines at sizeof(uint64_t) bytes each.
  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
  const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
|
||||
  // loopSize is fixed here, before chunkSize may be reduced below.
  const ssize_t loopSize = nChannels*chunkSize;
|
||||
  const ssize_t size = args->coll.count;
|
||||
|
||||
|
||||
  // Small input: shrink chunkSize to a multiple of minChunkSize.
  // loopSize keeps its larger value, so the grid loops below run a single iteration.
  // NOTE(review): DIVUP is defined elsewhere; presumably a ceiling division - confirm.
  if (loopSize > size) {
|
||||
    chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
|
||||
  }
|
||||
|
||||
  // Compute pointers
|
||||
  const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
  T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
  // do { ... } while(0) only scopes this pass's `tree` and `LLprims`.
  do {
|
||||
    struct ncclTree* tree = &channel->treeUp;
|
||||
    // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
    ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm);
|
||||
    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
      // Up
|
||||
      ssize_t offset = gridOffset + bid*chunkSize;
|
||||
      // NOTE(review): size-offset can be <= 0 when offset reaches size; how the
      // primitives treat a non-positive nelem is not visible here - confirm.
      int nelem = min(chunkSize, size-offset);
|
||||
      // up == -1: nothing to send to, so reduce into the output buffer.
      if (tree->up == -1) {
|
||||
        LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
      // down[0] == -1: nothing is received, only the local input is sent.
      } else if (tree->down[0] == -1) {
|
||||
        LLprims.send(thisInput+offset, nelem);
|
||||
      } else {
|
||||
        LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
      }
|
||||
    }
|
||||
  } while(0);
|
||||
|
||||
  do {
|
||||
    struct ncclTree* tree = &channel->treeDn;
|
||||
    // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
    ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm);
|
||||
    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
      // Down
|
||||
      ssize_t offset = gridOffset + bid*chunkSize;
|
||||
      int nelem = min(chunkSize, size-offset);
|
||||
      // up == -1: send the contents of the output buffer.
      if (tree->up == -1) {
|
||||
        LLprims.send(thisOutput+offset, nelem);
|
||||
      // down[0] == -1: only receive into the output buffer.
      } else if (tree->down[0] == -1) {
|
||||
        LLprims.recv(thisOutput+offset, nelem);
|
||||
      } else {
|
||||
        LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
      }
|
||||
    }
|
||||
  } while(0);
|
||||
}
|
||||
|
||||
// AllReduce over the channel's collTreeUp / collTreeDn using ncclLLPrimitives.
// Each block executes exactly one of the two passes, selected by blockIdx.x:
//   blockIdx.x <  nChannels : "Up" pass on channel->collTreeUp
//   blockIdx.x >= nChannels : "Down" pass on channel->collTreeDn
// UNUSED is not referenced in the body.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) {
|
||||
  const int tid = threadIdx.x;
|
||||
  const int nthreads = args->coll.nThreads;
|
||||
  const int bid = args->coll.bid;
|
||||
  const int nChannels = args->coll.nChannels;
|
||||
  struct ncclDevComm* comm = args->comm;
|
||||
  struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
  const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
|
||||
  // loopSize is fixed here, before chunkSize may be reduced below.
  const ssize_t loopSize = nChannels*chunkSize;
|
||||
  const ssize_t size = args->coll.count;
|
||||
|
||||
  // Small input: shrink chunkSize to a multiple of minChunkSize; loopSize is unchanged,
  // so the grid loops below run a single iteration in this case.
  if (loopSize > size) {
|
||||
    chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
|
||||
  }
|
||||
|
||||
  // Compute pointers
|
||||
  const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
  T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
  if (blockIdx.x < nChannels) { // first half of the channels do reduce
|
||||
    struct ncclTree* tree = &channel->collTreeUp;
|
||||
    // One receive peer (tree->down) and one send peer (tree->up).
    ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm);
|
||||
    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
      // Up
|
||||
      ssize_t offset = gridOffset + bid*chunkSize;
|
||||
      int nelem = min(chunkSize, size-offset);
|
||||
      // up == -1: nothing to send to, so reduce into the output buffer.
      if (tree->up == -1) {
|
||||
        LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
      // down[0] == -1: nothing is received, only the local input is sent.
      } else if (tree->down[0] == -1) {
|
||||
        LLprims.send(thisInput+offset, nelem);
|
||||
      } else {
|
||||
        LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
      }
|
||||
    }
|
||||
  }
|
||||
|
||||
  if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
|
||||
    struct ncclTree* tree = &channel->collTreeDn;
|
||||
    // One receive peer (tree->up) and one send peer (tree->down).
    ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm);
|
||||
    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
      // Down
|
||||
      ssize_t offset = gridOffset + bid*chunkSize;
|
||||
      int nelem = min(chunkSize, size-offset);
|
||||
      if (tree->up == -1) {
|
||||
        LLprims.send(thisOutput+offset, nelem);
|
||||
      } else if (tree->down[0] == -1) {
|
||||
        LLprims.recv(thisOutput+offset, nelem);
|
||||
      } else {
|
||||
        LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
      }
|
||||
    }
|
||||
  }
|
||||
}
|
||||
|
||||
#include "prims_ll128.h"
|
||||
// Ring/LL128 AllReduce entry point. The body consists solely of the
// LAUNCH_CLIQUE_KERNEL macro invocation, forwarding FUNC, T and args.
// NOTE(review): LAUNCH_CLIQUE_KERNEL and AllReduceCliqueSplitKernel are defined
// elsewhere; what the macro expands to is not visible here - confirm.
// UNUSED is not referenced in the body.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
|
||||
  // [RCCL] RingLL128 is re-purposed as clique-based kernel
|
||||
  LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, FUNC, T, args);
|
||||
  // [/RCCL]
|
||||
}
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclTree* treeUp = &channel->treeUp;
|
||||
struct ncclTree* treeDn = &channel->treeDn;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = args->coll.lastChunkSize;
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
int nthreadsSplit = NCCL_LL128_SPLIT(nthreads);
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
|
||||
if (loopSize > size) {
|
||||
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
|
||||
}
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
if (treeUp->up == -1) {
|
||||
// ReduceAndBroadcast : max number of recv is 3, max number of send is 3
|
||||
ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, treeUp->down, treeDn->down, stepSize, channel, comm);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
|
||||
}
|
||||
} else {
|
||||
if (tid < nthreadsSplit) {
|
||||
#if 1
|
||||
if (tid < nthreads) {
|
||||
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreadsSplit, treeUp->down, &treeUp->up, stepSize, channel, comm);
|
||||
ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_DEV_ARITY, 1, 0, FUNC>
|
||||
prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Up
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (treeUp->down[0] == -1) {
|
||||
if (tree->up == -1) {
|
||||
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else if (tree->down[0] == -1) {
|
||||
prims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (tid < nthreads) {
|
||||
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_DEV_ARITY, 1, FUNC>
|
||||
prims(tid, nthreads, &tree->up, tree->down, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Down
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (tree->up == -1) {
|
||||
prims.directSend(thisOutput+offset, offset, nelem);
|
||||
} else if (tree->down[0] == -1) {
|
||||
prims.directRecv(thisOutput+offset, offset, nelem);
|
||||
} else {
|
||||
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
int nthreadsSplit = nthreads/2;
|
||||
if (nthreadsSplit == 256) nthreadsSplit += 64;
|
||||
if (tree->up == -1) {
|
||||
if (tid < nthreads) {
|
||||
// ReduceAndBroadcast : max number of recv is 3, max number of send is 3
|
||||
ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_DEV_ARITY, NCCL_MAX_DEV_ARITY, 1, FUNC>
|
||||
prims(tid, nthreads, tree->down, tree->down, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (tid < nthreadsSplit) {
|
||||
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_DEV_ARITY, 1, 0, FUNC>
|
||||
prims(tid, nthreadsSplit, tree->down, &tree->up, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Up
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (tree->down[0] == -1) {
|
||||
prims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_DEV_ARITY, 1, FUNC>
|
||||
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 2);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Down
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (tree->down[0] == -1) {
|
||||
prims.directRecv(thisOutput+offset, offset, nelem);
|
||||
} else {
|
||||
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// ncclFunction specialization for AllReduce / NCCL_ALGO_COLLNET / NCCL_PROTO_SIMPLE.
// run() works on channel->collTree with ncclPrimitives. Each block executes exactly
// one of two passes, selected by blockIdx.x:
//   blockIdx.x <  nChannels : "Up" pass   (receive from tree->down, send to tree->up)
//   blockIdx.x >= nChannels : "Down" pass (receive from tree->up, send to tree->down)
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_COLLNET, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
|
||||
  public:
|
||||
    // args supplies thread count, channel id (bid), channel count, element count,
    // the initial chunk size (coll.lastChunkSize) and the send/receive buffers.
    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
      const int tid = threadIdx.x;
|
||||
      const int nthreads = args->nThreads;
|
||||
      const int bid = args->coll.bid;
|
||||
      const int nChannels = args->coll.nChannels;
|
||||
      struct ncclDevComm* comm = args->comm;
|
||||
      struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
      struct ncclTree* tree = &channel->collTree;
|
||||
      const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
      int chunkSize = args->coll.lastChunkSize;
|
||||
      const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
|
||||
      // loopSize is fixed here, before chunkSize may be reduced below.
      const ssize_t loopSize = nChannels*chunkSize;
|
||||
      const ssize_t size = args->coll.count;
|
||||
|
||||
      // Small input: shrink chunkSize to a multiple of minChunkSize; loopSize is
      // unchanged, so the grid loops below run a single iteration in this case.
      // The ssize_t result is stored into the int chunkSize.
      if (loopSize > size) {
|
||||
        chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
|
||||
      }
|
||||
|
||||
      // Compute pointers
|
||||
      const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
      T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
      if (blockIdx.x < nChannels) { // first half of the channels do reduce
|
||||
        ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC>
|
||||
          prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
          // Up
|
||||
          ssize_t offset = gridOffset + bid*chunkSize;
|
||||
          int nelem = min(chunkSize, size-offset);
|
||||
          // up == -1: nothing to send to, so reduce into the output buffer.
          if (tree->up == -1) {
|
||||
            prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
          // down[0] == -1: nothing is received, only the local input is sent.
          } else if (tree->down[0] == -1) {
|
||||
            prims.send(thisInput+offset, nelem);
|
||||
          } else {
|
||||
            prims.recvReduceSend(thisInput+offset, nelem);
|
||||
          }
|
||||
        }
|
||||
      }
|
||||
|
||||
      if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
|
||||
        ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC>
|
||||
          prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
          // Down
|
||||
          ssize_t offset = gridOffset + bid*chunkSize;
|
||||
          int nelem = min(chunkSize, size-offset);
|
||||
          if (tree->up == -1) {
|
||||
            prims.send(thisOutput+offset, nelem);
|
||||
          } else if (tree->down[0] == -1) {
|
||||
            prims.recv(thisOutput+offset, nelem);
|
||||
          } else {
|
||||
            prims.recvCopySend(thisOutput+offset, nelem);
|
||||
          }
|
||||
        }
|
||||
      }
|
||||
    }
|
||||
};
|
||||
|
||||
// ncclFunction specialization for AllReduce / NCCL_ALGO_RING / NCCL_PROTO_LL.
// run() uses a single ncclLLPrimitives instance that receives from ring->prev and
// sends to ring->next. For every grid iteration it issues, in order:
//   send, (nranks-2) x recvReduceSend, recvReduceCopySend,
//   (nranks-2) x recvCopySend, recv.
// The chunk handled at each step is picked through ring->devUserRanks.
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
|
||||
  public:
|
||||
    // args supplies thread count, channel id (bid), channel count, element count
    // and the send/receive buffers.
    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
      const int tid = threadIdx.x;
|
||||
      const int nthreads = args->nThreads;
|
||||
      const int bid = args->coll.bid;
|
||||
      const int nChannels = args->coll.nChannels;
|
||||
      struct ncclDevComm* comm = args->comm;
|
||||
      struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
      struct ncclRing* ring = &channel->ring;
|
||||
      const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
      ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
      const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T);
|
||||
      const int nranks = comm->nRanks;
|
||||
      // One grid iteration covers nChannels*nranks chunks of the initial chunkSize.
      const ssize_t loopSize = nChannels*nranks*chunkSize;
|
||||
      const ssize_t size = args->coll.count;
|
||||
|
||||
      ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
|
||||
      // Compute pointers
|
||||
      const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
      T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
        // Cap chunkSize by what the remaining elements need, rounded up to a multiple
        // of minChunkSize. chunkSize never grows again; loopSize stays constant.
        chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
|
||||
|
||||
        /////////////// begin AllReduce steps ///////////////
|
||||
        ssize_t offset;
|
||||
        int nelem;
|
||||
        int chunk;
|
||||
|
||||
        // step 0: push data to next GPU
|
||||
        chunk = ring->devUserRanks[nranks-1];
|
||||
        offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
        // NOTE(review): size-offset can be <= 0 for trailing chunks; how the primitives
        // treat a non-positive nelem is not visible here - confirm.
        nelem = min(chunkSize, size-offset);
|
||||
|
||||
        LLprims.send(thisInput+offset, nelem);
|
||||
|
||||
        // k-2 steps: reduce and copy to next GPU
|
||||
        for (int j=2; j<nranks; ++j) {
|
||||
          chunk = ring->devUserRanks[nranks-j];
|
||||
          offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
          nelem = min(chunkSize, size-offset);
|
||||
|
||||
          LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
        }
|
||||
|
||||
        // step k-1: reduce this buffer and data, which will produce the final
|
||||
        // result that we store in this data and push to the next GPU
|
||||
        chunk = ring->devUserRanks[0];
|
||||
        offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
        nelem = min(chunkSize, size-offset);
|
||||
|
||||
        LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
|
||||
|
||||
        // k-2 steps: copy to next GPU
|
||||
        for (int j=1; j<nranks-1; ++j) {
|
||||
          chunk = ring->devUserRanks[nranks-j];
|
||||
          offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
          nelem = min(chunkSize, size-offset);
|
||||
|
||||
          LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
        }
|
||||
|
||||
        // Make final copy from buffer to dest.
|
||||
        chunk = ring->devUserRanks[1];
|
||||
        offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
        nelem = min(chunkSize, size-offset);
|
||||
|
||||
        // Here we need to copy from buffer to this output.
|
||||
        LLprims.recv(thisOutput+offset, nelem);
|
||||
      }
|
||||
    }
|
||||
};
|
||||
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_TREE, NCCL_PROTO_LL, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclTree* tree = &channel->tree;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
if (loopSize > size) {
|
||||
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
|
||||
}
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
do {
|
||||
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
ncclLLPrimitives<T, FUNC, NCCL_MAX_DEV_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Up
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (tree->up == -1) {
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else if (tree->down[0] == -1) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
} while(0);
|
||||
|
||||
do {
|
||||
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &treeDn->up, treeDn->down, stepSize, channel, comm);
|
||||
ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_DEV_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Down
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (treeDn->down[0] == -1) {
|
||||
if (tree->up == -1) {
|
||||
LLprims.send(thisOutput+offset, nelem);
|
||||
} else if (tree->down[0] == -1) {
|
||||
LLprims.recv(thisOutput+offset, nelem);
|
||||
} else {
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
} while(0);
|
||||
}
|
||||
};
|
||||
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_COLLNET, NCCL_PROTO_LL, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclTree* tree = &channel->collTree;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
if (loopSize > size) {
|
||||
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
|
||||
}
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
if (blockIdx.x < nChannels) { // first half of the channels do reduce
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Up
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (tree->up == -1) {
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else if (tree->down[0] == -1) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Down
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (tree->up == -1) {
|
||||
LLprims.send(thisOutput+offset, nelem);
|
||||
} else if (tree->down[0] == -1) {
|
||||
LLprims.recv(thisOutput+offset, nelem);
|
||||
} else {
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
@@ -489,8 +471,87 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// CollNet/LL128 AllReduce entry point. The body is empty: calling it does nothing
// and args is ignored. UNUSED, FUNC and T are not referenced.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceCollNetLL128Kernel(struct CollectiveArgs* args) { }
|
||||
#include "prims_ll128.h"
|
||||
// [RCCL] RingLL128 is re-purposed as clique-based kernel
|
||||
// ncclFunction specialization for AllReduce / NCCL_ALGO_RING / NCCL_PROTO_CLIQUE.
// run() consists solely of the LAUNCH_CLIQUE_KERNEL macro invocation, forwarding
// FUNC, T and args; UNROLL is not referenced in the body.
// NOTE(review): LAUNCH_CLIQUE_KERNEL and AllReduceCliqueSplitKernel are defined
// elsewhere; what the macro expands to is not visible here - confirm.
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_CLIQUE, FUNC, T, UNROLL> {
|
||||
  public:
|
||||
    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
      LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, FUNC, T, args);
|
||||
    }
|
||||
};
|
||||
// [/RCCL]
|
||||
|
||||
// ncclFunction specialization for AllReduce / NCCL_ALGO_TREE / NCCL_PROTO_LL128.
// run() works on channel->tree with ncclLL128Primitives and takes one of two shapes:
//   * tree->up == -1 : all nthreads run one fused recvReduceCopySend pass whose
//                      receive and send peers are both tree->down.
//   * otherwise      : the threads are split at nthreadsSplit. Threads below the
//                      split run the "Up" pass (recv from tree->down, send to
//                      tree->up); the rest run the "Down" pass (recv from tree->up,
//                      send to tree->down).
// UNROLL is not referenced in the body.
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_TREE, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
|
||||
  public:
|
||||
    // args supplies thread count, channel id (bid), channel count, element count,
    // the initial chunk size (coll.lastChunkSize) and the send/receive buffers.
    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
      const int tid = threadIdx.x;
|
||||
      const int nthreads = args->nThreads;
|
||||
      const int bid = args->coll.bid;
|
||||
      const int nChannels = args->coll.nChannels;
|
||||
      struct ncclDevComm* comm = args->comm;
|
||||
      struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
      struct ncclTree* tree = &channel->tree;
|
||||
      const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
      ssize_t chunkSize = args->coll.lastChunkSize;
|
||||
      const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8;
|
||||
      // loopSize is fixed here, before chunkSize may be reduced below.
      const ssize_t loopSize = nChannels*chunkSize;
|
||||
      // Thread index at which the block is divided between the Up and Down passes.
      // NOTE(review): NCCL_LL128_SPLIT is defined elsewhere; the ratio is not visible here.
      int nthreadsSplit = NCCL_LL128_SPLIT(nthreads);
|
||||
      const ssize_t size = args->coll.count;
|
||||
|
||||
      // Small input: shrink chunkSize to a multiple of minChunkSize; loopSize is
      // unchanged, so the grid loops below run a single iteration in this case.
      if (loopSize > size) {
|
||||
        chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
|
||||
      }
|
||||
|
||||
      // Compute pointers
|
||||
      const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
      T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
      if (tree->up == -1) {
|
||||
        // ReduceAndBroadcast : max number of recv is 3, max number of send is 3
|
||||
        ncclLL128Primitives<T, FUNC, NCCL_MAX_DEV_ARITY, NCCL_MAX_DEV_ARITY> LLprims(tid, nthreads, tree->down, tree->down, stepSize, channel, comm);
|
||||
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
          ssize_t offset = gridOffset + bid*chunkSize;
|
||||
          int nelem = min(chunkSize, size-offset);
|
||||
          LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
|
||||
        }
|
||||
      } else {
|
||||
        if (tid < nthreadsSplit) {
|
||||
          // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
          ncclLL128Primitives<T, FUNC, NCCL_MAX_DEV_ARITY, 1> LLprims(tid, nthreadsSplit, tree->down, &tree->up, stepSize, channel, comm);
|
||||
          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
            // Up
|
||||
            ssize_t offset = gridOffset + bid*chunkSize;
|
||||
            int nelem = min(chunkSize, size-offset);
|
||||
            // down[0] == -1: nothing is received, only the local input is sent.
            if (tree->down[0] == -1) {
|
||||
              LLprims.send(thisInput+offset, nelem);
|
||||
            } else {
|
||||
              LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
            }
|
||||
          }
|
||||
        } else {
|
||||
          // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
          // Thread ids and count are rebased so this group is numbered from 0.
          ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_DEV_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, stepSize, channel, comm);
|
||||
          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
            // Down
|
||||
            ssize_t offset = gridOffset + bid*chunkSize;
|
||||
            int nelem = min(chunkSize, size-offset);
|
||||
            // down[0] == -1: only receive into the output buffer.
            if (tree->down[0] == -1) {
|
||||
              LLprims.recv(thisOutput+offset, nelem);
|
||||
            } else {
|
||||
              LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
            }
|
||||
          }
|
||||
        }
|
||||
      }
|
||||
    }
|
||||
};
|
||||
|
||||
// ncclFunction specialization for AllReduce / NCCL_ALGO_COLLNET / NCCL_PROTO_LL128.
// run() has an empty body: invoking it does nothing and args is ignored.
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_COLLNET, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
|
||||
  public:
|
||||
    __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) { }
|
||||
};
|
||||
|
||||
@@ -8,5 +8,4 @@
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL_FUNC(ncclAllToAll, copy, FuncSum, i8, int8_t);
|
||||
IMPL_COLL_KERN(ncclAllToAll, copy, FuncSum, i8, int8_t, 0);
|
||||
IMPL_COLL_FUNC(AllToAll, RING, SIMPLE, Sum, int8_t);
|
||||
|
||||
@@ -9,80 +9,81 @@
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllToAllKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int nranks = comm->nRanks;
|
||||
const int bid = args->coll.bid;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLTOALL_CHUNKSTEPS;
|
||||
const int peersPerChan = DIVUP(nranks, nChannels);
|
||||
const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllToAll, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int nranks = comm->nRanks;
|
||||
const int bid = args->coll.bid;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLTOALL_CHUNKSTEPS;
|
||||
const int peersPerChan = DIVUP(nranks, nChannels);
|
||||
const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
for (int i = 0; i < peersPerChan; i++) {
|
||||
if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
|
||||
(peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
|
||||
continue;
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
if ((blockIdx.x*peersPerChan+i)%nranks == 0) {
|
||||
if (tid < nthreads && thisInput != thisOutput) {
|
||||
const T* sendbuff = thisInput+chunkOffset+rank*size;
|
||||
T* recvbuff = thisOutput+chunkOffset+rank*size;
|
||||
// local copy
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
for (int i = 0; i < peersPerChan; i++) {
|
||||
if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
|
||||
(peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
|
||||
continue;
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
if ((blockIdx.x*peersPerChan+i)%nranks == 0) {
|
||||
if (tid < nthreads && thisInput != thisOutput) {
|
||||
const T* sendbuff = thisInput+chunkOffset+rank*size;
|
||||
T* recvbuff = thisOutput+chunkOffset+rank*size;
|
||||
// local copy
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
|
||||
if ((blockIdx.x*peersPerChan+i)%nranks != 0) {
|
||||
int nthreadsSplit = nthreads/2;
|
||||
if (tid < nthreadsSplit ) {
|
||||
int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
|
||||
ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 0, 1, 0, FUNC>
|
||||
prims(tid, nthreadsSplit, NULL, &peerSend, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
ssize_t send_offset = chunkOffset + peerSend*size;
|
||||
prims.send(thisInput+send_offset, nelem);
|
||||
}
|
||||
} else {
|
||||
int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
|
||||
ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 1, 0, 0, FUNC>
|
||||
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peerRecv, NULL, NULL, stepSize, channel, comm, ncclShmem->ptrs, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
ssize_t recv_offset = chunkOffset + peerRecv*size;
|
||||
prims.recv(thisOutput+recv_offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < peersPerChan; i++) {
|
||||
if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
|
||||
(peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
|
||||
continue;
|
||||
if ((blockIdx.x*peersPerChan+i)%nranks != 0) {
|
||||
int nthreadsSplit = nthreads/2;
|
||||
int peerNone[2] = {-1,-1};
|
||||
if (tid < nthreadsSplit ) {
|
||||
int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
|
||||
ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 2, 1, 0, FUNC>
|
||||
prims(tid, nthreadsSplit, peerNone, &peerSend, NULL, stepSize, channel, comm);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
ssize_t send_offset = chunkOffset + peerSend*size;
|
||||
prims.send(thisInput+send_offset, nelem);
|
||||
}
|
||||
} else {
|
||||
int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
|
||||
ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 1, 2, 0, FUNC>
|
||||
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peerRecv, peerNone, NULL, stepSize, channel, comm);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
ssize_t recv_offset = chunkOffset + peerRecv*size;
|
||||
prims.recv(thisOutput+recv_offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
Fichier exécutable → Fichier normal
+1
-2
@@ -8,5 +8,4 @@
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL_FUNC(ncclAllToAllv, copy, FuncSum, i8, int8_t);
|
||||
IMPL_COLL_KERN(ncclAllToAllv, copy, FuncSum, i8, int8_t, 0);
|
||||
IMPL_COLL_FUNC(AllToAllv, RING, SIMPLE, Sum, int8_t);
|
||||
|
||||
Fichier exécutable → Fichier normal
+76
-74
@@ -9,87 +9,89 @@
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllToAllvKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->a2av.nThreads;
|
||||
const int nChannels = args->a2av.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const ssize_t typesize = args->a2av.count;
|
||||
const int nranks = comm->nRanks;
|
||||
const int bid = args->a2av.bid;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLTOALLV_CHUNKSTEPS;
|
||||
const int peersPerChan = DIVUP(nranks, nChannels);
|
||||
const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncAllToAllv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int nChannels = args->a2av.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const ssize_t typesize = args->a2av.count;
|
||||
const int nranks = comm->nRanks;
|
||||
const int bid = args->a2av.bid;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLTOALLV_CHUNKSTEPS;
|
||||
const int peersPerChan = DIVUP(nranks, nChannels);
|
||||
const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
size_t* params = channel->a2avParams + nranks*4*args->index;
|
||||
size_t *sendcounts = params;
|
||||
size_t *sdispls = params + nranks;
|
||||
size_t *recvcounts = params + nranks*2;
|
||||
size_t *rdispls = params + nranks*3;
|
||||
ssize_t size = sendcounts[rank]*typesize;
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
for (int i = 0; i < peersPerChan; i++) {
|
||||
if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
|
||||
(peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
|
||||
continue;
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
if ((blockIdx.x*peersPerChan+i)%nranks == 0) {
|
||||
if (tid < nthreads && thisInput != thisOutput) {
|
||||
const T* sendbuff = thisInput+chunkOffset+sdispls[rank]*typesize;
|
||||
T* recvbuff = thisOutput+chunkOffset+rdispls[rank]*typesize;
|
||||
// local copy
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t *sendcounts = args->a2av.extra;
|
||||
size_t *sdispls = args->a2av.extra + nranks;
|
||||
size_t *recvcounts = args->a2av.extra + nranks*2;
|
||||
size_t *rdispls = args->a2av.extra + nranks*3;
|
||||
ssize_t size = sendcounts[rank]*typesize;
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
for (int i = 0; i < peersPerChan; i++) {
|
||||
if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
|
||||
(peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
|
||||
continue;
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
if ((blockIdx.x*peersPerChan+i)%nranks == 0) {
|
||||
if (tid < nthreads && thisInput != thisOutput) {
|
||||
const T* sendbuff = thisInput+chunkOffset+sdispls[rank]*typesize;
|
||||
T* recvbuff = thisOutput+chunkOffset+rdispls[rank]*typesize;
|
||||
// local copy
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
|
||||
if ((blockIdx.x*peersPerChan+i)%nranks != 0) {
|
||||
int nthreadsSplit = nthreads/2;
|
||||
if (tid < nthreadsSplit ) {
|
||||
int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
|
||||
ncclPrimitives<UNROLL, ALLTOALLV_CHUNKSTEPS/ALLTOALLV_SLICESTEPS, ALLTOALLV_SLICESTEPS, T, 0, 1, 0, FUNC>
|
||||
prims(tid, nthreadsSplit, NULL, &peerSend, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
size = sendcounts[peerSend]*typesize;
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
|
||||
ALIGN_SIZE(realChunkSize, nthreadsSplit*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
ssize_t send_offset = chunkOffset + sdispls[peerSend]*typesize;
|
||||
prims.send(thisInput+send_offset, nelem);
|
||||
}
|
||||
} else {
|
||||
int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
|
||||
ncclPrimitives<UNROLL, ALLTOALLV_CHUNKSTEPS/ALLTOALLV_SLICESTEPS, ALLTOALLV_SLICESTEPS, T, 1, 0, 0, FUNC>
|
||||
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peerRecv, NULL, NULL, stepSize, channel, comm, ncclShmem->ptrs, 1);
|
||||
size = recvcounts[peerRecv]*typesize;
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
|
||||
ALIGN_SIZE(realChunkSize, (nthreads-nthreadsSplit)*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
ssize_t recv_offset = chunkOffset + rdispls[peerRecv]*typesize;
|
||||
prims.recv(thisOutput+recv_offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < peersPerChan; i++) {
|
||||
if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
|
||||
(peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
|
||||
continue;
|
||||
if ((blockIdx.x*peersPerChan+i)%nranks != 0) {
|
||||
int nthreadsSplit = nthreads/2;
|
||||
int peerNone[2] = {-1,-1};
|
||||
if (tid < nthreadsSplit ) {
|
||||
int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
|
||||
ncclPrimitives<UNROLL, ALLTOALLV_CHUNKSTEPS/ALLTOALLV_SLICESTEPS, ALLTOALLV_SLICESTEPS, T, 2, 1, 0, FUNC>
|
||||
prims(tid, nthreadsSplit, peerNone, &peerSend, NULL, stepSize, channel, comm);
|
||||
size = sendcounts[peerSend]*typesize;
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
|
||||
ALIGN_SIZE(realChunkSize, nthreadsSplit*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
ssize_t send_offset = chunkOffset + sdispls[peerSend]*typesize;
|
||||
prims.send(thisInput+send_offset, nelem);
|
||||
}
|
||||
} else {
|
||||
int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
|
||||
ncclPrimitives<UNROLL, ALLTOALLV_CHUNKSTEPS/ALLTOALLV_SLICESTEPS, ALLTOALLV_SLICESTEPS, T, 1, 2, 0, FUNC>
|
||||
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peerRecv, peerNone, NULL, stepSize, channel, comm);
|
||||
size = recvcounts[peerRecv]*typesize;
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
|
||||
ALIGN_SIZE(realChunkSize, (nthreads-nthreadsSplit)*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
ssize_t recv_offset = chunkOffset + rdispls[peerRecv]*typesize;
|
||||
prims.recv(thisOutput+recv_offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,4 +8,4 @@
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL_C(ncclBroadcast, ncclCollBroadcast);
|
||||
IMPL_COLL_C(Broadcast);
|
||||
|
||||
@@ -9,177 +9,155 @@
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->coll.root;
|
||||
#ifdef ENABLE_PROFILING
|
||||
auto devProf = comm->devProf;
|
||||
uint64_t clk, t0 = 0ULL, ws;
|
||||
if (tid == 0) clk = __rtc64();
|
||||
#endif
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->coll.root;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
|
||||
ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
|
||||
if (rank == root) {
|
||||
if (thisInput == thisOutput) {
|
||||
INIT_COUNTER;
|
||||
prims.send(thisInput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(send);
|
||||
} else {
|
||||
INIT_COUNTER;
|
||||
prims.copySend(thisInput+offset, thisOutput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(copySend);
|
||||
if (rank == root) {
|
||||
if (thisInput == thisOutput) {
|
||||
prims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
prims.copySend(thisInput+offset, thisOutput+offset, nelem);
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
prims.recv(thisOutput+offset, nelem);
|
||||
} else {
|
||||
prims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
INIT_COUNTER;
|
||||
prims.recv(thisOutput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(recv);
|
||||
} else {
|
||||
INIT_COUNTER;
|
||||
prims.recvCopySend(thisOutput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(recvCopySend);
|
||||
}
|
||||
}
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), __rtc64() - clk, __ATOMIC_SEQ_CST);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Broadcast kernel entry for the tree algorithm.
// The body is empty: calling it performs no work.
// NOTE(review): presumably kept only so that every algorithm has a symbol to reference — confirm.
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->coll.root;
|
||||
|
||||
// Broadcast kernel entry for the CollNet algorithm.
// The body is empty: calling it performs no work.
// NOTE(review): presumably kept only so that every algorithm has a symbol to reference — confirm.
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastCollNetKernel(struct CollectiveArgs* args) { }
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->coll.root;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
}
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
}
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (rank == root) {
|
||||
if (thisInput == thisOutput) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (rank == root) {
|
||||
if (thisInput == thisOutput) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
LLprims.recv(thisOutput + offset, nelem);
|
||||
} else {
|
||||
LLprims.recvCopySend(thisOutput + offset, nelem);
|
||||
}
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
LLprims.recv(thisOutput + offset, nelem);
|
||||
} else {
|
||||
LLprims.recvCopySend(thisOutput + offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Broadcast kernel entry named for the tree algorithm with the LL protocol.
// The body is empty: calling it performs no work. The int template parameter is named UNUSED.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
// Broadcast kernel entry named for the CollNet algorithm with the LL protocol.
// The body is empty: calling it performs no work. The int template parameter is named UNUSED.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastCollNetLLKernel(struct CollectiveArgs* args) { }
|
||||
};
|
||||
|
||||
#include "prims_ll128.h"
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->coll.root;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->coll.root;
|
||||
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (rank == root) {
|
||||
if (thisInput == thisOutput) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (rank == root) {
|
||||
if (thisInput == thisOutput) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
LLprims.recv(thisOutput + offset, nelem);
|
||||
} else {
|
||||
LLprims.recvCopySend(thisOutput + offset, nelem);
|
||||
}
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
LLprims.recv(thisOutput + offset, nelem);
|
||||
} else {
|
||||
LLprims.recvCopySend(thisOutput + offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Broadcast kernel entry named for the tree algorithm with the LL128 protocol.
// The body is empty: calling it performs no work. The int template parameter is named UNUSED.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { }
|
||||
// Partial specialization of ncclFunction for ncclFuncBroadcast with NCCL_ALGO_TREE.
// It stays generic over protocol (PROTO), reduction op (REDOP), element type (T) and UNROLL.
// run() has an empty body, so selecting this specialization performs no work.
template<int PROTO, class REDOP, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
|
||||
public:
|
||||
// Intentionally empty; `args` is not read.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
// Broadcast kernel entry named for the CollNet algorithm with the LL128 protocol.
// The body is empty: calling it performs no work. The int template parameter is named UNUSED.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastCollNetLL128Kernel(struct CollectiveArgs* args) { }
|
||||
// Partial specialization of ncclFunction for ncclFuncBroadcast with NCCL_ALGO_COLLNET.
// It stays generic over protocol (PROTO), reduction op (REDOP), element type (T) and UNROLL.
// run() has an empty body, so selecting this specialization performs no work.
template<int PROTO, class REDOP, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
|
||||
public:
|
||||
// Intentionally empty; `args` is not read.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -11,154 +11,129 @@
|
||||
#include "collectives.h"
|
||||
#include "devcomm.h"
|
||||
|
||||
// Device-side counter read, returned as a signed 64-bit integer.
// When __HIP__ is defined to a non-zero value it returns __builtin_amdgcn_s_memrealtime();
// otherwise it returns __clock_u64(). Both results are cast to long long int.
// Profiling code elsewhere in this text takes the difference of two readings
// (`__rtc64() - clk`) and accumulates it into devProf->total_cycle.
// NOTE(review): the tick rate of the two sources is not visible here and may differ — confirm
// before comparing values across platforms.
__device__
|
||||
inline __attribute((always_inline))
|
||||
long long int __rtc64() {
|
||||
#if __HIP__
|
||||
return (long long int) __builtin_amdgcn_s_memrealtime();
|
||||
#else
|
||||
return (long long int) __clock_u64();
|
||||
#endif
|
||||
}
|
||||
#define COLL_UNROLL 2
|
||||
#define NCCL_MAX_DEV_ARITY NCCL_MAX_TREE_ARITY
|
||||
|
||||
// Exit If Abort Barrier across CTA: make sure all threads exit consistently
|
||||
// Each thread sets a predicate to true if abort == 1
|
||||
// all CTA's threads enter the barrier and do a popc on their predicates being True
|
||||
// If any of the thread's predicate was True, all the threads call exit()
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
// HIP-side exitIfAbortBarrier, written as a statement macro rather than a function.
//   1. If `abort` is non-zero, atomically adds 1 to *abortCount (sequentially consistent).
//   2. Calls __syncthreads().
//   3. If LOAD(abortCount) is non-zero, executes `return false;` in the enclosing function.
// Because of step 3 the macro can only be used inside a function whose return type accepts `false`.
// NOTE(review): the expansion is three separate statements, not wrapped in do { } while (0).
// Used as the body of an unbraced `if`/`else`, only the first statement would be guarded.
#define exitIfAbortBarrier(abort, abortCount) \
|
||||
if (abort) __atomic_fetch_add(abortCount, 1, __ATOMIC_SEQ_CST); \
|
||||
__syncthreads(); \
|
||||
if (LOAD(abortCount)) { /*asm volatile ("s_endpgm");*/ return false; }
|
||||
// In this branch __syncwarp() expands to nothing.
#define __syncwarp()
|
||||
#else
|
||||
// Non-HIP exitIfAbortBarrier, implemented with inline PTX.
// abort: per-thread flag; the predicate below is true when it equals 1.
// Returns nothing. If the barrier's reduced count is non-zero, the thread executes PTX `exit`.
static inline __device__ void exitIfAbortBarrier(int abort) {
|
||||
// Receives the population count produced by the barrier reduction.
uint32_t popc;
|
||||
// NOTE(review): the braces presumably open/close a PTX scope so barr_pred stays local — confirm.
asm ("{");
|
||||
// Declare a predicate register named barr_pred.
asm volatile (" .reg .pred barr_pred;");
|
||||
// barr_pred = (abort == 1)
asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
|
||||
// Barrier with popc reduction over barr_pred; the result is written to popc.
// NOTE(review): 13 is the barrier operand — presumably an id reserved for this use; confirm.
asm volatile (" bar.red.popc.u32 %0, 13, barr_pred;" : "=r"(popc));
|
||||
asm ("}");
|
||||
// A non-zero count means at least one participating thread had abort == 1, so exit here.
if (popc) { asm volatile ("exit;"); }
|
||||
}
|
||||
#endif
|
||||
|
||||
// Three-argument form: expands to three comma-separated NCCL_COLL_NAME entries for one
// (coll, op, dtype): coll##LL, coll##LL again, then coll.
// NOTE(review): the second entry repeats LL — presumably filling the slot an LL128 kernel
// would occupy; confirm against the protocol ordering.
// NOTE(review): NCCL_FUNC5 also appears in this text with four parameters; this looks like
// the two sides of a diff — confirm which definition is live.
#define NCCL_FUNC5(coll, op, dtype) \
|
||||
NCCL_COLL_NAME(coll##LL, op, dtype), \
|
||||
NCCL_COLL_NAME(coll##LL, op, dtype), \
|
||||
NCCL_COLL_NAME(coll, op, dtype)
|
||||
// Four-argument form: expands to three comma-separated NCCL_FUNC_NAME entries for one
// (func, algo, redop, type), using protocols LL, LL again, then SIMPLE.
// NOTE(review): the second entry repeats LL — presumably filling the slot an LL128 kernel
// would occupy; confirm against the protocol ordering.
#define NCCL_FUNC5(func, algo, redop, type) \
|
||||
NCCL_FUNC_NAME(func, algo, LL, redop, type), \
|
||||
NCCL_FUNC_NAME(func, algo, LL, redop, type), \
|
||||
NCCL_FUNC_NAME(func, algo, SIMPLE, redop, type)
|
||||
|
||||
// Three-argument form: expands NCCL_FUNC5 once per algorithm by pasting a suffix onto the
// collective name, in the order Tree, Ring, CollNet.
#define NCCL_FUNC4(coll, op, dtype) \
|
||||
NCCL_FUNC5(coll##Tree, op, dtype), \
|
||||
NCCL_FUNC5(coll##Ring, op, dtype), \
|
||||
NCCL_FUNC5(coll##CollNet, op, dtype)
|
||||
// Expands NCCL_FUNC5 once per algorithm, passing the algorithm as its own argument,
// in the order TREE, RING, COLLNET.
#define NCCL_FUNC4(func, redop, type) \
|
||||
NCCL_FUNC5(func, TREE, redop, type), \
|
||||
NCCL_FUNC5(func, RING, redop, type), \
|
||||
NCCL_FUNC5(func, COLLNET, redop, type)
|
||||
|
||||
// Must be consistent with ncclDataType_t
|
||||
// Expands NCCL_FUNC4 once per data-type suffix, ten entries in this order:
// i8, u8, i32, u32, i64, u64, f16, f32, f64, b16.
// The position of each entry matters; do not reorder.
#define NCCL_FUNCS3A(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, u8), \
|
||||
NCCL_FUNC4(coll, op, i32), \
|
||||
NCCL_FUNC4(coll, op, u32), \
|
||||
NCCL_FUNC4(coll, op, i64), \
|
||||
NCCL_FUNC4(coll, op, u64), \
|
||||
NCCL_FUNC4(coll, op, f16), \
|
||||
NCCL_FUNC4(coll, op, f32), \
|
||||
NCCL_FUNC4(coll, op, f64), \
|
||||
NCCL_FUNC4(coll, op, b16)
|
||||
// Same shape as NCCL_FUNCS3A (ten NCCL_FUNC4 entries), but every slot uses the i8 suffix.
// NOTE(review): presumably for collectives whose kernel does not depend on the element
// type, so one i8 instantiation serves every data-type index — confirm.
#define NCCL_FUNCS3B(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8)
|
||||
#define NCCL_FUNCS3A(func, redop) \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, uint8_t), \
|
||||
NCCL_FUNC4(func, redop, int32_t), \
|
||||
NCCL_FUNC4(func, redop, uint32_t), \
|
||||
NCCL_FUNC4(func, redop, int64_t), \
|
||||
NCCL_FUNC4(func, redop, uint64_t), \
|
||||
NCCL_FUNC4(func, redop, half), \
|
||||
NCCL_FUNC4(func, redop, float), \
|
||||
NCCL_FUNC4(func, redop, double), \
|
||||
NCCL_FUNC4(func, redop, rccl_bfloat16)
|
||||
#define NCCL_FUNCS3B(func, redop) \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t)
|
||||
|
||||
// Must be consistent with ncclRedOp_t
|
||||
#define NCCL_FUNCS2A(coll) \
|
||||
NCCL_FUNCS3A(coll, sum ), \
|
||||
NCCL_FUNCS3A(coll, prod), \
|
||||
NCCL_FUNCS3A(coll, max ), \
|
||||
NCCL_FUNCS3A(coll, min )
|
||||
#define NCCL_FUNCS2B(coll) \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy)
|
||||
#define NCCL_FUNCS2A(func) \
|
||||
NCCL_FUNCS3A(func, Sum ), \
|
||||
NCCL_FUNCS3A(func, Prod), \
|
||||
NCCL_FUNCS3A(func, Max ), \
|
||||
NCCL_FUNCS3A(func, Min )
|
||||
#define NCCL_FUNCS2B(func) \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum)
|
||||
|
||||
// [RCCL] Adding clique-based kernels for AllReduce, in-place of unused RingLL28 kernels
|
||||
#define NCCL_FUNC5B(coll, op, dtype) \
|
||||
NCCL_COLL_NAME(coll##LL, op, dtype), \
|
||||
NCCL_COLL_NAME(coll##LL128, op, dtype), \
|
||||
NCCL_COLL_NAME(coll, op, dtype)
|
||||
#define NCCL_FUNC5B(func, algo, redop, type) \
|
||||
NCCL_FUNC_NAME(func, algo, LL, redop, type), \
|
||||
NCCL_FUNC_NAME(func, algo, LL128, redop, type), \
|
||||
NCCL_FUNC_NAME(func, algo, SIMPLE, redop, type)
|
||||
|
||||
#define NCCL_FUNC4B(coll, op, dtype) \
|
||||
NCCL_FUNC5(coll##Tree, op, dtype), \
|
||||
NCCL_FUNC5B(coll##Ring, op, dtype), \
|
||||
NCCL_FUNC5(coll##CollNet, op, dtype)
|
||||
#define NCCL_FUNC4B(func, redop, type) \
|
||||
NCCL_FUNC5(func, TREE, redop, type), \
|
||||
NCCL_FUNC5B(func, RING, redop, type), \
|
||||
NCCL_FUNC5(func, COLLNET, redop, type)
|
||||
|
||||
#define NCCL_FUNCS3C(coll, op) \
|
||||
NCCL_FUNC4B(coll, op, i8), \
|
||||
NCCL_FUNC4B(coll, op, u8), \
|
||||
NCCL_FUNC4B(coll, op, i32), \
|
||||
NCCL_FUNC4B(coll, op, u32), \
|
||||
NCCL_FUNC4B(coll, op, i64), \
|
||||
NCCL_FUNC4B(coll, op, u64), \
|
||||
NCCL_FUNC4B(coll, op, f16), \
|
||||
NCCL_FUNC4B(coll, op, f32), \
|
||||
NCCL_FUNC4B(coll, op, f64), \
|
||||
NCCL_FUNC4B(coll, op, b16)
|
||||
|
||||
#define NCCL_FUNCS2C(coll) \
|
||||
NCCL_FUNCS3C(coll, sum ), \
|
||||
NCCL_FUNCS3C(coll, prod), \
|
||||
NCCL_FUNCS3C(coll, max ), \
|
||||
NCCL_FUNCS3C(coll, min )
|
||||
|
||||
// [/RCCL]
|
||||
#define NCCL_FUNCS3C(func, redop) \
|
||||
NCCL_FUNC4B(func, redop, int8_t), \
|
||||
NCCL_FUNC4B(func, redop, uint8_t), \
|
||||
NCCL_FUNC4B(func, redop, int32_t), \
|
||||
NCCL_FUNC4B(func, redop, uint32_t), \
|
||||
NCCL_FUNC4B(func, redop, int64_t), \
|
||||
NCCL_FUNC4B(func, redop, uint64_t), \
|
||||
NCCL_FUNC4B(func, redop, half), \
|
||||
NCCL_FUNC4B(func, redop, float), \
|
||||
NCCL_FUNC4B(func, redop, double), \
|
||||
NCCL_FUNC4B(func, redop, rccl_bfloat16)
|
||||
|
||||
#define NCCL_FUNCS2C(func) \
|
||||
NCCL_FUNCS3C(func, Sum ), \
|
||||
NCCL_FUNCS3C(func, Prod), \
|
||||
NCCL_FUNCS3C(func, Max ), \
|
||||
NCCL_FUNCS3C(func, Min )
|
||||
|
||||
// Must be consistent with ncclFunc_t
|
||||
#define NCCL_FUNCS() { \
|
||||
NCCL_FUNCS2B(ncclBroadcast), \
|
||||
NCCL_FUNCS2A(ncclReduce), \
|
||||
NCCL_FUNCS2B(ncclAllGather), \
|
||||
NCCL_FUNCS2A(ncclReduceScatter), \
|
||||
NCCL_FUNCS2C(ncclAllReduce), \
|
||||
NCCL_COLL_NAME(ncclGather, copy, i8), \
|
||||
NCCL_COLL_NAME(ncclScatter, copy, i8), \
|
||||
NCCL_COLL_NAME(ncclAllToAll, copy, i8), \
|
||||
NCCL_COLL_NAME(ncclAllToAllv, copy, i8), \
|
||||
NCCL_COLL_NAME(ncclSendRecv, copy, i8) }
|
||||
NCCL_FUNCS2B(Broadcast), \
|
||||
NCCL_FUNCS2A(Reduce), \
|
||||
NCCL_FUNCS2B(AllGather), \
|
||||
NCCL_FUNCS2A(ReduceScatter), \
|
||||
NCCL_FUNCS2C(AllReduce), \
|
||||
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), \
|
||||
NCCL_FUNC_NAME(AllToAll, RING, SIMPLE, Sum, int8_t), \
|
||||
NCCL_FUNC_NAME(AllToAllv, RING, SIMPLE, Sum, int8_t) }
|
||||
// [/RCCL]
|
||||
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
using ncclKernelFunc_t = void (*)(struct CollectiveArgs*);
|
||||
using ncclKernelFunc_t = void (*)(struct ncclWorkElem* args);
|
||||
|
||||
static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
|
||||
// Don't try to initialize the host shadow copy of this device-side global
|
||||
// variable. There is no host pointer to a device-side function, which
|
||||
// confuses clang. This will be fixed in the next clang release.
|
||||
#if defined(__HIP_DEVICE_COMPILE__)
|
||||
NCCL_FUNCS2B(ncclBroadcast),
|
||||
NCCL_FUNCS2A(ncclReduce),
|
||||
NCCL_FUNCS2B(ncclAllGather),
|
||||
NCCL_FUNCS2A(ncclReduceScatter),
|
||||
NCCL_FUNCS2C(ncclAllReduce),
|
||||
NCCL_COLL_NAME(ncclGather, copy, i8),
|
||||
NCCL_COLL_NAME(ncclScatter, copy, i8),
|
||||
NCCL_COLL_NAME(ncclAllToAll, copy, i8),
|
||||
NCCL_COLL_NAME(ncclAllToAllv, copy, i8),
|
||||
NCCL_COLL_NAME(ncclSendRecv, copy, i8)
|
||||
NCCL_FUNCS2B(Broadcast),
|
||||
NCCL_FUNCS2A(Reduce),
|
||||
NCCL_FUNCS2B(AllGather),
|
||||
NCCL_FUNCS2A(ReduceScatter),
|
||||
NCCL_FUNCS2C(AllReduce),
|
||||
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
|
||||
NCCL_FUNC_NAME(AllToAll, RING, SIMPLE, Sum, int8_t),
|
||||
NCCL_FUNC_NAME(AllToAllv, RING, SIMPLE, Sum, int8_t),
|
||||
#endif
|
||||
};
|
||||
|
||||
template<unsigned short f, unsigned short l>
|
||||
struct Caller {
|
||||
static __device__ __host__
|
||||
void call(ncclColl* const c) noexcept
|
||||
void call(struct ncclWorkElem* const c) noexcept
|
||||
{
|
||||
constexpr unsigned short m = f + (l - f) / 2;
|
||||
|
||||
@@ -169,78 +144,88 @@ struct Caller {
|
||||
template<unsigned short f>
|
||||
struct Caller<f, f + 1>{
|
||||
static __device__ __host__
|
||||
void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
|
||||
void call(struct ncclWorkElem* const c) noexcept { ncclFuncs[f](c); }
|
||||
};
|
||||
|
||||
static_assert(FUNC_INDEX_P2P == 1800, "Wrong P2P function index");
|
||||
|
||||
inline
|
||||
__device__
|
||||
void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept {
|
||||
void NCCL_CALL_FUNCTIONS(struct ncclWorkElem* const c) noexcept {
|
||||
if (c->funcIndex < 360) {
|
||||
if (c->funcIndex % 9 == 0) ncclBroadcastTreeLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 1) ncclBroadcastTreeLL128_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 2) ncclBroadcastTree_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 3) ncclBroadcastRingLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 4) ncclBroadcastRingLL128_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 5) ncclBroadcastRing_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 6) ncclBroadcastCollNetLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 7) ncclBroadcastCollNetLL128_copy_i8(&c->args);
|
||||
else ncclBroadcastCollNet_copy_i8(&c->args);
|
||||
if (c->funcIndex % 9 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 6) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 7) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t(c);
|
||||
else ncclFunction_Broadcast_COLLNET_SIMPLE_Sum_int8_t(c);
|
||||
}
|
||||
else if (c->funcIndex < 720) Caller<360, 720>::call(c);
|
||||
else if (c->funcIndex < 1080) {
|
||||
if (c->funcIndex % 9 == 0) ncclAllGatherTreeLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 1) ncclAllGatherTreeLL128_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 2) ncclAllGatherTree_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 3) ncclAllGatherRingLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 4) ncclAllGatherRingLL128_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 5) ncclAllGatherRing_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 6) ncclAllGatherCollNetLL_copy_i8(&c->args);
|
||||
else if (c->funcIndex % 9 == 7) ncclAllGatherCollNetLL128_copy_i8(&c->args);
|
||||
else ncclAllGatherCollNet_copy_i8(&c->args);
|
||||
if (c->funcIndex % 9 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 6) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t(c);
|
||||
else if (c->funcIndex % 9 == 7) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t(c);
|
||||
else ncclFunction_AllGather_COLLNET_SIMPLE_Sum_int8_t(c);
|
||||
}
|
||||
else if (c->funcIndex < 1800) Caller<1080, 1800>::call(c);
|
||||
else if (c->funcIndex == 1800) {
|
||||
ncclGather_copy_i8(&c->args);
|
||||
}
|
||||
else if (c->funcIndex == 1801) {
|
||||
ncclScatter_copy_i8(&c->args);
|
||||
}
|
||||
else if (c->funcIndex == 1802) {
|
||||
ncclAllToAll_copy_i8(&c->args);
|
||||
}
|
||||
else if (c->funcIndex == 1803) {
|
||||
ncclAllToAllv_copy_i8(&c->args);
|
||||
}
|
||||
else ncclSendRecv_copy_i8(&c->args);
|
||||
else if (c->funcIndex == 1800) ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_t(c);
|
||||
else if (c->funcIndex == 1801) ncclFunction_AllToAll_RING_SIMPLE_Sum_int8_t(c);
|
||||
else if (c->funcIndex == 1802) ncclFunction_AllToAllv_RING_SIMPLE_Sum_int8_t(c);
|
||||
}
|
||||
|
||||
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) {
|
||||
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
|
||||
int* d = (int*)dst;
|
||||
int* s = (int*)src;
|
||||
for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
|
||||
}
|
||||
|
||||
static __device__ bool load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, struct ncclDevComm* comm, uint32_t* abortCount) {
|
||||
// Check whether the last operation was aborted and make sure all threads exit
|
||||
int abort = tid == 0 ? *(comm->abortFlag) : 0;
|
||||
exitIfAbortBarrier(abort, abortCount);
|
||||
load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid, abortCount);
|
||||
static __device__ bool load_coll(struct ncclWork* localWork, struct ncclWork* hostWork, int tid, struct ncclDevComm* comm, uint32_t* abortCount) {
|
||||
__syncthreads();
|
||||
if (tid == 0) hostColl->active = 0;
|
||||
load_parallel(localWork, hostWork, sizeof(struct ncclWork), tid);
|
||||
// Check whether the last operation was aborted and make sure all threads exit
|
||||
int abort = tid == 0 ? LOAD(comm->abortFlag) : 0;
|
||||
exitIfAbortBarrier(abort, abortCount);
|
||||
if (tid == 0) hostWork->elems[0].active = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL>
|
||||
class ncclFunction {
|
||||
public:
|
||||
__device__ void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
#define traceColl(fIdx) \
|
||||
uint32_t pos = __atomic_fetch_add(comm->collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
comm->collTrace[pos].timeStamp = __rtc64(); \
|
||||
comm->collTrace[pos].opCount = localColl.args.opCount; \
|
||||
comm->collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
comm->collTrace[pos].opCount = w->opCount; \
|
||||
comm->collTrace[pos].bid = bid; \
|
||||
comm->collTrace[pos].funcIndex = fIdx;
|
||||
comm->collTrace[pos].funcIndex = fIdx; \
|
||||
if (fIdx == FUNC_INDEX_P2P) { \
|
||||
comm->collTrace[pos].p2p.nThreads = w->p2p.nThreads; \
|
||||
comm->collTrace[pos].p2p.delta = (uint16_t)(w->p2p.delta); \
|
||||
} else if (fIdx == FUNC_INDEX_A2AV) { \
|
||||
comm->collTrace[pos].coll.nThreads = w->nThreads; \
|
||||
comm->collTrace[pos].coll.bid = w->a2av.bid; \
|
||||
comm->collTrace[pos].coll.nChannels = w->a2av.nChannels; \
|
||||
} else { \
|
||||
comm->collTrace[pos].coll.nThreads = w->nThreads; \
|
||||
comm->collTrace[pos].coll.bid = w->coll.bid; \
|
||||
comm->collTrace[pos].coll.nChannels = w->coll.nChannels; \
|
||||
}
|
||||
#define traceKernelLaunch(fIdx) { \
|
||||
traceColl(fIdx); \
|
||||
comm->collTrace[pos].type = ncclCollTraceKernelLaunchType; \
|
||||
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (comm->collTrace[pos].data_0)); \
|
||||
comm->collTrace[pos].type = ncclCollTraceKernelLaunchType; \
|
||||
}
|
||||
#define traceCollEnd(fIdx) { \
|
||||
traceColl(fIdx); \
|
||||
@@ -250,124 +235,188 @@ static __device__ bool load_coll(struct ncclColl* localColl, struct ncclColl* ho
|
||||
traceColl(fIdx); \
|
||||
comm->collTrace[pos].type = ncclCollTraceAbortType; \
|
||||
}
|
||||
// traceData(int16_t data2, uint32_t data4, uint64_t data8_0, uint64_t data8_1)
|
||||
#define traceData(data2, data4, data8_0, data8_1) { \
|
||||
uint32_t pos = __atomic_fetch_add(comm->collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
comm->collTrace[pos].bid = blockIdx.x; \
|
||||
comm->collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
comm->collTrace[pos].funcIndex = data2; \
|
||||
comm->collTrace[pos].data_0 = data4; \
|
||||
comm->collTrace[pos].opCount = data8_0; \
|
||||
comm->collTrace[pos].data_1 = data8_1; \
|
||||
comm->collTrace[pos].type = ncclCollTraceDataType; \
|
||||
}
|
||||
#else
|
||||
#define traceKernelLaunch()
|
||||
#define traceCollEnd()
|
||||
#define traceAbort()
|
||||
#define traceKernelLaunch(fIdx)
|
||||
#define traceCollEnd(fIdx)
|
||||
#define traceAbort(fIdx)
|
||||
#define traceData(data2, data4, data8_0, data8_1)
|
||||
#endif
|
||||
|
||||
extern __device__ volatile uint64_t* ncclShmem;
|
||||
#define MAXWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
|
||||
|
||||
struct ncclShmemPtrs {
|
||||
void* srcs[NCCL_MAX_DEV_ARITY+1];
|
||||
void* dsts[NCCL_MAX_DEV_ARITY+1];
|
||||
uint64_t barrier;
|
||||
uint64_t barrier_next[MAXWARPS];
|
||||
};
|
||||
|
||||
struct ncclShmemData {
|
||||
union {
|
||||
#ifdef ENABLE_LL128
|
||||
#define ALLOCATE_SHMEM \
|
||||
__shared__ volatile uint64_t shmem[NCCL_LL128_SHMEM_SIZE]; \
|
||||
ncclShmem = shmem; \
|
||||
__shared__ uint32_t sync[NCCL_LL128_MAX_NTHREADS/WARP_SIZE];
|
||||
volatile uint64_t data[NCCL_LL128_SHMEM_SIZE];
|
||||
#else
|
||||
#define ALLOCATE_SHMEM \
|
||||
uint32_t* sync = 0;
|
||||
volatile uint64_t* data;
|
||||
#endif
|
||||
struct ncclShmemPtrs ptrs[NCCL_MAX_GROUPS];
|
||||
};
|
||||
uint32_t sync[MAXWARPS];
|
||||
struct ncclWork localWork;
|
||||
};
|
||||
|
||||
/* Functions for aggregation case */
|
||||
#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
|
||||
__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
|
||||
coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(args); \
|
||||
extern __device__ struct ncclShmemData *ncclShmem;
|
||||
template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL, int FINDEX, bool COLLTRACE>
|
||||
__device__ void ncclKernel(struct ncclWorkElem first) {
|
||||
int tid = threadIdx.x;
|
||||
int bid = blockIdx.x;
|
||||
__shared__ struct ncclShmemData shmem;
|
||||
ncclShmem = &shmem;
|
||||
__shared__ uint32_t abortCount;
|
||||
if (tid == 0) {
|
||||
abortCount = 0;
|
||||
for (auto i = 0; i < NCCL_MAX_GROUPS; i++) {
|
||||
shmem.ptrs[i].barrier = 0;
|
||||
for (auto j = 0; j < MAXWARPS; j++) shmem.ptrs[i].barrier_next[j] = 0;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
auto f = ncclFunction<FUNCTION, ALGO, PROTO, REDOP, T, UNROLL>();
|
||||
|
||||
struct ncclDevComm* comm = first.comm;
|
||||
struct ncclChannel* channel = comm->channels+bid;
|
||||
struct ncclWorkElem* w = NULL;
|
||||
uint16_t index = first.index;
|
||||
bool firstLaunch = true;
|
||||
|
||||
if (bid == 0 && first.funcIndex != FUNC_INDEX_P2P) w = &first;
|
||||
|
||||
while (1) {
|
||||
if (w == NULL) {
|
||||
w = shmem.localWork.elems;
|
||||
if (!load_coll(&shmem.localWork, channel->workFifo+index, tid, comm, &abortCount)) {
|
||||
if (COLLTRACE && tid == 0) traceAbort(0xffff);
|
||||
return;
|
||||
}
|
||||
if (COLLTRACE && tid == 0) {
|
||||
if (firstLaunch) traceKernelLaunch(w->funcIndex);
|
||||
if (!firstLaunch) traceCollEnd(w->funcIndex);
|
||||
firstLaunch = false;
|
||||
}
|
||||
} else if (COLLTRACE && tid == 0) {
|
||||
traceKernelLaunch(w->funcIndex);
|
||||
firstLaunch = false;
|
||||
}
|
||||
if (tid < w->nThreads) {
|
||||
if (w->funcIndex == FINDEX) {
|
||||
f.run(w);
|
||||
} else {
|
||||
NCCL_CALL_FUNCTIONS(w);
|
||||
}
|
||||
}
|
||||
index = (index+1) % NCCL_MAX_OPS;
|
||||
if (w->active == 2) {
|
||||
if (COLLTRACE && tid == 0) traceCollEnd(0xffff);
|
||||
return;
|
||||
}
|
||||
w = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/* Kernels with the first operation inlined */
|
||||
#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
|
||||
#define IMPL_COLL_KERN(func, algo, proto, redop, type, fIndex) \
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
|
||||
__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclDevComm* comm) { \
|
||||
int tid = threadIdx.x; \
|
||||
int bid = blockIdx.x; \
|
||||
ALLOCATE_SHMEM; \
|
||||
__shared__ struct ncclColl localColl; \
|
||||
__shared__ uint32_t abortCount; \
|
||||
__shared__ uint64_t barrier[MAXBARRIERS]; \
|
||||
__shared__ uint64_t barrier_next[MAXBARRIERS*MAXWARPS]; \
|
||||
if (tid == 0) abortCount = 0; \
|
||||
__syncthreads(); \
|
||||
\
|
||||
struct ncclChannel* channel = comm->channels+bid; \
|
||||
if (tid == 0) { \
|
||||
channel->sync = sync; \
|
||||
channel->barrier = barrier; \
|
||||
channel->barrier_next = barrier_next; \
|
||||
for (auto i = 0; i < MAXBARRIERS; i++) barrier[i] = 0; \
|
||||
for (auto i = 0; i < MAXBARRIERS*MAXWARPS; i++) barrier_next[i] = 0; \
|
||||
} \
|
||||
if (!load_coll(&localColl, channel->collectives+channel->collFifoHead, tid, comm, &abortCount)) { \
|
||||
if (tid == 0) traceAbort(-1); \
|
||||
return; \
|
||||
} \
|
||||
if (tid == 0) traceKernelLaunch(localColl.funcIndex); \
|
||||
while (1) { \
|
||||
if (tid < localColl.args.common.nThreads) { \
|
||||
if (localColl.funcIndex == fIndex) { \
|
||||
coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&localColl.args); \
|
||||
} else { \
|
||||
NCCL_CALL_FUNCTIONS(&localColl); \
|
||||
} \
|
||||
} \
|
||||
int nextIndex = localColl.nextIndex; \
|
||||
if (tid == 0) channel->collFifoHead = nextIndex; \
|
||||
\
|
||||
if (localColl.active == 2) { \
|
||||
if (tid == 0) traceCollEnd(-1); \
|
||||
return; \
|
||||
} \
|
||||
\
|
||||
/* Load next collective operation*/ \
|
||||
if (!load_coll(&localColl, channel->collectives+nextIndex, tid, comm, &abortCount)) { \
|
||||
if (tid == 0) traceAbort(-1); \
|
||||
break; \
|
||||
} \
|
||||
if (tid == 0) traceCollEnd(localColl.funcIndex); \
|
||||
} \
|
||||
__global__ void NCCL_KERN_NAME(func, algo, proto, redop, type)(struct ncclWorkElem first) { \
|
||||
if (first.comm->collTraceThread) \
|
||||
ncclKernel<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL, fIndex, true>(first); \
|
||||
else \
|
||||
ncclKernel<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL, fIndex, false>(first); \
|
||||
}
|
||||
|
||||
#define IMPL_COLL_KERN_sum(coll, op, ncclFunc, dtype, ctype, fIndex) \
|
||||
IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
|
||||
#define IMPL_COLL_KERN_copy(coll, op, ncclFunc, dtype, ctype, fIndex) \
|
||||
IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
|
||||
#define IMPL_COLL_KERN_prod(coll, op, ncclFunc, dtype, ctype, fIndex)
|
||||
#define IMPL_COLL_KERN_min(coll, op, ncclFunc, dtype, ctype, fIndex)
|
||||
#define IMPL_COLL_KERN_max(coll, op, ncclFunc, dtype, ctype, fIndex)
|
||||
// Examples : AllReduce, RING, LL, Sum, uint8
|
||||
/* Functions for aggregation case */
|
||||
#define IMPL_COLL_FUNC(func, algo, proto, redop, type) \
|
||||
__device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, redop, type)(struct ncclWorkElem* args) { \
|
||||
auto f = ncclFunction<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL>(); \
|
||||
f.run(args); \
|
||||
}
|
||||
|
||||
// Only generate inline kernels for LL
|
||||
#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
|
||||
IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
|
||||
IMPL_COLL_FUNC(coll##LL128, op, ncclFunc, dtype, ctype) \
|
||||
IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
|
||||
#define IMPL_COLL4(func, algo, redop, type, ncclType) \
|
||||
IMPL_COLL_FUNC(func, algo, LL, redop, type) \
|
||||
IMPL_COLL_FUNC(func, algo, SIMPLE, redop, type)
|
||||
|
||||
#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
|
||||
IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \
|
||||
IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING) \
|
||||
IMPL_COLL4(coll##CollNet, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_COLLNET)
|
||||
#define IMPL_COLL3(func, redop, type, ncclType) \
|
||||
IMPL_COLL4(func, TREE, redop, type, ncclType) \
|
||||
IMPL_COLL4(func, RING, redop, type, ncclType) \
|
||||
IMPL_COLL4(func, COLLNET, redop, type, ncclType)
|
||||
|
||||
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, b16, rccl_bfloat16, ncclColl, ncclOp, ncclBfloat16)
|
||||
#define IMPL_COLL2(func, redop) \
|
||||
IMPL_COLL3(func, redop, int8_t, ncclInt8) \
|
||||
IMPL_COLL3(func, redop, uint8_t, ncclUint8) \
|
||||
IMPL_COLL3(func, redop, int32_t, ncclInt32) \
|
||||
IMPL_COLL3(func, redop, uint32_t, ncclUint32) \
|
||||
IMPL_COLL3(func, redop, int64_t, ncclInt64) \
|
||||
IMPL_COLL3(func, redop, uint64_t, ncclUint64) \
|
||||
IMPL_COLL3(func, redop, half, ncclFloat16) \
|
||||
IMPL_COLL3(func, redop, float, ncclFloat32) \
|
||||
IMPL_COLL3(func, redop, double, ncclFloat64) \
|
||||
IMPL_COLL3(func, redop, rccl_bfloat16, ncclBfloat16)
|
||||
|
||||
// Reduction define all functions
|
||||
#define IMPL_COLL_R(collf, colln) \
|
||||
IMPL_COLL2(collf, sum, FuncSum, colln, ncclSum); \
|
||||
IMPL_COLL2(collf, prod, FuncProd, colln, ncclProd); \
|
||||
IMPL_COLL2(collf, min, FuncMin, colln, ncclMin); \
|
||||
IMPL_COLL2(collf, max, FuncMax, colln, ncclMax);
|
||||
#define IMPL_COLL_R(func) \
|
||||
IMPL_COLL2(func, Sum) \
|
||||
IMPL_COLL2(func, Prod) \
|
||||
IMPL_COLL2(func, Min) \
|
||||
IMPL_COLL2(func, Max)
|
||||
|
||||
// Copy primitives only define one
|
||||
#define IMPL_COLL_C(collf, colln) \
|
||||
IMPL_COLL3(collf, copy, FuncSum, i8, int8_t, colln, ncclSum, ncclInt8);
|
||||
// [RCCL] Define clique-based implementations (repurposed LL128)
|
||||
#define IMPL_COLL4_CLIQUE(func, algo, redop, type, ncclType) \
|
||||
IMPL_COLL_FUNC(func, algo, LL, redop, type) \
|
||||
IMPL_COLL_FUNC(func, algo, LL128, redop, type) \
|
||||
IMPL_COLL_FUNC(func, algo, SIMPLE, redop, type)
|
||||
|
||||
#define COLL_UNROLL 2
|
||||
#define IMPL_COLL3_CLIQUE(func, redop, type, ncclType) \
|
||||
IMPL_COLL4(func, TREE, redop, type, ncclType) \
|
||||
IMPL_COLL4_CLIQUE(func, RING, redop, type, ncclType) \
|
||||
IMPL_COLL4(func, COLLNET, redop, type, ncclType)
|
||||
|
||||
#define IMPL_COLL2_CLIQUE(func, redop) \
|
||||
IMPL_COLL3_CLIQUE(func, redop, int8_t, ncclInt8) \
|
||||
IMPL_COLL3_CLIQUE(func, redop, uint8_t, ncclUint8) \
|
||||
IMPL_COLL3_CLIQUE(func, redop, int32_t, ncclInt32) \
|
||||
IMPL_COLL3_CLIQUE(func, redop, uint32_t, ncclUint32) \
|
||||
IMPL_COLL3_CLIQUE(func, redop, int64_t, ncclInt64) \
|
||||
IMPL_COLL3_CLIQUE(func, redop, uint64_t, ncclUint64) \
|
||||
IMPL_COLL3_CLIQUE(func, redop, half, ncclFloat16) \
|
||||
IMPL_COLL3_CLIQUE(func, redop, float, ncclFloat32) \
|
||||
IMPL_COLL3_CLIQUE(func, redop, double, ncclFloat64) \
|
||||
IMPL_COLL3_CLIQUE(func, redop, rccl_bfloat16, ncclBfloat16)
|
||||
|
||||
#define IMPL_COLL_CLIQUE(func) \
|
||||
IMPL_COLL2_CLIQUE(func, Sum) \
|
||||
IMPL_COLL2_CLIQUE(func, Prod) \
|
||||
IMPL_COLL2_CLIQUE(func, Min) \
|
||||
IMPL_COLL2_CLIQUE(func, Max)
|
||||
// [/RCCL]
|
||||
|
||||
|
||||
// Copy primitives only define one function for copy
|
||||
#define IMPL_COLL_C(func) IMPL_COLL3(func, Sum, int8_t, ncclInt8);
|
||||
|
||||
// Point-to-point primitives only have one function/kernel.
|
||||
#define IMPL_COLL_P(func) \
|
||||
IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t); \
|
||||
IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -282,28 +282,57 @@ inline __device__ void Store128(Pack128* p, Pack128& v) {
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
|
||||
__device__ void ReduceCopyMulti(const int tid, const int nthreads,
|
||||
int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
|
||||
const int offset, const int N) {
|
||||
for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
|
||||
T val = vFetch(srcs[0]+idx);
|
||||
#pragma unroll
|
||||
for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
|
||||
#pragma unroll 1
|
||||
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
|
||||
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
|
||||
__device__ __forceinline__ void ReduceCopyMulti(const int w, const int nw, const int t,
|
||||
int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const int Nelem) {
|
||||
const int inc = nw * UNROLL * WARP_SIZE;
|
||||
int offset = w * UNROLL * WARP_SIZE + t;
|
||||
|
||||
const T* srcs[MAXSRCS];
|
||||
for (int i=0; i<MAXSRCS; i++) srcs[i] = s[i]+elemOffset+offset;
|
||||
T* dsts[MAXDSTS];
|
||||
for (int i=0; i<MAXDSTS; i++) dsts[i] = d[i]+elemOffset+offset;
|
||||
|
||||
while (offset < Nelem) {
|
||||
T vals[UNROLL];
|
||||
// Load and reduce
|
||||
for (int u = 0; u < UNROLL; ++u) vals[u] = vFetch(srcs[0]+u*WARP_SIZE);
|
||||
|
||||
#pragma unroll
|
||||
for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
|
||||
#pragma unroll 1
|
||||
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
|
||||
for (int i=1; i<MINSRCS; i++) {
|
||||
T vals2[UNROLL];
|
||||
for (int u = 0; u < UNROLL; ++u) vals2[u] = vFetch(srcs[i]+u*WARP_SIZE);
|
||||
for (int u = 0; u < UNROLL; ++u) vals[u] = FUNC()(vals[u], vals2[u]);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i=MINSRCS; i<MAXSRCS; i++) {
|
||||
if (i<nsrcs) {
|
||||
T vals2[UNROLL];
|
||||
for (int u = 0; u < UNROLL; ++u) vals2[u] = vFetch(srcs[i]+u*WARP_SIZE);
|
||||
for (int u = 0; u < UNROLL; ++u) vals[u] = FUNC()(vals[u], vals2[u]);
|
||||
}
|
||||
}
|
||||
|
||||
// Store
|
||||
#pragma unroll
|
||||
for (int i = 0; i < MINDSTS; i++) {
|
||||
for (int u = 0; u < UNROLL; ++u) vStore(dsts[i]+u*WARP_SIZE, vals[u]);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i=MINDSTS; i<MAXDSTS; i++) {
|
||||
if (i<ndsts) {
|
||||
for (int u = 0; u < UNROLL; ++u) vStore(dsts[i]+u*WARP_SIZE, vals[u]);
|
||||
}
|
||||
}
|
||||
for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
|
||||
for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
|
||||
offset += inc;
|
||||
}
|
||||
}
|
||||
|
||||
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
|
||||
__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
|
||||
int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
|
||||
const int elemOffset, const int Npack) {
|
||||
__device__ void ReduceCopy128bMulti(const int w, const int nw, const int t,
|
||||
int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const int Npack) {
|
||||
const int inc = nw * UNROLL * WARP_SIZE;
|
||||
int offset = w * UNROLL * WARP_SIZE + t;
|
||||
|
||||
@@ -334,8 +363,10 @@ __device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
|
||||
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
|
||||
}
|
||||
#pragma unroll 1
|
||||
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
|
||||
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
|
||||
for (int i=MINDSTS; i<MAXDSTS; i++) {
|
||||
if (i<ndsts) {
|
||||
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
|
||||
}
|
||||
}
|
||||
for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
|
||||
for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
|
||||
@@ -343,12 +374,10 @@ __device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
template <typename T>
|
||||
__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(int32_t); }
|
||||
#else
|
||||
__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
|
||||
#endif
|
||||
|
||||
#define PACKELEMS (sizeof(Pack128) / sizeof(T))
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
// Multiply UNROLL by 2 if single source/single destination
|
||||
@@ -360,73 +389,62 @@ __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
|
||||
#endif
|
||||
|
||||
template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
|
||||
__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
|
||||
int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
|
||||
__device__ __forceinline__ void ReduceOrCopyMulti(const int tid, const int nthreads,
|
||||
int nsrcs, const T** srcs, int ndsts, T** dsts,
|
||||
int N) {
|
||||
int Nrem = N;
|
||||
if (Nrem <= 0) return;
|
||||
|
||||
int alignDiff = 0;
|
||||
int align = ptrAlign128(srcs[0]);
|
||||
#pragma unroll
|
||||
for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
|
||||
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
|
||||
#pragma unroll
|
||||
for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
|
||||
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
int Npreamble = alignDiff ? Nrem :
|
||||
N < alignof(int32_t) ? N :
|
||||
(alignof(int32_t) - align) % alignof(int32_t);
|
||||
#else
|
||||
int Npreamble = alignDiff ? Nrem :
|
||||
N < alignof(Pack128) ? N :
|
||||
(alignof(Pack128) - align) % alignof(Pack128);
|
||||
#endif
|
||||
|
||||
// stage 1: preamble: handle any elements up to the point of everything coming
|
||||
// into alignment
|
||||
if (Npreamble) {
|
||||
ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
|
||||
Nrem -= Npreamble;
|
||||
if (Nrem == 0) return;
|
||||
}
|
||||
int offset = Npreamble;
|
||||
|
||||
// stage 2: fast path: use 128b loads/stores to do the bulk of the work,
|
||||
// assuming the pointers we have are all 128-bit alignable.
|
||||
int w = tid / WARP_SIZE; // Warp number
|
||||
int nw = nthreads / WARP_SIZE; // Number of warps
|
||||
int t = tid % WARP_SIZE; // Thread (inside the warp)
|
||||
|
||||
const int packFactor = sizeof(Pack128) / sizeof(T);
|
||||
// Check that all is 16B aligned. If not don't use 16B load/stores.
|
||||
int align = 0;
|
||||
#pragma unroll
|
||||
for (int i=0; i<MINSRCS; i++) align |= ptrAlign128(srcs[i]);
|
||||
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) align |= ptrAlign128(srcs[i]);
|
||||
#pragma unroll
|
||||
for (int i=0; i<MINDSTS; i++) align |= ptrAlign128(dsts[i]);
|
||||
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) align |= ptrAlign128(dsts[i]);
|
||||
|
||||
// stage 2a: main loop
|
||||
int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
|
||||
* (AUTOUNROLL * WARP_SIZE); // round down
|
||||
int Nelem2a = Npack2a * packFactor;
|
||||
int offset = 0;
|
||||
if (align == 0) {
|
||||
// fast path: use 128b loads/stores to do the bulk of the work,
|
||||
// assuming the pointers we have are all 128-bit aligned.
|
||||
|
||||
ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);
|
||||
// main loop
|
||||
int Npack = (Nrem / (PACKELEMS*AUTOUNROLL*WARP_SIZE)) * (AUTOUNROLL*WARP_SIZE); // round down
|
||||
int Nelem = Npack * PACKELEMS;
|
||||
|
||||
Nrem -= Nelem2a;
|
||||
ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack);
|
||||
|
||||
Nrem -= Nelem;
|
||||
if (Nrem == 0) return;
|
||||
offset += Nelem;
|
||||
|
||||
// slightly less optimized for section when we don't have full unrolling
|
||||
Npack = Nrem / PACKELEMS;
|
||||
Nelem = Npack * PACKELEMS;
|
||||
|
||||
ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack);
|
||||
|
||||
Nrem -= Nelem;
|
||||
if (Nrem == 0) return;
|
||||
offset += Nelem;
|
||||
}
|
||||
|
||||
// unrolled, by-type (mostly for unaligned buffers)
|
||||
int Nelem = (Nrem / (UNROLL*PACKELEMS/2*WARP_SIZE)) * (UNROLL*PACKELEMS/2*WARP_SIZE); // round down
|
||||
|
||||
ReduceCopyMulti<FUNC, T, UNROLL*PACKELEMS/2, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Nelem);
|
||||
|
||||
Nrem -= Nelem;
|
||||
if (Nrem == 0) return;
|
||||
offset += Nelem2a;
|
||||
offset += Nelem;
|
||||
|
||||
// stage 2b: slightly less optimized for section when we don't have full
|
||||
// unrolling
|
||||
|
||||
int Npack2b = Nrem / packFactor;
|
||||
int Nelem2b = Npack2b * packFactor;
|
||||
|
||||
ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);
|
||||
|
||||
Nrem -= Nelem2b;
|
||||
if (Nrem == 0) return;
|
||||
offset += Nelem2b;
|
||||
|
||||
// stage 2c: tail
|
||||
ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
|
||||
// no unroll, by type. Should finish what's remaining.
|
||||
ReduceCopyMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Nrem);
|
||||
}
|
||||
|
||||
#endif // COMMON_KERNEL_H_
|
||||
|
||||
@@ -9,62 +9,62 @@
|
||||
#include "collectives.h"
|
||||
#include "common.h"
|
||||
|
||||
__device__ volatile uint64_t* ncclShmem;
|
||||
__device__ struct ncclShmemData* ncclShmem;
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
NCCL_FUNC5(coll, op, dtype) \
|
||||
NCCL_COLL_NAME(coll##LL, op, dtype), \
|
||||
NCCL_COLL_NAME(coll##LL128, op, dtype), \
|
||||
NCCL_COLL_NAME(coll, op, dtype)
|
||||
#define NCCL_FUNC5(func, algo, redop, type) \
|
||||
NCCL_FUNC_NAME(func, algo, LL, redop, type), \
|
||||
NCCL_FUNC_NAME(func, algo, LL128, redop, type), \
|
||||
NCCL_FUNC_NAME(func, algo, SIMPLE, redop, type)
|
||||
|
||||
#define NCCL_FUNC4(coll, op, dtype) \
|
||||
NCCL_FUNC5(coll##Tree, op, dtype), \
|
||||
NCCL_FUNC5(coll##Ring, op, dtype), \
|
||||
NCCL_FUNC5(coll##CollNet, op, dtype)
|
||||
#define NCCL_FUNC4(func, redop, type) \
|
||||
NCCL_FUNC5(func, TREE, redop, type), \
|
||||
NCCL_FUNC5(func, RING, redop, type), \
|
||||
NCCL_FUNC5(func, COLLNET, redop, type)
|
||||
|
||||
// Must be consistent with ncclDataType_t
|
||||
#define NCCL_FUNCS3A(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, u8), \
|
||||
NCCL_FUNC4(coll, op, i32), \
|
||||
NCCL_FUNC4(coll, op, u32), \
|
||||
NCCL_FUNC4(coll, op, i64), \
|
||||
NCCL_FUNC4(coll, op, u64), \
|
||||
NCCL_FUNC4(coll, op, f16), \
|
||||
NCCL_FUNC4(coll, op, f32), \
|
||||
NCCL_FUNC4(coll, op, f64)
|
||||
#define NCCL_FUNCS3B(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8)
|
||||
#define NCCL_FUNCS3A(func, redop) \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, uint8_t), \
|
||||
NCCL_FUNC4(func, redop, int32_t), \
|
||||
NCCL_FUNC4(func, redop, uint32_t), \
|
||||
NCCL_FUNC4(func, redop, int64_t), \
|
||||
NCCL_FUNC4(func, redop, uint64_t), \
|
||||
NCCL_FUNC4(func, redop, half), \
|
||||
NCCL_FUNC4(func, redop, float), \
|
||||
NCCL_FUNC4(func, redop, double)
|
||||
#define NCCL_FUNCS3B(func, redop) \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t)
|
||||
|
||||
// Must be consistent with ncclRedOp_t
|
||||
#define NCCL_FUNCS2A(coll) \
|
||||
NCCL_FUNCS3A(coll, sum ), \
|
||||
NCCL_FUNCS3A(coll, prod), \
|
||||
NCCL_FUNCS3A(coll, max ), \
|
||||
NCCL_FUNCS3A(coll, min )
|
||||
#define NCCL_FUNCS2B(coll) \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy)
|
||||
#define NCCL_FUNCS2A(func) \
|
||||
NCCL_FUNCS3A(func, Sum ), \
|
||||
NCCL_FUNCS3A(func, Prod), \
|
||||
NCCL_FUNCS3A(func, Max ), \
|
||||
NCCL_FUNCS3A(func, Min )
|
||||
#define NCCL_FUNCS2B(func) \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum)
|
||||
|
||||
// Must be consistent with ncclFunc_t
|
||||
#define NCCL_FUNCS() { \
|
||||
NCCL_COLL_NAME(ncclSendRecv, copy, i8),\
|
||||
NCCL_FUNCS2B(ncclBroadcast), \
|
||||
NCCL_FUNCS2A(ncclReduce), \
|
||||
NCCL_FUNCS2B(ncclAllGather), \
|
||||
NCCL_FUNCS2A(ncclReduceScatter), \
|
||||
NCCL_FUNCS2A(ncclAllReduce) }
|
||||
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),\
|
||||
NCCL_FUNCS2B(Broadcast), \
|
||||
NCCL_FUNCS2A(Reduce), \
|
||||
NCCL_FUNCS2B(AllGather), \
|
||||
NCCL_FUNCS2A(ReduceScatter), \
|
||||
NCCL_FUNCS2A(AllReduce) }
|
||||
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
__device__ ncclKern_t ncclFuncs[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
|
||||
@@ -72,12 +72,12 @@ __device__ ncclKern_t ncclFuncs[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCC
|
||||
// variable. There is no host pointer to a device-side function, which
|
||||
// confuses clang. This will be fixed in the next clang release.
|
||||
#if __CUDA_ARCH__
|
||||
NCCL_COLL_NAME(ncclSendRecv, copy, i8),
|
||||
NCCL_FUNCS2B(ncclBroadcast),
|
||||
NCCL_FUNCS2A(ncclReduce),
|
||||
NCCL_FUNCS2B(ncclAllGather),
|
||||
NCCL_FUNCS2A(ncclReduceScatter),
|
||||
NCCL_FUNCS2A(ncclAllReduce)
|
||||
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
|
||||
NCCL_FUNCS2B(Broadcast),
|
||||
NCCL_FUNCS2A(Reduce),
|
||||
NCCL_FUNCS2B(AllGather),
|
||||
NCCL_FUNCS2A(ReduceScatter),
|
||||
NCCL_FUNCS2A(AllReduce)
|
||||
#endif
|
||||
};
|
||||
#endif
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "gather.h"
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL_FUNC(ncclGather, copy, FuncSum, i8, int8_t);
|
||||
IMPL_COLL_KERN(ncclGather, copy, FuncSum, i8, int8_t, 0);
|
||||
@@ -1,75 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "devcomm.h"
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclGatherKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int nranks = comm->nRanks;
|
||||
const int bid = args->coll.bid;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * GATHER_CHUNKSTEPS;
|
||||
const int peersPerChan = DIVUP(nranks, nChannels);
|
||||
const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
|
||||
const int root = args->coll.root;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
for (int i = 0; i < peersPerChan; i++) {
|
||||
if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
|
||||
(peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
|
||||
continue;
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
if ((blockIdx.x*peersPerChan+i)%nranks == 0 && rank == root) {
|
||||
const T* sendbuff = thisInput+chunkOffset;
|
||||
T* recvbuff = thisOutput+chunkOffset+rank*size;
|
||||
if (tid < nthreads && sendbuff != recvbuff) {
|
||||
// local copy
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
|
||||
}
|
||||
}
|
||||
else {
|
||||
int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
|
||||
int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
|
||||
int peerNone = -1;
|
||||
if (rank == root) {
|
||||
ncclPrimitives<UNROLL, GATHER_CHUNKSTEPS/GATHER_SLICESTEPS, GATHER_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &peerRecv, &peerNone, NULL, stepSize, channel, comm);
|
||||
|
||||
ssize_t recv_offset = chunkOffset + peerRecv*size;
|
||||
prims.recv(thisOutput+recv_offset, nelem);
|
||||
}
|
||||
else {
|
||||
if (peerSend == root) {
|
||||
ncclPrimitives<UNROLL, GATHER_CHUNKSTEPS/GATHER_SLICESTEPS, GATHER_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &peerNone, &peerSend, NULL, stepSize, channel, comm);
|
||||
|
||||
ssize_t send_offset = chunkOffset;
|
||||
prims.send(thisInput+send_offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -9,29 +9,9 @@
|
||||
#define OP128_H_
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
|
||||
v0=LOAD(ptr);
|
||||
v1=LOAD(ptr+1);
|
||||
}
|
||||
|
||||
inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
|
||||
STORE(ptr, v0);
|
||||
STORE(ptr+1, v1);
|
||||
}
|
||||
|
||||
inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) {
|
||||
return (uint64_t*)shmemGenericPtr;
|
||||
}
|
||||
|
||||
inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) {
|
||||
v0=LOAD(shmemAsmPtr);
|
||||
v1=LOAD(shmemAsmPtr+1);
|
||||
}
|
||||
|
||||
inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) {
|
||||
STORE(shmemAsmPtr, v0);
|
||||
STORE(shmemAsmPtr+1, v1);
|
||||
}
|
||||
#else
|
||||
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
|
||||
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
|
||||
|
||||
@@ -32,87 +32,79 @@
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define barrier_by_id(id) do { \
|
||||
#define barrier_by_group() do { \
|
||||
const int w = threadIdx.x/WARP_SIZE; \
|
||||
barrier_next[id*MAXWARPS+w] += nthreads/WARP_SIZE; \
|
||||
__atomic_fetch_add(barriers+id, 1, __ATOMIC_SEQ_CST); \
|
||||
while (LOAD(barriers+id) < barrier_next[id*MAXWARPS+w]) /* spin */; \
|
||||
const int wid = threadIdx.x%WARP_SIZE; \
|
||||
if (wid == 0) { \
|
||||
barrier_next[w] += nthreads/WARP_SIZE; \
|
||||
__atomic_fetch_add(barriers, 1, __ATOMIC_SEQ_CST); \
|
||||
while (LOAD(barriers) < barrier_next[w]) /* spin */; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define ROLE_SRC 0x01
|
||||
#define ROLE_DST 0x02
|
||||
#define ROLE_WAIT_RECV 0x04
|
||||
#define ROLE_WAIT_SEND 0x08
|
||||
#define ROLE_POST_SEND 0x10
|
||||
#define ROLE_POST_RECV 0x20
|
||||
|
||||
// Implementation of primitive types
|
||||
template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, int DIRECT, class FUNC>
|
||||
class ncclPrimitives {
|
||||
private:
|
||||
const int tid;
|
||||
const int nthreads;
|
||||
const int wid;
|
||||
int nthreads;
|
||||
int nworkers;
|
||||
const int stepSize;
|
||||
int nrecv = 0;
|
||||
int nsend = 0;
|
||||
struct ncclConnInfo* recvConn = NULL;
|
||||
volatile uint64_t* recvConnHeadPtr = NULL;
|
||||
uint64_t recvConnHead;
|
||||
volatile uint64_t* recvConnTailPtr = NULL;
|
||||
uint64_t recvConnTail;
|
||||
uint64_t recvConnTailCache; // Cache last seen value
|
||||
struct ncclConnInfo* conn = NULL;
|
||||
volatile int* connSizesFifoPtr = NULL;
|
||||
void** connPtrsFifoPtr = NULL;
|
||||
volatile uint64_t* connHeadPtr = NULL;
|
||||
volatile uint64_t* connTailPtr = NULL;
|
||||
uint64_t connTailCache; // Cache last seen value
|
||||
uint64_t connHeadCache; // Cache last seen value
|
||||
|
||||
struct ncclConnInfo* sendConn = NULL;
|
||||
volatile int* sendConnFifoPtr = NULL;
|
||||
volatile uint64_t* sendConnTailPtr = NULL;
|
||||
uint64_t sendConnTail;
|
||||
volatile uint64_t* sendConnHeadPtr = NULL;
|
||||
uint64_t sendConnHead;
|
||||
uint64_t sendConnHeadCache; // Cache last seen value
|
||||
|
||||
uint64_t recvStep[NRECV];
|
||||
uint64_t sendStep[NSEND];
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
const T* recvDirectBuff[NRECV];
|
||||
T* sendDirectBuff[NSEND];
|
||||
#endif
|
||||
const T* recvBuff[NRECV];
|
||||
T* sendBuff[NSEND];
|
||||
int index; // Peer index I'm responsible for
|
||||
int peer = -1;
|
||||
int role = 0;
|
||||
int group;
|
||||
uint64_t step;
|
||||
T* direct = NULL;
|
||||
T* buff;
|
||||
struct ncclDevComm* comm;
|
||||
|
||||
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
|
||||
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
|
||||
inline __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
|
||||
inline __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
|
||||
const T** srcs;
|
||||
T** dsts;
|
||||
|
||||
uint64_t* barriers;
|
||||
uint64_t* barrier_next;
|
||||
|
||||
// Don't use barrier 0 as it's used by the final sync
|
||||
inline __device__ void barrier() {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
if (wid == 0) {
|
||||
if (NRECV < NSEND) barrier_by_id(0);
|
||||
else barrier_by_id(1);
|
||||
}
|
||||
if (nthreads == WARP_SIZE) __syncwarp();
|
||||
else barrier_by_group();
|
||||
#else
|
||||
if (NSEND>NRECV) {
|
||||
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads+WARP_SIZE));
|
||||
} else {
|
||||
asm volatile ("bar.sync 2, %0;" :: "r"(nthreads+WARP_SIZE));
|
||||
}
|
||||
if (nthreads == WARP_SIZE) __syncwarp();
|
||||
else asm volatile ("bar.sync %0, %1;" :: "r"(group+1), "r"(nthreads));
|
||||
#endif
|
||||
}
|
||||
|
||||
inline __device__ void subBarrier() {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
__syncthreads();
|
||||
barrier();
|
||||
#else
|
||||
if (NSEND>NRECV) {
|
||||
asm volatile ("bar.sync 3, %0;" :: "r"(nthreads));
|
||||
} else {
|
||||
asm volatile ("bar.sync 4, %0;" :: "r"(nthreads));
|
||||
}
|
||||
if (nworkers == nthreads) barrier();
|
||||
else asm volatile ("bar.sync %0, %1;" :: "r"(group+2), "r"(nworkers));
|
||||
#endif
|
||||
}
|
||||
|
||||
uint32_t spins = 0;
|
||||
uint32_t abort = 0;
|
||||
|
||||
inline __device__ int checkAbort(int i, int send) {
|
||||
inline __device__ int checkAbort() {
|
||||
spins++;
|
||||
if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) {
|
||||
abort = LOAD(comm->abortFlag);
|
||||
@@ -121,90 +113,54 @@ class ncclPrimitives {
|
||||
return abort;
|
||||
}
|
||||
|
||||
inline __device__ void waitSend(int nbytes) {
|
||||
spins = 0;
|
||||
if (sendConnHeadPtr) {
|
||||
while (sendConnHeadCache + NCCL_STEPS < sendConnHead + SLICESTEPS) {
|
||||
sendConnHeadCache = LOAD(sendConnHeadPtr);
|
||||
if (checkAbort(wid, 1)) break;
|
||||
}
|
||||
if (sendConnFifoPtr) {
|
||||
STORE(sendConnFifoPtr+sendConnHead%NCCL_STEPS, nbytes);
|
||||
}
|
||||
sendConnHead += SLICESTEPS;
|
||||
}
|
||||
template <int DIRECTPTR>
|
||||
inline __device__ T* directPtr(ssize_t directOffset) {
|
||||
return DIRECTPTR && direct ? direct+directOffset : buff+(step%NCCL_STEPS)*stepSize;
|
||||
}
|
||||
|
||||
inline __device__ void waitRecv() {
|
||||
template <int DST, int DIRECTSEND>
|
||||
inline __device__ void waitSend(ssize_t directOffset, int nbytes) {
|
||||
spins = 0;
|
||||
if (recvConnTailPtr) {
|
||||
#ifdef ENABLE_PROFILING
|
||||
uint64_t t0 = __rtc64();
|
||||
#endif
|
||||
while (recvConnTailCache < recvConnTail + SLICESTEPS) {
|
||||
recvConnTailCache = LOAD(recvConnTailPtr);
|
||||
if (checkAbort(wid, 0)) break;
|
||||
}
|
||||
#ifdef ENABLE_PROFILING
|
||||
__atomic_fetch_add(&comm->devProf->wait_recv_cycle[blockIdx.x], __rtc64() - t0, __ATOMIC_SEQ_CST);
|
||||
#endif
|
||||
recvConnTail += SLICESTEPS;
|
||||
while (connHeadCache + NCCL_STEPS < step + SLICESTEPS) {
|
||||
connHeadCache = LOAD(connHeadPtr);
|
||||
if (checkAbort()) break;
|
||||
}
|
||||
if (connSizesFifoPtr) {
|
||||
STORE(connSizesFifoPtr+step%NCCL_STEPS, nbytes);
|
||||
}
|
||||
|
||||
if (connPtrsFifoPtr) dsts[DST+index] = ((T **)connPtrsFifoPtr)[step%NCCL_STEPS];
|
||||
else dsts[DST+index] = directPtr<DIRECTSEND>(directOffset);
|
||||
step += SLICESTEPS;
|
||||
}
|
||||
|
||||
inline __device__ void incRecv(int i) {
|
||||
recvStep[i] += SLICESTEPS;
|
||||
template <int SRC, int DIRECTRECV>
|
||||
inline __device__ void waitRecv(ssize_t directOffset) {
|
||||
spins = 0;
|
||||
#ifdef ENABLE_PROFILING
|
||||
uint64_t t0 = __builtin_amdgcn_s_memrealtime();
|
||||
#endif
|
||||
while (connTailCache < step + SLICESTEPS) {
|
||||
connTailCache = LOAD(connTailPtr);
|
||||
if (checkAbort()) break;
|
||||
}
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_recv_cycle[blockIdx.x], __builtin_amdgcn_s_memrealtime() - t0, __ATOMIC_SEQ_CST);
|
||||
#endif
|
||||
if (connPtrsFifoPtr) srcs[SRC+index] = ((T **)connPtrsFifoPtr)[step%NCCL_STEPS];
|
||||
else srcs[SRC+index] = directPtr<DIRECTRECV>(directOffset);
|
||||
step += SLICESTEPS;
|
||||
}
|
||||
|
||||
inline __device__ void postRecv() {
|
||||
if (recvConnHeadPtr) STORE(recvConnHeadPtr, recvConnHead += SLICESTEPS);
|
||||
STORE(connHeadPtr, step += SLICESTEPS);
|
||||
}
|
||||
|
||||
inline __device__ void incSend(int i) {
|
||||
sendStep[i] += SLICESTEPS;
|
||||
}
|
||||
inline __device__ void postSend() {
|
||||
if (sendConnTailPtr) {
|
||||
if (sendConn->next_hdp_reg) STORE(sendConn->next_hdp_reg, 0x1);
|
||||
STORE(sendConnTailPtr, sendConnTail += SLICESTEPS);
|
||||
}
|
||||
if (conn->next_hdp_reg) STORE(conn->next_hdp_reg, 0x1);
|
||||
STORE(connTailPtr, step += SLICESTEPS);
|
||||
}
|
||||
|
||||
template <int DIRECTRECV>
|
||||
inline __device__ const T* directRecvPtr(int i, ssize_t directOffset) {
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i);
|
||||
#else
|
||||
return recvPtr(i);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int DIRECTSEND>
|
||||
inline __device__ T* directSendPtr(int i, ssize_t directOffset) {
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
|
||||
#else
|
||||
return sendPtr(i);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int DIRECTRECV>
|
||||
inline __device__ int directRecvInc(int i, int directInc, int sliceInc) {
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
return DIRECTRECV && recvDirectBuff[i] ? directInc : sliceInc;
|
||||
#else
|
||||
return sliceInc;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int DIRECTSEND>
|
||||
inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
return DIRECTSEND && sendDirectBuff[i] ? directInc : sliceInc;
|
||||
#else
|
||||
return sliceInc;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
|
||||
inline __device__ void
|
||||
GenericOp(const T* srcPtr, T* dstPtr, int nelem, ssize_t directOffset) {
|
||||
@@ -212,148 +168,126 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
|
||||
int sliceSize = stepSize*SLICESTEPS;
|
||||
int dataSize = max(DIVUP(nelem, 16*SLICESPERCHUNK)*16, sliceSize/32);
|
||||
|
||||
const T* srcs[RECV*NRECV+SRC];
|
||||
srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
|
||||
if (RECV) {
|
||||
if (SRC) srcs[1] = recvPtr(0);
|
||||
for (int i=1; i<NRECV && i<nrecv; i++) srcs[SRC+i] = recvPtr(i);
|
||||
}
|
||||
|
||||
T* dsts[SEND*NSEND+DST];
|
||||
dsts[0] = DST ? dstPtr : directSendPtr<DIRECTSEND>(0, directOffset);
|
||||
if (SEND) {
|
||||
if (DST) dsts[1] = directSendPtr<DIRECTSEND>(0, directOffset);
|
||||
for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
|
||||
int realSize = max(0, min(dataSize, nelem-offset));
|
||||
#ifdef ENABLE_PROFILING
|
||||
uint64_t t0 = __rtc64();
|
||||
uint64_t t0 = __builtin_amdgcn_s_memrealtime();
|
||||
#endif
|
||||
if (SEND) waitSend(realSize*sizeof(T));
|
||||
if (RECV) waitRecv();
|
||||
if (realSize > 0) {
|
||||
barrier();
|
||||
if (tid < nworkers) {
|
||||
if (SRC && (role & ROLE_SRC)) srcs[0] = srcPtr+offset;
|
||||
if (RECV && (role & ROLE_WAIT_RECV)) waitRecv<SRC, DIRECTRECV>(directOffset+offset);
|
||||
if (DST && (role & ROLE_DST)) dsts[0] = dstPtr+offset;
|
||||
if (SEND && (role & ROLE_WAIT_SEND)) waitSend<DST, DIRECTSEND>(directOffset+offset, realSize*sizeof(T));
|
||||
if (realSize > 0) {
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_cycle[blockIdx.x], __rtc64() - t0, __ATOMIC_SEQ_CST);
|
||||
if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_cycle[blockIdx.x], __builtin_amdgcn_s_memrealtime() - t0, __ATOMIC_SEQ_CST);
|
||||
#endif
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
if (DIRECTRECV && recvDirectBuff[0]) {
|
||||
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
|
||||
if (SEND) {
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
|
||||
}
|
||||
} else {
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
|
||||
subBarrier();
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nworkers, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
|
||||
}
|
||||
#else
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
|
||||
#endif
|
||||
}
|
||||
barrier();
|
||||
FOR_SEND(incSend);
|
||||
FOR_RECV(incRecv);
|
||||
if (tid >= nthreads-WARP_SIZE) {
|
||||
if (SEND) {
|
||||
if (realSize > 0 && wid == 0) __threadfence_system();
|
||||
__syncwarp();
|
||||
postSend();
|
||||
}
|
||||
if (RECV) postRecv();
|
||||
}
|
||||
srcs[0] += SRC ? realSize : directRecvInc<DIRECTRECV>(0, realSize, sliceSize);
|
||||
for (int i=1-SRC; i<RECV*NRECV; i++) srcs[SRC+i] += sliceSize;
|
||||
dsts[0] += DST ? realSize : directSendInc<DIRECTSEND>(0, realSize, sliceSize);
|
||||
for (int i=1-DST; i<SEND*NSEND; i++) dsts[DST+i] += directSendInc<DIRECTSEND>(i, realSize, sliceSize);
|
||||
if (SEND && (role & ROLE_POST_SEND) && realSize > 0 && index == 0) __threadfence_system();
|
||||
__syncwarp();
|
||||
if (SEND && (role & ROLE_POST_SEND)) postSend();
|
||||
if (RECV && (role & ROLE_POST_RECV)) postRecv();
|
||||
offset += realSize;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
|
||||
recvBuff[i] = (const T*)LOAD(conn->buffs+NCCL_PROTO_SIMPLE);
|
||||
recvStep[i] = LOAD(&conn->step);
|
||||
recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
recvDirectBuff[i] = NULL;
|
||||
if (DIRECT && LOAD((&conn->direct) & NCCL_DIRECT_GPU)) {
|
||||
recvDirectBuff[i] = directBuff;
|
||||
if (tid == 0) STORE(conn->ptrExchange, directBuff);
|
||||
}
|
||||
#endif
|
||||
if (wid == i) recvConn = conn;
|
||||
if (wid == i) recvConnTail = recvConnHead = recvStep[i]; // Make sure we set this after rounding up
|
||||
nrecv++;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void loadRecvSync() {
|
||||
if (tid >= WARP_SIZE && tid < 2*WARP_SIZE && wid<nrecv) {
|
||||
recvConnTailPtr = LOAD(&recvConn->tail);
|
||||
recvConnTailCache = LOAD(recvConnTailPtr);
|
||||
}
|
||||
if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
|
||||
recvConnHeadPtr = LOAD(&recvConn->head);
|
||||
// Return credits in case we rounded up.
|
||||
STORE(recvConnHeadPtr, recvConnHead);
|
||||
__device__ __forceinline__ void loadRecvConn(struct ncclChannel* channel, T* directBuff) {
|
||||
if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) {
|
||||
conn = &channel->devPeers[peer].recv.conn;
|
||||
step = conn->step;
|
||||
step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
|
||||
if (role & ROLE_POST_RECV) {
|
||||
connHeadPtr = conn->head;
|
||||
// Return credits in case we rounded up.
|
||||
STORE(connHeadPtr, step);
|
||||
}
|
||||
if (role & ROLE_WAIT_RECV) {
|
||||
buff = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
//if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
|
||||
// direct = directBuff;
|
||||
// *conn->ptrExchange = directBuff;
|
||||
//}
|
||||
connTailPtr = conn->tail;
|
||||
connTailCache = LOAD(connTailPtr);
|
||||
connPtrsFifoPtr = conn->ptrsFifo;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
|
||||
sendBuff[i] = (T*)LOAD(conn->buffs+NCCL_PROTO_SIMPLE);
|
||||
sendStep[i] = LOAD(&conn->step);
|
||||
sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
sendDirectBuff[i] = NULL;
|
||||
if (DIRECT && LOAD((&conn->direct) & NCCL_DIRECT_GPU)) {
|
||||
void* volatile* ptr = LOAD(&conn->ptrExchange);
|
||||
while ((sendDirectBuff[i] = (T*)(LOAD(ptr))) == NULL);
|
||||
barrier();
|
||||
if (tid == 0) STORE(ptr, NULL);
|
||||
}
|
||||
#endif
|
||||
if (wid == i) sendConn = conn;
|
||||
if (wid == i) sendConnTail = sendConnHead = sendStep[i]; // Make sure we set this after rounding up
|
||||
nsend++;
|
||||
}
|
||||
__device__ __forceinline__ void loadSendSync() {
|
||||
if (tid < nsend) {
|
||||
sendConnHeadPtr = LOAD(&sendConn->head);
|
||||
sendConnHeadCache = LOAD(sendConnHeadPtr);
|
||||
sendConnFifoPtr = LOAD(&sendConn->fifo);
|
||||
}
|
||||
if (tid >= nthreads-WARP_SIZE && wid < nsend) {
|
||||
sendConnTailPtr = LOAD(&sendConn->tail);
|
||||
__device__ __forceinline__ void loadSendConn(struct ncclChannel* channel) {
|
||||
if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) {
|
||||
conn = &channel->devPeers[peer].send.conn;
|
||||
step = conn->step;
|
||||
step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
|
||||
if (role & ROLE_POST_SEND) {
|
||||
connTailPtr = conn->tail;
|
||||
}
|
||||
if (role & ROLE_WAIT_SEND) {
|
||||
buff = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
//if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
|
||||
// void* volatile* ptr = conn->ptrExchange;
|
||||
// while ((direct = (T*)(*ptr)) == NULL);
|
||||
// *ptr = NULL;
|
||||
//}
|
||||
connHeadPtr = conn->head;
|
||||
connHeadCache = LOAD(connHeadPtr);
|
||||
connSizesFifoPtr = conn->sizesFifo;
|
||||
connPtrsFifoPtr = conn->ptrsFifo;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void saveRecvSync() {
|
||||
if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
|
||||
STORE(&recvConn->step, recvConnHead);
|
||||
__threadfence_system();
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void saveSendSync() {
|
||||
if (tid < nsend) {
|
||||
STORE(&sendConn->step, sendConnHead);
|
||||
__device__ __forceinline__ void saveSync() {
|
||||
if (role & (ROLE_POST_SEND|ROLE_POST_RECV)) {
|
||||
conn->step = step;
|
||||
__threadfence_system();
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
__device__ __forceinline__
|
||||
ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm)
|
||||
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepSize(stepSize) {
|
||||
barriers = channel->barrier;
|
||||
barrier_next = channel->barrier_next;
|
||||
ncclPrimitives(const int tid, const int nworkers, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, struct ncclShmemPtrs* ptrs, int group)
|
||||
: comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[group].srcs), dsts((T**)ptrs[group].dsts), group(group), barriers(&ptrs[group].barrier), barrier_next(ptrs[group].barrier_next) {
|
||||
nthreads = nworkers;
|
||||
// For send operations, we need an extra warp to overlap the threadfence and the copy
|
||||
// int postThreads = NSEND && nworkers >= 64 ? WARP_SIZE : 0;
|
||||
// nthreads += postThreads;
|
||||
|
||||
// Make sure step is updated before we read it.
|
||||
barrier();
|
||||
|
||||
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, 0);
|
||||
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
|
||||
loadRecvSync();
|
||||
loadSendSync();
|
||||
for (int i=0; i<NRECV; i++) if (recvPeers[i] != -1) nrecv++;
|
||||
for (int i=0; i<NSEND; i++) if (sendPeers[i] != -1) nsend++;
|
||||
|
||||
#define SYNC_GROUP 8
|
||||
static_assert(NSEND < SYNC_GROUP && NRECV < SYNC_GROUP, "Not enough threads to cover all peers");
|
||||
|
||||
int g = tid / SYNC_GROUP;
|
||||
int ng = nthreads / SYNC_GROUP;
|
||||
index = tid % SYNC_GROUP;
|
||||
|
||||
if (g == 0) {
|
||||
if (index < nrecv) role |= ROLE_WAIT_RECV;
|
||||
if (index == nrecv) role |= ROLE_SRC;
|
||||
} else if (g == 1) {
|
||||
if (index < nsend) role |= ROLE_WAIT_SEND;
|
||||
if (index == nsend) role |= ROLE_DST;
|
||||
} else if (g == ng - 2) {
|
||||
if (index < nrecv) role |= ROLE_POST_RECV;
|
||||
} else if (g == ng - 1) {
|
||||
if (index < nsend) role |= ROLE_POST_SEND;
|
||||
}
|
||||
|
||||
if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) peer = recvPeers[index];
|
||||
if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) peer = sendPeers[index];
|
||||
|
||||
loadRecvConn(channel, directBuff);
|
||||
loadSendConn(channel);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void
|
||||
@@ -414,8 +348,7 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
|
||||
|
||||
__device__ __forceinline__ ~ncclPrimitives() {
|
||||
// Save steps for the next operation
|
||||
saveRecvSync();
|
||||
saveSendSync();
|
||||
saveSync();
|
||||
}
|
||||
};
|
||||
|
||||
@@ -424,10 +357,10 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
#define INIT_COUNTER \
|
||||
if (tid == 0) { t0 = __rtc64(); ws = LOAD(&(devProf->wait_cycle[blockIdx.x])); }
|
||||
if (tid == 0) { t0 = __builtin_amdgcn_s_memrealtime(); ws = LOAD(&(devProf->wait_cycle[blockIdx.x])); }
|
||||
|
||||
#define ACCUMULATE_COUNTER(prim) \
|
||||
if (tid == 0) { __atomic_fetch_add(&(devProf->prim##_cycle), __rtc64() - t0 \
|
||||
if (tid == 0) { __atomic_fetch_add(&(devProf->prim##_cycle), __builtin_amdgcn_s_memrealtime() - t0 \
|
||||
+ ws - LOAD(&(devProf->wait_cycle[blockIdx.x])), __ATOMIC_SEQ_CST); \
|
||||
__atomic_fetch_add(&(devProf->prim##_byte), nelem * sizeof(T), __ATOMIC_SEQ_CST); }
|
||||
#else
|
||||
|
||||
@@ -205,7 +205,7 @@ class ncclLLPrimitives {
|
||||
sendConnHeadPtr = LOAD(&sendConn->head);
|
||||
sendConnHeadCache = LOAD(sendConnHeadPtr);
|
||||
sendConnHead = LOAD(&sendConn->step);
|
||||
sendConnFifoPtr = LOAD(&sendConn->fifo);
|
||||
sendConnFifoPtr = LOAD(&sendConn->sizesFifo);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -118,9 +118,14 @@ class ncclLL128Primitives {
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
if (u*WARP_SIZE < maxOffset) {
|
||||
uint64_t v0, v1;
|
||||
load128(src64Ptr+u*WARP_SIZE, v0, v1);
|
||||
storeShmem128(shmemAsmPtr+u*WARP_SIZE, v0, v1);
|
||||
using Vec = uint64_t __attribute__((ext_vector_type(2)));
|
||||
Vec i2;
|
||||
//load128(src64Ptr+u*WARP_SIZE, v0, v1);
|
||||
asm volatile ("flat_load_dwordx4 %0, %1\n"
|
||||
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(src64Ptr+u*WARP_SIZE));
|
||||
//storeShmem128(shmemAsmPtr+u*WARP_SIZE, i2[0], i2[1]);
|
||||
*(shmemAsmPtr+u*WARP_SIZE) = i2[0];
|
||||
*(shmemAsmPtr+u*WARP_SIZE+1) = i2[1];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -135,15 +140,24 @@ class ncclLL128Primitives {
|
||||
|
||||
template <int ELEMS_PER_THREAD>
|
||||
inline __device__ void storeShmemToDst128(int maxOffset, uint64_t* dst64Ptr) {
|
||||
uint64_t v[ELEMS_PER_THREAD];
|
||||
using Velem = uint64_t __attribute__((ext_vector_type(ELEMS_PER_THREAD)));
|
||||
Velem v;
|
||||
uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
loadShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
|
||||
v[u] = *(shmemAsmPtr+u*WARP_SIZE);
|
||||
v[u+1] = *(shmemAsmPtr+u*WARP_SIZE+1);
|
||||
//loadShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
if (u*WARP_SIZE < maxOffset) store128(dst64Ptr+u*WARP_SIZE, v[u], v[u+1]);
|
||||
//if (u*WARP_SIZE < maxOffset) store128(dst64Ptr+u*WARP_SIZE, v[u], v[u+1]);
|
||||
using Vec = uint64_t __attribute__((ext_vector_type(2)));
|
||||
Vec i2;
|
||||
i2[0] = v[u];
|
||||
i2[1] = v[u+1];//
|
||||
if (u*WARP_SIZE < maxOffset) asm volatile ("flat_store_dwordx4 %0, %1\n"
|
||||
"s_waitcnt vmcnt(0)\n" : : "v"(dst64Ptr+u*WARP_SIZE), "v"(i2));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -176,45 +190,52 @@ class ncclLL128Primitives {
|
||||
uint64_t flag = recvFlag(0);
|
||||
uint64_t* ptr = recvPtr(0)+ll128Offset;
|
||||
bool needReload;
|
||||
uint64_t v0, v1;
|
||||
using Vec = uint64_t __attribute__((ext_vector_type(2)));
|
||||
Vec i2;
|
||||
do {
|
||||
if (wid == 0) STORE(sync, 0);
|
||||
needReload = false;
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
needReload |= flagThread && (v1 != flag);
|
||||
asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
|
||||
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
|
||||
//load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
needReload |= flagThread && (i2[1] != flag);
|
||||
}
|
||||
if (needReload) __atomic_fetch_add(sync, 1, __ATOMIC_SEQ_CST);
|
||||
if (LOAD(sync) == 0) break;
|
||||
} while (checkAbort(0, 0) == 0);
|
||||
} while (LOAD(sync) && checkAbort(0, 0) == 0);
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
v[u] = SRC ? MULTI<FUNC, T>()(v0, v[u]) : v0;
|
||||
v[u+1] = SRC ? MULTI<FUNC, T>()(v1, v[u+1]) : v1;
|
||||
asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
|
||||
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
|
||||
//load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
v[u] = SRC ? MULTI<FUNC, T>()(i2[0], v[u]) : i2[0];
|
||||
v[u+1] = SRC ? MULTI<FUNC, T>()(i2[1], v[u+1]) : i2[1];
|
||||
}
|
||||
|
||||
for (int i=1; i<NRECV && i<nrecv; i++) {
|
||||
uint64_t flag = recvFlag(i);
|
||||
uint64_t* ptr = recvPtr(i)+ll128Offset;
|
||||
uint64_t v0, v1;
|
||||
Vec i2;
|
||||
do {
|
||||
if (wid == 0) STORE(sync, 0);
|
||||
needReload = false;
|
||||
needReload = 0;
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
needReload |= flagThread && (v1 != flag);
|
||||
asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
|
||||
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
|
||||
//load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
needReload |= flagThread && (i2[1] != flag);
|
||||
}
|
||||
if (needReload) __atomic_fetch_add(sync, 1, __ATOMIC_SEQ_CST);
|
||||
if (LOAD(sync) == 0) break;
|
||||
} while (checkAbort(i, 0) == 0);
|
||||
} while (LOAD(sync) && checkAbort(i, 0) == 0);
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
v[u] = MULTI<FUNC, T>()(v0, v[u]);
|
||||
v[u+1] = MULTI<FUNC, T>()(v1, v[u+1]);
|
||||
asm volatile ("flat_load_dwordx4 %0, %1, glc, slc\n"
|
||||
"s_waitcnt vmcnt(0)\n" : "=v"(i2) : "v"(ptr+u*WARP_SIZE));
|
||||
//load128(ptr+u*WARP_SIZE, v0, v1);
|
||||
v[u] = MULTI<FUNC, T>()(i2[0], v[u]);
|
||||
v[u+1] = MULTI<FUNC, T>()(i2[1], v[u+1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -223,18 +244,30 @@ class ncclLL128Primitives {
|
||||
/************************ Send **************************/
|
||||
if (SEND) {
|
||||
for (int i=1; i<NSEND && i<nsend; i++) {
|
||||
int flag = sendFlag(i);
|
||||
uint64_t flag = sendFlag(i);
|
||||
uint64_t* ptr = sendPtr(i)+ll128Offset;
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
|
||||
//store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
|
||||
using Vec = uint64_t __attribute__((ext_vector_type(2)));
|
||||
Vec i2;
|
||||
i2[0] = v[u];
|
||||
i2[1] = flagThread ? flag : v[u+1];//
|
||||
asm volatile ("flat_store_dwordx4 %0, %1, glc, slc\n"
|
||||
"s_waitcnt vmcnt(0)\n" : : "v"(ptr+u*WARP_SIZE), "v"(i2));
|
||||
}
|
||||
}
|
||||
int flag = sendFlag(0);
|
||||
uint64_t flag = sendFlag(0);
|
||||
uint64_t* ptr = sendPtr(0)+ll128Offset;
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
|
||||
store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
|
||||
//store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
|
||||
using Vec = uint64_t __attribute__((ext_vector_type(2)));
|
||||
Vec i2;
|
||||
i2[0] = v[u];
|
||||
i2[1] = flagThread ? flag : v[u+1];//
|
||||
asm volatile ("flat_store_dwordx4 %0, %1, glc, slc\n"
|
||||
"s_waitcnt vmcnt(0)\n" : : "v"(ptr+u*WARP_SIZE), "v"(i2));
|
||||
}
|
||||
}
|
||||
/********************** End Send ************************/
|
||||
@@ -279,7 +312,7 @@ class ncclLL128Primitives {
|
||||
const int maxOffset = min(nelem-(elemOffset*((int)(sizeof(uint64_t)/sizeof(T)))), (int)(ELEMINC*(sizeof(uint64_t)/sizeof(T))));
|
||||
if (SRC) {
|
||||
int done = 0;
|
||||
if ((((uint64_t)srcPtr)&0xf) == 0) {
|
||||
if ((((uint64_t)srcPtr)&0x3) == 0) {
|
||||
loadSrcToShmem128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, src64Ptr+elemOffset+2*wid);
|
||||
done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
|
||||
}
|
||||
@@ -290,7 +323,7 @@ class ncclLL128Primitives {
|
||||
__syncwarp();
|
||||
if (DST) {
|
||||
int done = 0;
|
||||
if ((((uint64_t)dstPtr)&0xf) == 0) {
|
||||
if ((((uint64_t)dstPtr)&0x3) == 0) {
|
||||
storeShmemToDst128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, dst64Ptr+elemOffset+2*wid);
|
||||
done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
|
||||
}
|
||||
@@ -330,10 +363,10 @@ class ncclLL128Primitives {
|
||||
sendConnHeadPtr = LOAD(&sendConn->head);
|
||||
sendConnHeadCache = LOAD(sendConnHeadPtr);
|
||||
sendConnHead = LOAD(&sendConn->step);
|
||||
sendConnFifoPtr = LOAD(&sendConn->fifo);
|
||||
sendConnFifoPtr = LOAD(&sendConn->sizesFifo);
|
||||
}
|
||||
if (tid >= nthreads-WARP_SIZE && wid<nsend) {
|
||||
if (sendConn->fifo) {
|
||||
if (sendConn->sizesFifo) {
|
||||
sendConnTailPtr = LOAD(&sendConn->tail);
|
||||
sendConnTail = LOAD(&sendConn->step);
|
||||
}
|
||||
@@ -357,12 +390,7 @@ class ncclLL128Primitives {
|
||||
public:
|
||||
__device__ __forceinline__
|
||||
ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm)
|
||||
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), stepSize(stepSize), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) {
|
||||
// for __any_sync
|
||||
if (NSEND > NRECV)
|
||||
sync = channel->sync + 2 + tid/WARP_SIZE;
|
||||
else
|
||||
sync = channel->sync + tid/WARP_SIZE;
|
||||
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), stepSize(stepSize), shmem(ncclShmem->data+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid), sync(ncclShmem->sync+warp) {
|
||||
// Make sure step is updated before we read it.
|
||||
barrier();
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,4 +8,4 @@
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL_R(ncclReduce, ncclCollReduce);
|
||||
IMPL_COLL_R(Reduce);
|
||||
|
||||
@@ -9,151 +9,145 @@
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->coll.root;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->coll.root;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
|
||||
ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
if (prevRank == root) {
|
||||
prims.send(thisInput+offset, nelem);
|
||||
} else if (rank == root) {
|
||||
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else {
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
if (prevRank == root) {
|
||||
prims.send(thisInput+offset, nelem);
|
||||
} else if (rank == root) {
|
||||
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else {
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { }
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = comm->rank;
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->coll.root;
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceCollNetKernel(struct CollectiveArgs* args) { }
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = comm->rank;
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->coll.root;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
}
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (prevRank == root) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else if (rank == root) {
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else {
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (prevRank == root) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else if (rank == root) {
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else {
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceCollNetLLKernel(struct CollectiveArgs* args) { }
|
||||
};
|
||||
|
||||
#include "prims_ll128.h"
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = comm->rank;
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->coll.root;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = comm->rank;
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->coll.root;
|
||||
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (prevRank == root) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else if (rank == root) {
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else {
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (prevRank == root) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else if (rank == root) {
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else {
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { }
|
||||
template<int PROTO, class REDOP, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduce, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceCollNetLL128Kernel(struct CollectiveArgs* args) { }
|
||||
template<int PROTO, class REDOP, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduce, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,4 +8,4 @@
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL_R(ncclReduceScatter, ncclCollReduceScatter);
|
||||
IMPL_COLL_R(ReduceScatter);
|
||||
|
||||
@@ -9,195 +9,189 @@
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm);
|
||||
ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
|
||||
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
prims.send(thisInput+offset, nelem);
|
||||
prims.send(thisInput+offset, nelem);
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final result
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final result
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
|
||||
}
|
||||
}
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
|
||||
// ReduceScatter kernel entry named for the Tree variant.
// The body is empty, so calling this function performs no work.
// NOTE(review): presumably a placeholder so every collective/algorithm
// combination has a symbol -- confirm against the kernel tables.
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { }
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
// ReduceScatter kernel entry named for the CollNet variant.
// The body is empty, so calling this function performs no work.
// NOTE(review): presumably a placeholder so every collective/algorithm
// combination has a symbol -- confirm against the kernel tables.
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterCollNetKernel(struct CollectiveArgs* args) { }
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
}
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
|
||||
}
|
||||
}
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
// ReduceScatter kernel entry named for the Tree / LL variant.
// The first template parameter is named UNUSED and `args` is never read:
// the body is empty, so calling this function performs no work.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
// ReduceScatter kernel entry named for the CollNet / LL variant.
// The first template parameter is named UNUSED and `args` is never read:
// the body is empty, so calling this function performs no work.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterCollNetLLKernel(struct CollectiveArgs* args) { }
|
||||
};
|
||||
|
||||
#include "prims_ll128.h"
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
// Partial specialization of ncclFunction for ncclFuncReduceScatter with
// NCCL_ALGO_TREE, covering every PROTO / REDOP / T / UNROLL combination.
// run() has an empty body, so invoking it performs no work.
template<int PROTO, class REDOP, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
|
||||
public:
|
||||
// No-op: `args` is not read.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
// ReduceScatter kernel entry named for the Tree / LL128 variant.
// The first template parameter is named UNUSED and `args` is never read:
// the body is empty, so calling this function performs no work.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { }
|
||||
|
||||
// ReduceScatter kernel entry named for the CollNet / LL128 variant.
// The first template parameter is named UNUSED and `args` is never read:
// the body is empty, so calling this function performs no work.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterCollNetLL128Kernel(struct CollectiveArgs* args) { }
|
||||
// Partial specialization of ncclFunction for ncclFuncReduceScatter with
// NCCL_ALGO_COLLNET, covering every PROTO / REDOP / T / UNROLL combination.
// run() has an empty body, so invoking it performs no work.
template<int PROTO, class REDOP, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
|
||||
public:
|
||||
// No-op: `args` is not read.
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
|
||||
};
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "scatter.h"
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL_FUNC(ncclScatter, copy, FuncSum, i8, int8_t);
|
||||
IMPL_COLL_KERN(ncclScatter, copy, FuncSum, i8, int8_t, 0);
|
||||
@@ -1,75 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "devcomm.h"
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
// Device-side Scatter kernel.
//
// On the root rank, slices of args->sendbuff (one slice of `size` elements per
// destination rank, addressed as thisInput + peer*size) are sent to peers, and
// the root's own slice is copied locally into args->recvbuff. On a non-root
// rank, data is received into args->recvbuff only when the computed receive
// peer is the root; otherwise that (block, i) slot does nothing.
//
// Template parameters:
//   UNROLL - forwarded to ncclPrimitives / ReduceOrCopyMulti.
//   FUNC   - forwarded to ncclPrimitives / ReduceOrCopyMulti.
//   T      - element type of the send/receive buffers.
// Parameter:
//   args - collective arguments; this kernel reads coll.nThreads,
//          coll.nChannels, coll.count, coll.bid, coll.root, comm, sendbuff
//          and recvbuff.
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclScatterKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
// The channel is selected by block index.
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
// Element count of one per-rank slice (used below as the stride between slices).
const ssize_t size = args->coll.count;
|
||||
const int nranks = comm->nRanks;
|
||||
const int bid = args->coll.bid;
|
||||
// This rank's id is taken from the first entry of the ring's devUserRanks.
const int rank = ring->devUserRanks[0];
|
||||
// Buffer size divided by sizeof(T)*NCCL_STEPS, i.e. expressed in elements of T per step.
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * SCATTER_CHUNKSTEPS;
|
||||
// Number of peer slots each channel iterates over (ceil(nranks / nChannels)).
const int peersPerChan = DIVUP(nranks, nChannels);
|
||||
// With one peer per channel, nChannels/nranks channels share each peer, so a
// loop iteration advances by that many chunks; otherwise by a single chunk.
const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
|
||||
const int root = args->coll.root;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
for (int i = 0; i < peersPerChan; i++) {
|
||||
// Skip slots that do not map to a rank: surplus blocks beyond a whole
// multiple of nranks (one peer per channel), or slot indices >= nranks
// (several peers per channel).
if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
|
||||
(peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
|
||||
continue;
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
|
||||
// NOTE(review): ALIGN_SIZE is defined elsewhere; presumably it rounds
// realChunkSize up to a multiple of the second argument -- confirm.
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
|
||||
// Clamp to the elements remaining in the slice.
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
// Slot 0 (modulo nranks) on the root handles the root's own slice.
if ((blockIdx.x*peersPerChan+i)%nranks == 0 && rank == root) {
|
||||
const T* sendbuff = thisInput+chunkOffset+rank*size;
|
||||
T* recvbuff = thisOutput+chunkOffset;
|
||||
// Nothing to copy when source and destination addresses coincide.
if (tid < nthreads && sendbuff != recvbuff) {
|
||||
// local copy
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Peers are offset from this rank by the slot index
// (blockIdx.x*peersPerChan + i), forwards for sending and backwards for
// receiving, both modulo nranks.
int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
|
||||
int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
|
||||
// -1 is passed for the direction that is not used by a given primitive.
int peerNone = -1;
|
||||
if (rank == root) {
|
||||
// Root: send-only primitive towards peerSend.
ncclPrimitives<UNROLL, SCATTER_CHUNKSTEPS/SCATTER_SLICESTEPS, SCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &peerNone, &peerSend, NULL, stepSize, channel, comm);
|
||||
|
||||
// Source is the destination peer's slice within the send buffer.
ssize_t send_offset = chunkOffset + peerSend*size;
|
||||
prims.send(thisInput+send_offset, nelem);
|
||||
}
|
||||
else {
|
||||
// Non-root: only the slot whose receive peer is the root does any work.
if (peerRecv == root) {
|
||||
ncclPrimitives<UNROLL, SCATTER_CHUNKSTEPS/SCATTER_SLICESTEPS, SCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &peerRecv, &peerNone, NULL, stepSize, channel, comm);
|
||||
|
||||
// Received data lands at the same chunk offset in the receive buffer.
ssize_t recv_offset = chunkOffset;
|
||||
prims.recv(thisOutput+recv_offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -9,5 +8,4 @@
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL_FUNC(ncclSendRecv, copy, FuncSum, i8, int8_t);
|
||||
IMPL_COLL_KERN(ncclSendRecv, copy, FuncSum, i8, int8_t, 0);
|
||||
IMPL_COLL_P(SendRecv);
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -9,74 +8,85 @@
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__device__ void ncclSendRecvKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->p2p.nThreads;
|
||||
template<class FUNC, typename T, int UNROLL>
|
||||
class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
|
||||
public:
|
||||
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* firstArgs) {
|
||||
struct ncclWorkElem* args = firstArgs;
|
||||
int tid = threadIdx.x;
|
||||
int group = 0;
|
||||
for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) {
|
||||
int nThreadsSegment = args->p2p.nThreads;
|
||||
if (nThreadsSegment == 0) return; // Nothing else to do
|
||||
int groupRecv = group;
|
||||
group += 1;
|
||||
int groupSend = group;
|
||||
group += 1;
|
||||
if (tid < nThreadsSegment) {
|
||||
const int nThreads = nThreadsSegment;
|
||||
|
||||
// Compute pointers
|
||||
const T* sendbuff = (const T*)args->sendbuff;
|
||||
T* recvbuff = (T*)args->recvbuff;
|
||||
// Compute pointers
|
||||
const T* sendbuff = (const T*)args->sendbuff;
|
||||
T* recvbuff = (T*)args->recvbuff;
|
||||
const ssize_t sendCount = args->p2p.sendCount;
|
||||
const ssize_t recvCount = args->p2p.recvCount;
|
||||
|
||||
if (args->p2p.delta < 0 ) return; // No-op
|
||||
const int delta = args->p2p.delta;
|
||||
if (delta == 0) {
|
||||
if (tid < nThreads && sendbuff != recvbuff) {
|
||||
// local copy : ReduceOrCopyMulti takes an int as number of elements,
|
||||
// so we split it in blocks of 1G elements.
|
||||
int blockSize = 1<<30;
|
||||
for (size_t offset=0; offset<sendCount; offset += blockSize) {
|
||||
size_t remaining = sendCount - offset;
|
||||
if (remaining < blockSize) blockSize = remaining;
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nThreads, 1, &sendbuff, 1, &recvbuff, blockSize);
|
||||
sendbuff += blockSize; recvbuff += blockSize;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
|
||||
if (args->p2p.delta == 0) {
|
||||
if (tid < nthreads && sendbuff != recvbuff) {
|
||||
// local copy : ReduceOrCopyMulti takes an int as number of elements,
|
||||
// so we split it in blocks of 1G elements.
|
||||
int blockSize = 1<<30;
|
||||
for (size_t offset=0; offset<args->p2p.sendCount; offset += blockSize) {
|
||||
size_t remaining = args->p2p.sendCount - offset;
|
||||
if (remaining < blockSize) blockSize = remaining;
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, blockSize);
|
||||
sendbuff += blockSize; recvbuff += blockSize;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize/SENDRECV_SLICEFACTOR;
|
||||
|
||||
int nThreadsSplit = nThreads/2;
|
||||
if ((tid < nThreadsSplit) && recvCount >= 0) {
|
||||
int peer = (comm->rank-delta+comm->nRanks)%comm->nRanks;
|
||||
int nt = nThreadsSplit;
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 1, 0, 1, FUNC>
|
||||
prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupRecv);
|
||||
|
||||
if (recvCount == 0) {
|
||||
prims.recv(recvbuff, 0);
|
||||
} else for (ssize_t offset = 0; offset < recvCount; offset += chunkSize) {
|
||||
int realChunkSize = min(chunkSize, recvCount-offset);
|
||||
ALIGN_SIZE(realChunkSize, nt*sizeof(uint64_t)/sizeof(T));
|
||||
int nelem = min(realChunkSize, recvCount-offset);
|
||||
prims.directRecv(recvbuff+offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
if ((tid >= nThreadsSplit) && sendCount >= 0) {
|
||||
int peer = (comm->rank+delta)%comm->nRanks;
|
||||
int nt = nThreads-nThreadsSplit;
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 0, 1, 1, FUNC>
|
||||
prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupSend);
|
||||
|
||||
if (sendCount == 0) {
|
||||
prims.send(sendbuff, 0);
|
||||
} else for (ssize_t offset = 0; offset < sendCount; offset += chunkSize) {
|
||||
int realChunkSize = min(chunkSize, sendCount-offset);
|
||||
ALIGN_SIZE(realChunkSize, nt*sizeof(uint64_t)/sizeof(T));
|
||||
int nelem = min(realChunkSize, sendCount-offset);
|
||||
prims.directSend(sendbuff+offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
tid -= nThreadsSegment;
|
||||
if (tid < 0) return;
|
||||
args++;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS)/SENDRECV_SLICEFACTOR;
|
||||
|
||||
int nthreadsSplit = nthreads/2;
|
||||
// We set NRECV or NSEND to 2 to use different barriers in primitives for the send threads and
|
||||
// receive threads, but then we define all peers to -1 since sender threads don't receive and
|
||||
// receive threads don't send.
|
||||
int peerNone[2] = {-1,-1};
|
||||
|
||||
if (tid < nthreadsSplit ) {
|
||||
const ssize_t sendSize = args->p2p.sendCount;
|
||||
if (sendSize < 0) return;
|
||||
|
||||
int peer = (comm->rank+(int)args->p2p.delta)%comm->nRanks;
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 2, 1, 1, FUNC>
|
||||
prims(tid, nthreadsSplit, peerNone, &peer, recvbuff, stepSize*SENDRECV_SLICEFACTOR, channel, comm);
|
||||
|
||||
if (sendSize == 0) {
|
||||
prims.send(sendbuff, 0);
|
||||
} else for (ssize_t offset = 0; offset < sendSize; offset += stepSize) {
|
||||
int realChunkSize = min(stepSize, sendSize-offset);
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
int nelem = min(realChunkSize, sendSize-offset);
|
||||
prims.directSend(sendbuff+offset, offset, nelem);
|
||||
}
|
||||
} else {
|
||||
const ssize_t recvSize = args->p2p.recvCount;
|
||||
if (recvSize < 0) return;
|
||||
|
||||
int peer = (comm->rank-(int)args->p2p.delta+comm->nRanks)%comm->nRanks;
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 1, 2, 1, FUNC>
|
||||
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peer, peerNone, recvbuff, stepSize*SENDRECV_SLICEFACTOR, channel, comm);
|
||||
|
||||
if (recvSize == 0) {
|
||||
prims.recv(recvbuff, 0);
|
||||
} else for (ssize_t offset = 0; offset < recvSize; offset += stepSize) {
|
||||
int realChunkSize = min(stepSize, recvSize-offset);
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
int nelem = min(realChunkSize, recvSize-offset);
|
||||
prims.directRecv(recvbuff+offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -12,7 +12,6 @@ NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
if (comm->alltoallDisable) {
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
size_t rankOffset = sendcount * ncclTypeSize(datatype);
|
||||
@@ -27,11 +26,4 @@ ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
NCCLCHECK(ncclSend(sendbuff, sendcount, datatype, root, comm, stream));
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
return ncclSuccess;
|
||||
}
|
||||
else {
|
||||
struct ncclInfo info = { ncclCollGather, "Gather",
|
||||
sendbuff, recvbuff, sendcount, datatype, ncclSum, root, comm, stream, /* Args */
|
||||
GATHER_CHUNKSTEPS, GATHER_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollReduce, "Reduce",
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
struct ncclInfo info = { ncclFuncReduce, "Reduce",
|
||||
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
|
||||
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -12,7 +12,8 @@ NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
|
||||
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollReduceScatter, "ReduceScatter",
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
|
||||
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
|
||||
REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
|
||||
@@ -12,7 +12,6 @@ NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
if (comm->alltoallDisable) {
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
size_t rankOffset = recvcount * ncclTypeSize(datatype);
|
||||
@@ -27,11 +26,4 @@ ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
NCCLCHECK(ncclRecv(recvbuff, recvcount, datatype, root, comm, stream));
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
return ncclSuccess;
|
||||
}
|
||||
else {
|
||||
struct ncclInfo info = { ncclCollScatter, "Scatter",
|
||||
sendbuff, recvbuff, recvcount, datatype, ncclSum, root, comm, stream, /* Args */
|
||||
SCATTER_CHUNKSTEPS, SCATTER_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,7 +13,8 @@ NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataTyp
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollSendRecv, "Send",
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
struct ncclInfo info = { ncclFuncSendRecv, "Send",
|
||||
sendbuff, NULL, count, datatype, ncclSum, peer, comm, stream, /* Args */
|
||||
1, 1 };
|
||||
ncclResult_t ret;
|
||||
@@ -27,7 +28,8 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollSendRecv, "Recv",
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
struct ncclInfo info = { ncclFuncSendRecv, "Recv",
|
||||
NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
|
||||
1, 1 };
|
||||
ncclResult_t ret;
|
||||
|
||||
+3
-3
@@ -128,7 +128,7 @@ void ncclDebugInit() {
|
||||
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
|
||||
if (ncclDebugLevel == -1) ncclDebugInit();
|
||||
if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
|
||||
if (ncclDebugLevel < level) return;
|
||||
if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return;
|
||||
|
||||
// Gather the rank information. This can take > 1us so we want to make sure
|
||||
// we only do it when needed.
|
||||
@@ -145,11 +145,11 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
||||
if (level == NCCL_LOG_WARN)
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line);
|
||||
else if (level == NCCL_LOG_INFO && (flags & ncclDebugMask))
|
||||
else if (level == NCCL_LOG_INFO)
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
"%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
|
||||
#ifdef ENABLE_TRACE
|
||||
else if (level == NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
|
||||
else if (level == NCCL_LOG_TRACE) {
|
||||
auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
|
||||
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
|
||||
+298
-212
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,61 +8,61 @@
|
||||
#include "enqueue.h"
|
||||
#include "argcheck.h"
|
||||
#include "coll_net.h"
|
||||
#include "../graph/topo.h"
|
||||
#include "graph/topo.h"
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_ext.h>
|
||||
|
||||
// Only generate inline kernels for LL
|
||||
#define NCCL_FUNC5(coll, op, dtype) \
|
||||
NCCL_KERN_NAME(coll##LL, op, dtype), \
|
||||
NCCL_KERN_NAME(coll##LL, op, dtype), \
|
||||
NCCL_KERN_NAME(coll##LL, op, dtype)
|
||||
#define NCCL_FUNC5(func, algo, redop, dtype) \
|
||||
NCCL_KERN_NAME(func, algo, LL, redop, dtype), \
|
||||
NCCL_KERN_NAME(func, algo, LL, redop, dtype), \
|
||||
NCCL_KERN_NAME(func, algo, LL, redop, dtype)
|
||||
|
||||
#define NCCL_FUNC4(coll, op, dtype) \
|
||||
NCCL_FUNC5(coll##Tree, op, dtype), \
|
||||
NCCL_FUNC5(coll##Ring, op, dtype), \
|
||||
NCCL_FUNC5(coll##CollNet, op, dtype)
|
||||
#define NCCL_FUNC4(func, redop, type) \
|
||||
NCCL_FUNC5(func, TREE, redop, type), \
|
||||
NCCL_FUNC5(func, RING, redop, type), \
|
||||
NCCL_FUNC5(func, COLLNET, redop, type)
|
||||
|
||||
// Must be consistent with ncclDataType_t
|
||||
#define NCCL_FUNCS3A(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, u8), \
|
||||
NCCL_FUNC4(coll, op, i32), \
|
||||
NCCL_FUNC4(coll, op, u32), \
|
||||
NCCL_FUNC4(coll, op, i64), \
|
||||
NCCL_FUNC4(coll, op, u64), \
|
||||
NCCL_FUNC4(coll, op, f16), \
|
||||
NCCL_FUNC4(coll, op, f32), \
|
||||
NCCL_FUNC4(coll, op, f64), \
|
||||
NCCL_FUNC4(coll, op, b16)
|
||||
#define NCCL_FUNCS3B(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8)
|
||||
#define NCCL_FUNCS3A(func, redop) \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, uint8_t), \
|
||||
NCCL_FUNC4(func, redop, int32_t), \
|
||||
NCCL_FUNC4(func, redop, uint32_t), \
|
||||
NCCL_FUNC4(func, redop, int64_t), \
|
||||
NCCL_FUNC4(func, redop, uint64_t), \
|
||||
NCCL_FUNC4(func, redop, half), \
|
||||
NCCL_FUNC4(func, redop, float), \
|
||||
NCCL_FUNC4(func, redop, double), \
|
||||
NCCL_FUNC4(func, redop, rccl_bfloat16)
|
||||
#define NCCL_FUNCS3B(func, redop) \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t), \
|
||||
NCCL_FUNC4(func, redop, int8_t)
|
||||
|
||||
// Must be consistent with ncclRedOp_t -- but we only generate kernel for sums.
|
||||
#define NCCL_FUNCS2A(coll) \
|
||||
NCCL_FUNCS3A(coll, sum), \
|
||||
NCCL_FUNCS3A(coll, sum), \
|
||||
NCCL_FUNCS3A(coll, sum), \
|
||||
NCCL_FUNCS3A(coll, sum)
|
||||
#define NCCL_FUNCS2B(coll) \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy)
|
||||
#define NCCL_FUNCS2A(func) \
|
||||
NCCL_FUNCS3A(func, Sum), \
|
||||
NCCL_FUNCS3A(func, Sum), \
|
||||
NCCL_FUNCS3A(func, Sum), \
|
||||
NCCL_FUNCS3A(func, Sum)
|
||||
#define NCCL_FUNCS2B(func) \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum), \
|
||||
NCCL_FUNCS3B(func, Sum)
|
||||
|
||||
typedef void(*ncclKern_t)(struct ncclDevComm*);
|
||||
typedef void(*ncclKern_t)(struct ncclWorkElem first);
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
static ncclKern_t const ncclKerns[1] = {
|
||||
NCCL_KERN_NAME(ncclSendRecv, copy, i8)
|
||||
NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
|
||||
};
|
||||
|
||||
/*****************************************************************************/
|
||||
@@ -72,12 +72,8 @@ static ncclKern_t const ncclKerns[1] = {
|
||||
ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
|
||||
if (cgMode & 0x01) {
|
||||
CUDACHECK(hipExtLaunchMultiKernelMultiDevice(paramsList, numDevices,
|
||||
// These flags are to reduce the latency of using this API
|
||||
#if __HIP__
|
||||
hipCooperativeLaunchMultiDeviceNoPreSync|hipCooperativeLaunchMultiDeviceNoPostSync));
|
||||
#else
|
||||
0));
|
||||
#endif
|
||||
// These flags are to reduce the latency of using this API
|
||||
hipCooperativeLaunchMultiDeviceNoPreSync|hipCooperativeLaunchMultiDeviceNoPostSync));
|
||||
return ncclSuccess;
|
||||
}
|
||||
int savedDev;
|
||||
@@ -85,37 +81,51 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList,
|
||||
for (int i = 0; i < numDevices; i++) {
|
||||
hipLaunchParams* params = paramsList+i;
|
||||
CUDACHECK(hipSetDevice(cudaDevs[i]));
|
||||
hipLaunchKernelGGL(((void (*)(struct ncclDevComm*))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclDevComm ***)(params->args)));
|
||||
hipLaunchKernelGGL(((void (*)(struct ncclWorkElem))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclWorkElem**)params->args));
|
||||
}
|
||||
CUDACHECK(hipSetDevice(savedDev));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
|
||||
// Claim the next slot of a channel's work FIFO and mark it as in use.
//
// channel: channel whose workFifo / workFifoTail / workCount are updated.
// work:    optional output; when non-NULL it receives a pointer to the claimed ncclWork slot.
// base:    optional template; when non-NULL it is copied into the slot's first work element.
//
// Returns ncclInvalidUsage (after a WARN) when the channel already holds NCCL_MAX_OPS
// operations, ncclSuccess otherwise.
static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** work, struct ncclWorkElem* base) {
|
||||
// Refuse to queue more than NCCL_MAX_OPS operations on one channel.
if (channel->workCount == NCCL_MAX_OPS) {
|
||||
WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
// The FIFO is used as a ring: the slot is the tail counter modulo NCCL_MAX_OPS.
int opIndex = channel->workFifoTail%NCCL_MAX_OPS;
|
||||
struct ncclWork* w = channel->workFifo+opIndex;
|
||||
struct ncclWorkElem* e = w->elems;
|
||||
volatile uint8_t* activePtr = (volatile uint8_t*)&e->active;
|
||||
// Yield the CPU until the slot's first element reads as inactive (active == 0).
// NOTE(review): presumably the flag is cleared by the device once it has consumed the slot -- confirm.
while (LOAD(activePtr) != 0) sched_yield();
|
||||
// Clear the whole slot before (optionally) filling in the first element.
memset(w, 0, sizeof(struct ncclWork));
|
||||
// Initialize with work elem if provided
|
||||
if (base) memcpy(e, base, sizeof(struct ncclWorkElem));
|
||||
// Mark the element active only after its contents are in place, then record its slot index.
STORE(&e->active, 1);
|
||||
e->index = opIndex;
|
||||
// workFifoTail is not wrapped here; the modulo above is applied when it is read.
channel->workFifoTail++;
|
||||
channel->workCount++;
|
||||
if (work) *work = w;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
|
||||
// Only launch blocks where we have work to do.
|
||||
for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) {
|
||||
if (comm->channels[c].collCount) params->gridDim.x = c+1;
|
||||
if (comm->channels[c].workCount) params->gridDim.x = c+1;
|
||||
}
|
||||
|
||||
// Set active = 2 for the last operation and add a no-op on empty channels (p2p case).
|
||||
for (int c=0; c<params->gridDim.x; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
if (channel->collCount == 0) {
|
||||
int opIndex = channel->collFifoTail;
|
||||
struct ncclColl* c = channel->collectives+opIndex;
|
||||
volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
|
||||
while (activePtr[0] != 0) sched_yield();
|
||||
|
||||
c->args.p2p.delta = -1; // no-op
|
||||
c->funcIndex = FUNC_INDEX_P2P;
|
||||
c->args.comm = comm->devComm;
|
||||
c->active = 1;
|
||||
opIndex = (opIndex+1)%NCCL_MAX_OPS;
|
||||
c->nextIndex = opIndex;
|
||||
channel->collFifoTail = opIndex;
|
||||
channel->collCount++;
|
||||
if (channel->workCount == 0) {
|
||||
struct ncclWork* w;
|
||||
NCCLCHECK(getNextOp(channel, &w, NULL));
|
||||
struct ncclWorkElem* e = w->elems;
|
||||
e->comm = comm->devComm;
|
||||
e->funcIndex = FUNC_INDEX_P2P;
|
||||
e->p2p.nThreads = 0;
|
||||
}
|
||||
STORE(&channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active, 2);
|
||||
STORE(&channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].elems[0].active, 2);
|
||||
}
|
||||
|
||||
{ // [RCCL] Wait for any clique-based collectives
|
||||
@@ -124,9 +134,13 @@ ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
|
||||
|
||||
// Find the first operation, choose the kernel accordingly and pass it
|
||||
// as the first argument.
|
||||
struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart;
|
||||
struct ncclChannel* c0 = comm->channels;
|
||||
struct ncclWork* work = c0->workFifo+((c0->workFifoTail-c0->workCount)%NCCL_MAX_OPS);
|
||||
struct ncclWorkElem* elem = work->elems;
|
||||
memcpy(&comm->args, elem, sizeof(struct ncclWorkElem));
|
||||
// As we inline the first coll directly, we can free it immediately.
|
||||
if (elem->funcIndex != FUNC_INDEX_P2P) elem->active = 0;
|
||||
|
||||
comm->args = comm->devComm;
|
||||
params->func = (void *)ncclKerns[0];
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -137,7 +151,7 @@ ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
|
||||
bool done = false;
|
||||
while (done == false) {
|
||||
if (val >= comm->intraRanks) {
|
||||
WARN("Trying to launch too many collectives");
|
||||
WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
if (val+1 == comm->intraRanks) {
|
||||
@@ -157,7 +171,7 @@ ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
|
||||
volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
|
||||
int val = LOAD(ptr);
|
||||
if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
|
||||
WARN("Trying to launch too many collectives");
|
||||
WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS);
|
||||
return ncclInternalError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -219,7 +233,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
|
||||
hipEvent_t startEvent;
|
||||
hipEvent_t stopEvent;
|
||||
if (comm->launchMode == ncclComm::PARALLEL) {
|
||||
hipLaunchKernelGGL(((void (*)(struct ncclDevComm*))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclDevComm ***)(params->args)));
|
||||
hipLaunchKernelGGL(((void (*)(struct ncclWorkElem))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclWorkElem**)params->args));
|
||||
} else {
|
||||
NCCLCHECK(ncclCpuBarrierOut(comm));
|
||||
}
|
||||
@@ -229,13 +243,18 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
|
||||
// launch and the ncclProxyStart call could cause a deadlock.
|
||||
// Also, starting the proxies after the CUDA launch seems to be better for
|
||||
// performance (latency).
|
||||
uint64_t max = 0ULL;
|
||||
for (int r=0; r<params->gridDim.x; r++) {
|
||||
struct ncclChannel* channel = comm->channels+r;
|
||||
channel->collStart = channel->collFifoTail;
|
||||
channel->collCount = 0;
|
||||
max = std::max(max, channel->workFifoTail);
|
||||
channel->workCount = 0;
|
||||
}
|
||||
for (int r=0; r<std::max(comm->nChannels, comm->p2pnChannels); r++) {
|
||||
struct ncclChannel* channel = comm->channels+r;
|
||||
channel->workFifoTail = max;
|
||||
}
|
||||
params->gridDim.x = params->blockDim.x = 0;
|
||||
comm->lastOpCount = comm->opCount;
|
||||
comm->lastOpCount = max;
|
||||
NCCLCHECK(ncclProxyStart(comm));
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -261,6 +280,13 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
|
||||
struct ncclComm* comm = info->comm;
|
||||
float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete.
|
||||
// Find algorithm / protocol.
|
||||
if (info->coll == ncclFuncAllToAll || info->coll == ncclFuncAllToAllv) {
|
||||
info->algorithm = NCCL_ALGO_RING;
|
||||
info->protocol = NCCL_PROTO_SIMPLE;
|
||||
info->nChannels = comm->nChannels;
|
||||
info->nThreads = NCCL_MAX_NTHREADS;
|
||||
return ncclSuccess;
|
||||
}
|
||||
info->algorithm = -1;
|
||||
info->protocol = -1;
|
||||
int nAlgos = NCCL_NUM_ALGORITHMS;
|
||||
@@ -281,10 +307,6 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
|
||||
}
|
||||
}
|
||||
}
|
||||
if (info->coll == ncclCollAllToAll || info->coll == ncclCollGather || info->coll == ncclCollScatter || info->coll == ncclCollAllToAllv) {
|
||||
info->algorithm = NCCL_ALGO_RING;
|
||||
info->protocol = NCCL_PROTO_SIMPLE;
|
||||
}
|
||||
if (info->algorithm == -1 || info->protocol == -1) {
|
||||
WARN("Error : no algorithm/protocol available");
|
||||
return ncclInternalError;
|
||||
@@ -292,16 +314,12 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
|
||||
//if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
|
||||
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
|
||||
|
||||
int nc = (info->algorithm == NCCL_ALGO_COLLNET) ? comm->nChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down
|
||||
if (info->comm->topo->type == RCCL_TOPO_4P2H_ROME && (info->coll == ncclCollAllToAll ||
|
||||
info->coll == ncclCollGather || info->coll == ncclCollScatter || info->coll == ncclCollAllToAllv))
|
||||
nc = 2;
|
||||
int nc = (info->nChannels > 0) ? info->nChannels :
|
||||
(info->algorithm == NCCL_ALGO_COLLNET) ? comm->nChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down
|
||||
int nt = comm->maxThreads[info->algorithm][info->protocol];
|
||||
int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
|
||||
while (info->nBytes < nc*nt*threadThreshold) {
|
||||
// do not reduce channels in case of alltoall
|
||||
if (info->algorithm != NCCL_ALGO_COLLNET && info->coll != ncclCollAllToAll &&
|
||||
info->coll != ncclCollGather && info->coll != ncclCollScatter && info->coll != ncclCollAllToAllv && nc >= 2) nc--;
|
||||
if (info->algorithm != NCCL_ALGO_COLLNET && nc >= 2) nc--;
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
// do not reduce threads count on VEGA
|
||||
#else
|
||||
@@ -312,6 +330,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync
|
||||
if (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_TREE) nt += WARP_SIZE;
|
||||
#endif
|
||||
info->nChannels = nc;
|
||||
info->nThreads = nt;
|
||||
@@ -320,19 +339,17 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
|
||||
|
||||
static ncclResult_t getPatternInfo(struct ncclInfo* info) {
|
||||
switch (info->coll) {
|
||||
case ncclCollBroadcast:
|
||||
case ncclFuncBroadcast:
|
||||
info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeDown : ncclPatternPipelineFrom; break;
|
||||
case ncclCollReduce:
|
||||
case ncclFuncReduce:
|
||||
info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break;
|
||||
case ncclCollReduceScatter:
|
||||
case ncclCollAllGather:
|
||||
case ncclFuncReduceScatter:
|
||||
case ncclFuncAllGather:
|
||||
info->pattern = ncclPatternRing; break;
|
||||
case ncclCollAllReduce:
|
||||
case ncclFuncAllReduce:
|
||||
info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUp : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
|
||||
case ncclCollGather:
|
||||
case ncclCollScatter:
|
||||
case ncclCollAllToAll:
|
||||
case ncclCollAllToAllv:
|
||||
case ncclFuncAllToAll:
|
||||
case ncclFuncAllToAllv:
|
||||
info->pattern = ncclPatternAll; break;
|
||||
default:
|
||||
WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
|
||||
@@ -350,55 +367,47 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
|
||||
case ncclPatternPipelineTo:
|
||||
case ncclPatternCollTreeUp:
|
||||
case ncclPatternCollTreeDown:
|
||||
case ncclPatternAll:
|
||||
info->nstepsPerLoop = info->nchunksPerLoop = 1; break;
|
||||
info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
|
||||
case ncclPatternRing:
|
||||
info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
|
||||
case ncclPatternRingTwice:
|
||||
info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break;
|
||||
case ncclPatternAll:
|
||||
info->nstepsPerLoop = 1;
|
||||
info->nchunksPerLoop = info->comm->nRanks; break;
|
||||
default:
|
||||
WARN("Unknown pattern %d\n", info->pattern);
|
||||
WARN("Unknown pattern %d", info->pattern);
|
||||
return ncclInternalError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
|
||||
coll->args.sendbuff = info->sendbuff;
|
||||
coll->args.recvbuff = info->recvbuff;
|
||||
coll->args.comm = info->comm->devComm;
|
||||
coll->args.opCount = info->comm->opCount;
|
||||
|
||||
if (info->coll == ncclCollSendRecv) {
|
||||
coll->args.p2p.sendCount = info->sendbytes;
|
||||
coll->args.p2p.recvCount = info->recvbytes;
|
||||
coll->args.p2p.delta = info->delta;
|
||||
coll->funcIndex = FUNC_INDEX_P2P;
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
coll->args.p2p.nThreads = info->nThreads = info->comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
|
||||
#else
|
||||
coll->args.p2p.nThreads = info->nThreads = info->comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]+2*WARP_SIZE;
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyArgs* proxyArgs /* output */) {
|
||||
work->comm = info->comm->devComm;
|
||||
|
||||
// Set nstepsPerLoop and nchunksPerLoop
|
||||
NCCLCHECK(getAlgoInfo(info));
|
||||
NCCLCHECK(getPatternInfo(info));
|
||||
NCCLCHECK(getLoopInfo(info));
|
||||
|
||||
if (info->coll == ncclCollAllToAllv) {
|
||||
coll->args.a2av.count = info->count;
|
||||
coll->args.a2av.nChannels = info->nChannels;
|
||||
coll->args.a2av.nThreads = info->nThreads;
|
||||
} else {
|
||||
coll->args.coll.root = info->root;
|
||||
coll->args.coll.count = info->count;
|
||||
coll->args.coll.nChannels = info->nChannels;
|
||||
coll->args.coll.nThreads = info->nThreads;
|
||||
}
|
||||
if ((info->coll == ncclFuncAllToAll || info->coll == ncclFuncAllToAllv)
|
||||
&& info->comm->topo->nodes[NET].count == 0 && info->comm->topo->type == RCCL_TOPO_4P2H_ROME)
|
||||
info->nChannels =info->comm->p2pnChannels;
|
||||
|
||||
coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
|
||||
work->opCount = info->comm->opCount;
|
||||
work->sendbuff = info->sendbuff;
|
||||
work->recvbuff = info->recvbuff;
|
||||
if (info->coll == ncclFuncAllToAllv) {
|
||||
work->a2av.count = info->count;
|
||||
work->a2av.nChannels = info->nChannels;
|
||||
} else {
|
||||
work->coll.root = info->root;
|
||||
work->coll.count = info->count;
|
||||
work->coll.nChannels = info->nChannels;
|
||||
}
|
||||
work->nThreads = info->nThreads;
|
||||
|
||||
work->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
|
||||
|
||||
{ // [RCCL] Check for clique-based kernel support
|
||||
if (info->comm->cliqueManager->IsSupported(info->coll,
|
||||
@@ -416,13 +425,13 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
|
||||
info->protocol = NCCL_PROTO_CLIQUE;
|
||||
// Determine the number of channels to use for clique-kernel
|
||||
NCCLCHECK(info->comm->cliqueManager->GetNumChannelsToUse(info->coll,
|
||||
info->count,
|
||||
info->datatype,
|
||||
info->op,
|
||||
info->comm->nChannels,
|
||||
&coll->args.clique.nChannels));
|
||||
coll->args.clique.count = info->count;
|
||||
coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
|
||||
info->count,
|
||||
info->datatype,
|
||||
info->op,
|
||||
info->comm->nChannels,
|
||||
&work->clique.nChannels));
|
||||
work->clique.count = info->count;
|
||||
work->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
|
||||
return ncclSuccess;
|
||||
}
|
||||
} // [RCCL]
|
||||
@@ -436,25 +445,25 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
|
||||
if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_SIMPLE) {
|
||||
if (info->pattern == ncclPatternTreeUpDown) {
|
||||
// Optimize chunkSize / nSteps
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*8 && chunkSize > 131072) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth && chunkSize > 32768) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
|
||||
}
|
||||
// Use lastChunkSize as chunkSize
|
||||
coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
||||
work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
||||
} else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
|
||||
// Optimize chunkSize / nSteps
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*16 && chunkSize > 131072) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth && chunkSize > 32768) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*16 && chunkSize > 131072) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*4 && chunkSize > 65536) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth && chunkSize > 32768) chunkSize /= 2;
|
||||
// Use lastChunkSize as chunkSize
|
||||
coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
||||
work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
||||
} else if (info->protocol == NCCL_PROTO_LL) {
|
||||
const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
|
||||
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
|
||||
coll->args.coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
|
||||
ALIGN_SIZE(coll->args.coll.lastChunkSize, info->nThreads*sizeof(uint64_t));
|
||||
coll->args.coll.lastChunkSize /= ncclTypeSize(info->datatype);
|
||||
work->coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
|
||||
ALIGN_SIZE(work->coll.lastChunkSize, info->nThreads*sizeof(uint64_t));
|
||||
work->coll.lastChunkSize /= ncclTypeSize(info->datatype);
|
||||
} else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) {
|
||||
int nNodes = info->comm->nNodes;
|
||||
float ppn = info->comm->nRanks / (float)nNodes;
|
||||
@@ -462,7 +471,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2;
|
||||
// Use lastChunkSize as chunkSize
|
||||
coll->args.coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
|
||||
work->coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
|
||||
}
|
||||
|
||||
// Compute nSteps for proxies
|
||||
@@ -470,20 +479,20 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
|
||||
if (info->protocol == NCCL_PROTO_LL) chunkEffectiveSize /= 2;
|
||||
if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS;
|
||||
//if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol);
|
||||
int nLoops;
|
||||
if (info->pattern != ncclPatternAll)
|
||||
nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
|
||||
else
|
||||
nLoops = (int)(DIVUP(info->nBytes, (((size_t)((info->nChannels >= info->comm->nRanks ? (info->nChannels/info->comm->nRanks) : 1))))*info->comm->nRanks*info->nchunksPerLoop*chunkEffectiveSize));
|
||||
int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
|
||||
proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
|
||||
proxyArgs->sliceSteps = sliceSteps;
|
||||
proxyArgs->chunkSteps = chunkSteps;
|
||||
proxyArgs->protocol = info->protocol;
|
||||
proxyArgs->opCount = info->comm->opCount;
|
||||
proxyArgs->dtype = info->datatype;
|
||||
proxyArgs->redOp = info->op;
|
||||
if (info->coll != ncclCollAllToAllv) TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d ces %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
|
||||
coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, chunkEffectiveSize, info->nBytes, info->protocol, info->nChannels, info->nThreads,
|
||||
// This is used by P2P to reduce the receive buffer size. We don't use it in collectives
|
||||
// because some protocols need to transmit more than the total size, plus they sometimes
|
||||
// round up
|
||||
proxyArgs->recvbytes = stepSize*proxyArgs->sliceSteps;
|
||||
|
||||
TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
|
||||
proxyArgs->opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
|
||||
nLoops, proxyArgs->nsteps, info->comm);
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -500,33 +509,26 @@ static ncclResult_t checkSetStream(struct ncclInfo* info) {
|
||||
}
|
||||
|
||||
ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
|
||||
if (info->comm->nRanks == 1 && info->coll != ncclCollSendRecv) {
|
||||
if (info->comm->nRanks == 1) {
|
||||
if (info->sendbuff != info->recvbuff)
|
||||
CUDACHECK(hipMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, hipMemcpyDeviceToDevice, info->stream));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct ncclColl coll;
|
||||
struct ncclWorkElem work;
|
||||
struct ncclProxyArgs proxyArgs;
|
||||
memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
|
||||
NCCLCHECK(computeColl(info, &coll, &proxyArgs));
|
||||
NCCLCHECK(computeColl(info, &work, &proxyArgs));
|
||||
|
||||
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
|
||||
|
||||
int nChannels = info->coll == ncclCollSendRecv ? 1 : coll.args.coll.nChannels;
|
||||
|
||||
int nChannels = (info->coll == ncclFuncAllToAllv) ? work.a2av.nChannels : work.coll.nChannels;
|
||||
int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1;
|
||||
|
||||
for (int bid=0; bid<nChannels*nSubChannels; bid++) {
|
||||
int channelId = (info->coll == ncclCollSendRecv) ? info->channelId :
|
||||
info->comm->myParams->gridDim.x % info->comm->nChannels;
|
||||
int channelId = info->comm->myParams->gridDim.x % info->comm->nChannels;
|
||||
struct ncclChannel* channel = info->comm->channels+channelId;
|
||||
|
||||
if (channel->collCount == NCCL_MAX_OPS) {
|
||||
WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
|
||||
// Proxy
|
||||
proxyArgs.channel = channel;
|
||||
// Adjust pattern for CollNet based on channel index
|
||||
@@ -534,84 +536,165 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
|
||||
info->pattern = (channelId < info->comm->nChannels/nSubChannels) ? ncclPatternCollTreeUp : ncclPatternCollTreeDown;
|
||||
}
|
||||
|
||||
if (info->coll == ncclCollSendRecv) {
|
||||
info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
|
||||
NCCLCHECK(ncclProxySaveP2p(info, channel));
|
||||
} else if (info->coll == ncclCollAllToAll || info->coll == ncclCollScatter || info->coll == ncclCollGather || info->coll == ncclCollAllToAllv) {
|
||||
NCCLCHECK(ncclProxySaveA2a(&proxyArgs, info));
|
||||
} else {
|
||||
NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
|
||||
}
|
||||
info->comm->myParams->gridDim.x++;
|
||||
int opIndex = channel->collFifoTail;
|
||||
struct ncclColl* c = channel->collectives+opIndex;
|
||||
volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
|
||||
while (LOAD(activePtr) != 0) sched_yield();
|
||||
if (proxyArgs.nsteps) NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
|
||||
|
||||
memcpy(c, &coll, sizeof(struct ncclColl));
|
||||
if (info->coll == ncclCollAllToAllv) {
|
||||
c->args.a2av.extra = channel->collectivesExtra + info->comm->nRanks*4*opIndex;
|
||||
memcpy(c->args.a2av.extra, info->sendcounts, sizeof(size_t*)*(info->comm->nRanks));
|
||||
memcpy(c->args.a2av.extra+info->comm->nRanks, info->sdispls, sizeof(size_t*)*(info->comm->nRanks));
|
||||
memcpy(c->args.a2av.extra+info->comm->nRanks*2, info->recvcounts, sizeof(size_t*)*(info->comm->nRanks));
|
||||
memcpy(c->args.a2av.extra+info->comm->nRanks*3, info->rdispls, sizeof(size_t*)*(info->comm->nRanks));
|
||||
c->args.a2av.bid = bid % coll.args.coll.nChannels;
|
||||
} else if (info->coll != ncclCollSendRecv) {
|
||||
c->args.coll.bid = bid % coll.args.coll.nChannels;
|
||||
info->comm->myParams->gridDim.x++;
|
||||
if (info->coll == ncclFuncAllToAllv) {
|
||||
work.a2av.bid = bid % work.a2av.nChannels;
|
||||
} else {
|
||||
work.coll.bid = bid % nChannels;
|
||||
}
|
||||
|
||||
// [RCCL] Setup pointers to where all the input/output pointers will be
|
||||
if (info->protocol == NCCL_PROTO_CLIQUE) {
|
||||
NCCLCHECK(info->comm->cliqueManager->SetCliqueCollectiveArgs(&c->args));
|
||||
NCCLCHECK(info->comm->cliqueManager->SetCliqueArgs(&work));
|
||||
}
|
||||
// [/RCCL]
|
||||
|
||||
STORE(&c->active, 1);
|
||||
opIndex = (opIndex+1)%NCCL_MAX_OPS;
|
||||
c->nextIndex = opIndex;
|
||||
channel->collFifoTail = opIndex;
|
||||
channel->collCount++;
|
||||
struct ncclWork* w;
|
||||
NCCLCHECK(getNextOp(channel, &w, &work));
|
||||
if (info->coll == ncclFuncAllToAllv) {
|
||||
struct ncclWorkElem* e = w->elems;
|
||||
size_t* params = channel->a2avParams + info->comm->nRanks*4*e->index;
|
||||
memcpy(params, info->sendcounts, sizeof(size_t*)*(info->comm->nRanks));
|
||||
memcpy(params+info->comm->nRanks, info->sdispls, sizeof(size_t*)*(info->comm->nRanks));
|
||||
memcpy(params+info->comm->nRanks*2, info->recvcounts, sizeof(size_t*)*(info->comm->nRanks));
|
||||
memcpy(params+info->comm->nRanks*3, info->rdispls, sizeof(size_t*)*(info->comm->nRanks));
|
||||
}
|
||||
}
|
||||
info->comm->opCount++;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Save p2p operations in comm->p2plist. Operations will be posted to channels
|
||||
// Floor for the per-channel size when aggregating async operations: ncclSaveCommKernels
// stops halving its channel size once it is no longer greater than this value.
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
|
||||
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
|
||||
|
||||
// Turn the collectives queued on `comm` (comm->asyncOps[0 .. asyncOpCount-1]) into
// kernel work by calling ncclSaveKernel() on each of them, then clear the queue.
//
//  * nothing queued  : returns ncclSuccess immediately, counters untouched.
//  * one operation   : saved with nChannels = 0 (the "no aggregation" case).
//  * several         : each operation gets a channel count derived from its own
//                      byte size and a shared per-channel byte budget.
//
// Errors from ncclSaveKernel() go through NCCLCHECK; presumably that macro returns
// early, in which case the counters below are not reset -- confirm against the macro.
ncclResult_t ncclSaveCommKernels(ncclComm_t comm) {
  if (comm->asyncOpCount == 0) return ncclSuccess;

  if (comm->asyncOpCount == 1) {
    // No aggregation
    struct ncclInfo* only = &comm->asyncOps[0];
    only->nChannels = 0;
    NCCLCHECK(ncclSaveKernel(only));
  } else {
    // Aggregation
    // Start from the ideal per-channel size, scaled by the number of ranks
    // (latency increases with nranks).
    size_t bytesPerChannel = NCCL_AGG_CHANNEL_SIZE * comm->nRanks;

    // Halve the budget while the queued total cannot fill every channel; halving
    // stops once the budget is no longer above NCCL_MIN_CHANNEL_SIZE.
    while (comm->asyncTotalSize < bytesPerChannel * comm->nChannels &&
           bytesPerChannel > NCCL_MIN_CHANNEL_SIZE) {
      bytesPerChannel /= 2;
    }

    for (int idx = 0; idx < comm->asyncOpCount; idx++) {
      struct ncclInfo* queued = &comm->asyncOps[idx];
      // Channels needed for this operation, capped at what the communicator has.
      const int wanted = (int)DIVUP(queued->nBytes, bytesPerChannel);
      queued->nChannels = std::min(wanted, comm->nChannels);
      NCCLCHECK(ncclSaveKernel(queued));
    }
  }

  // Reset counters
  comm->asyncOpCount = 0;
  comm->asyncTotalSize = 0;
  return ncclSuccess;
}
|
||||
|
||||
// Queue a copy of `info` on its communicator's asyncOps array.
//
// Returns ncclInvalidUsage (after a WARN) when NCCL_MAX_OPS operations are already
// queued; otherwise copies the descriptor into the next free slot, bumps
// asyncOpCount and adds info->nBytes to asyncTotalSize, and returns ncclSuccess.
static ncclResult_t ncclSaveAsyncColl(struct ncclInfo* info) {
  ncclComm_t owner = info->comm;

  const bool queueFull = owner->asyncOpCount >= NCCL_MAX_OPS;
  if (queueFull) {
    WARN("Too many async operations in progress, max is %d", NCCL_MAX_OPS);
    return ncclInvalidUsage;
  }

  // The descriptor is copied by value, so the caller's `info` is not referenced
  // by the queue after this call.
  struct ncclInfo* slot = &owner->asyncOps[owner->asyncOpCount];
  memcpy(slot, info, sizeof(*info));

  owner->asyncOpCount += 1;
  owner->asyncTotalSize += info->nBytes;
  return ncclSuccess;
}
|
||||
|
||||
// Save p2p operations in comm->p2pSends and p2pRecvs. Operations will be posted to channels
|
||||
// during ncclGroupEnd()
|
||||
ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
|
||||
static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
|
||||
struct ncclComm* comm = info->comm;
|
||||
struct ncclP2Plist* p2plist = &comm->p2plist;
|
||||
int peer = info->root;
|
||||
p2plist->count++;
|
||||
ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
|
||||
if (info->recvbuff == NULL) {
|
||||
if (info->opName[0] == 'S') { // Send
|
||||
if (peer != comm->rank) {
|
||||
int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
if (comm->channels[channelId].peers[peer].send.connected == 0) {
|
||||
p2plist->connect.send[channelId*comm->nRanks+p2plist->connect.nsend[channelId]++] = peer;
|
||||
comm->connectSend[peer] |= (1<<channelId);
|
||||
comm->connect = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
p2plist->peerlist[info->root].sendbytes = nBytes;
|
||||
p2plist->peerlist[info->root].sendbuff = info->sendbuff;
|
||||
NCCLCHECK(enqueueP2pInfo(comm->p2pSends+info->root, (void*)info->sendbuff, nBytes));
|
||||
comm->p2pSendCount++;
|
||||
} else {
|
||||
if (peer != comm->rank) {
|
||||
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
if (comm->channels[channelId].peers[peer].recv.connected == 0) {
|
||||
p2plist->connect.recv[channelId*comm->nRanks+p2plist->connect.nrecv[channelId]++] = peer;
|
||||
comm->connectRecv[peer] |= (1<<channelId);
|
||||
comm->connect = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
p2plist->peerlist[info->root].recvbytes = nBytes;
|
||||
p2plist->peerlist[info->root].recvbuff = info->recvbuff;
|
||||
NCCLCHECK(enqueueP2pInfo(comm->p2pRecvs+info->root, info->recvbuff, nBytes));
|
||||
comm->p2pRecvCount++;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static int getSegment(struct ncclInfo* info, struct ncclWork* work) {
|
||||
const int e = (info->comm->topo->nodes[NET].count == 0 && info->comm->topo->type == RCCL_TOPO_4P2H_ROME)
|
||||
? 1 : NCCL_MAX_WORK_ELEMENTS;
|
||||
for (int s=0; s<e && work->elems[s].p2p.delta != info->delta; s++) {
|
||||
if (work->elems[s].p2p.nThreads == 0) return s;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Fill element `s` of `work` with the p2p operation described by `info`, then
// re-balance the per-element thread count across elements 0..s.
//
// Side effect on the input: info->nThreads is set to NCCL_MAX_NTHREADS.
// Always returns ncclSuccess.
static ncclResult_t saveP2pOp(struct ncclInfo* info /* input */, struct ncclWork* work, int s) {
  struct ncclWorkElem* slot = &work->elems[s];

  slot->comm = info->comm->devComm;
  slot->funcIndex = FUNC_INDEX_P2P;
  info->nThreads = NCCL_MAX_NTHREADS;
  slot->nThreads = info->nThreads;
  slot->sendbuff = info->sendbuff;
  slot->recvbuff = info->recvbuff;
  slot->opCount = info->comm->lastOpCount;
  slot->p2p.sendCount = info->sendbytes;
  slot->p2p.recvCount = info->recvbytes;
  slot->p2p.delta = info->delta;

  // Elements 0..s are in use. Start from 512 threads per element and halve until
  // the total across all used elements is at most 256.
  const int usedSlots = s + 1;
  int threadsPerSlot = 512;
  while (usedSlots * threadsPerSlot > 256) {
    threadsPerSlot /= 2;
  }
  //if (nThreads >= 128) nThreads += WARP_SIZE;

  // Every used element gets the same share, including the ones filled earlier.
  for (int k = 0; k < usedSlots; k++) {
    work->elems[k].p2p.nThreads = threadsPerSlot;
  }
  return ncclSuccess;
}
|
||||
|
||||
// Post the p2p operation in `info` onto channel info->channelId.
//
// The most recent work item of the channel is reused when it is a p2p item whose
// last element is still unused and getSegment() finds a free element for this
// operation; otherwise a fresh work item is obtained with getNextOp() and element
// 0 is used. The proxy side is then informed (ncclProxySaveP2p), the element is
// filled (saveP2pOp), and the launch dimensions in comm->myParams are raised so
// they cover this channel and info->nThreads.
ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info) {
  const int channelId = info->channelId;
  struct ncclChannel* channel = &info->comm->channels[channelId];

  // Try to reuse last p2p operation if not full yet
  int lastIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS;
  struct ncclWork* w = &channel->workFifo[lastIndex];

  const bool lastIsOpenP2p =
      channel->workCount &&
      w->elems[0].funcIndex == FUNC_INDEX_P2P &&
      w->elems[NCCL_MAX_WORK_ELEMENTS-1].p2p.nThreads == 0;

  // Try to pack more segments into a single operation
  int segment = lastIsOpenP2p ? getSegment(info, w) : -1;

  if (segment == -1) {
    // No room in the previous work item: start a new one.
    NCCLCHECK(getNextOp(channel, &w, NULL));
    segment = 0;
  }

  NCCLCHECK(ncclProxySaveP2p(info, channel, segment));
  NCCLCHECK(saveP2pOp(info, w, segment));

  // Grow (never shrink) the launch configuration.
  info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
  info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);

  return ncclSuccess;
}
|
||||
|
||||
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
|
||||
// Launch asynchronously if needed
|
||||
if (ncclAsyncMode()) {
|
||||
@@ -629,19 +712,22 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
|
||||
NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
|
||||
NCCLCHECKGOTO(checkSetStream(info), ret, end);
|
||||
|
||||
if (info->coll == ncclCollAllToAllv)
|
||||
if (info->coll == ncclFuncAllToAllv)
|
||||
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p sendcounts %p sdispls %p recvbuff %p recvcounts %p rdispls %p datatype %d typesize %zi op %d root %d comm %p [nranks=%d] stream %p",
|
||||
info->opName, info->comm->opCount, info->sendbuff, info->sendcounts, info->sdispls, info->recvbuff, info->recvcounts, info->rdispls,
|
||||
info->datatype, info->count, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
|
||||
else
|
||||
else if (info->coll != ncclFuncSendRecv)
|
||||
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
|
||||
info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
|
||||
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
|
||||
|
||||
if (info->coll == ncclCollSendRecv) { //p2p stored separately
|
||||
if (info->coll == ncclFuncSendRecv) { //p2p stored separately
|
||||
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
|
||||
info->opName, info->comm->lastOpCount, info->sendbuff, info->recvbuff, info->count,
|
||||
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
|
||||
NCCLCHECKGOTO(ncclSaveP2p(info), ret, end);
|
||||
} else {
|
||||
NCCLCHECKGOTO(ncclSaveKernel(info), ret, end);
|
||||
NCCLCHECKGOTO(ncclSaveAsyncColl(info), ret, end);
|
||||
}
|
||||
|
||||
end:
|
||||
@@ -653,7 +739,7 @@ end:
|
||||
NCCLCHECK(ArgsCheck(info));
|
||||
NCCLCHECK(checkSetStream(info));
|
||||
|
||||
if (info->coll == ncclCollAllToAllv)
|
||||
if (info->coll == ncclFuncAllToAllv)
|
||||
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p sendcounts %p sdispls %p recvbuff %p recvcounts %p rdispls %p datatype %d typesize %zi op %d root %d comm %p [nranks=%d] stream %p",
|
||||
info->opName, info->comm->opCount, info->sendbuff, info->sendcounts, info->sdispls, info->recvbuff, info->recvcounts, info->rdispls,
|
||||
info->datatype, info->count, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
|
||||
|
||||
+77
-102
@@ -25,14 +25,10 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
|
||||
for (int c=0; c<nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
channel->ring.prev = channel->ring.next = -1;
|
||||
channel->treeUp.up = -1;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeUp.down[i] = -1;
|
||||
channel->treeDn.up = -1;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeDn.down[i] = -1;
|
||||
channel->collTreeUp.up = -1;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeUp.down[i] = -1;
|
||||
channel->collTreeDn.up = -1;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeDn.down[i] = -1;
|
||||
channel->tree.up = -1;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
|
||||
channel->collTree.up = -1;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTree.down[i] = -1;
|
||||
|
||||
int* ringIntra = ringGraph->intra+c*localRanks;
|
||||
int* treeIntra = treeGraph->intra+c*localRanks;
|
||||
@@ -46,33 +42,21 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
|
||||
channel->ring.next = (i == localRanks-1) ? -1 : ringIntra[i+1];
|
||||
}
|
||||
if (treeIntra[i] == rank) {
|
||||
int recvIndex = 0, sendIndex = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
|
||||
int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
|
||||
int parentIndex = 0;
|
||||
int child0Index = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
|
||||
int child1Index = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
|
||||
|
||||
// Tree loop always flows in the same direction. Other trees are symmetric, i.e.
|
||||
// up/down go in reverse directions
|
||||
int sym = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP ? 0 : 1;
|
||||
|
||||
// Down tree is common
|
||||
topoRanks->treeDnRecv[c] = treeIntra[recvIndex];
|
||||
topoRanks->treeDnSend[c] = treeIntra[sendIndex];
|
||||
channel->treeDn.up = treeIntra[prev];
|
||||
channel->treeDn.down[0] = treeIntra[next];
|
||||
// Up tree depends on the pattern
|
||||
topoRanks->treeUpRecv[c] = sym ? topoRanks->treeDnSend[c] : topoRanks->treeDnRecv[c];
|
||||
topoRanks->treeUpSend[c] = sym ? topoRanks->treeDnRecv[c] : topoRanks->treeDnSend[c];
|
||||
channel->treeUp.down[0] = sym ? channel->treeDn.down[0] : channel->treeDn.up ;
|
||||
channel->treeUp.up = sym ? channel->treeDn.up : channel->treeDn.down[0];
|
||||
topoRanks->treeToParent[c] = treeIntra[parentIndex];
|
||||
topoRanks->treeToChild0[c] = treeIntra[child0Index];
|
||||
topoRanks->treeToChild1[c] = treeIntra[child1Index];
|
||||
channel->tree.up = i == 0 ? -1 : treeIntra[i-1];
|
||||
channel->tree.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1];
|
||||
}
|
||||
if (collNetIntra[i] == rank) {
|
||||
int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
|
||||
|
||||
// CollTrees are always symmetric, i.e.
|
||||
// up/down go in reverse directions
|
||||
channel->collTreeDn.up = collNetIntra[prev];
|
||||
channel->collTreeDn.down[0] = collNetIntra[next];
|
||||
channel->collTreeUp.down[0] = channel->collTreeDn.down[0];
|
||||
channel->collTreeUp.up = channel->collTreeDn.up;
|
||||
channel->collTree.up = collNetIntra[prev];
|
||||
channel->collTree.down[0] = collNetIntra[next];
|
||||
}
|
||||
}
|
||||
topoRanks->ringPrev[c] = channel->ring.prev;
|
||||
@@ -122,72 +106,66 @@ static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstR
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t setTreeUp(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int u0, int u1) {
|
||||
if (u0 != -1) tree0->up = indexes[u0];
|
||||
if (u1 != -1) tree1->up = indexes[u1];
|
||||
// Set the parent of `tree` to indexes[u]. A value of -1 for `u` means "no parent
// to set" and leaves tree->up unchanged. Always returns ncclSuccess.
static ncclResult_t setTreeUp(struct ncclTree* tree, int* indexes, int u) {
  if (u != -1) {
    tree->up = indexes[u];
  }
  return ncclSuccess;
}
|
||||
|
||||
static ncclResult_t addRanksDown(int* down, int* indexes, int r0, int r1) {
|
||||
static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
|
||||
if (d == -1) return ncclSuccess;
|
||||
int x = 0;
|
||||
if (down[x] >= 0) x++;
|
||||
if (down[x] >= 0) {
|
||||
WARN("Internal error : tree already has more than one child (%d %d %d)\n", down[0], down[1], down[2]);
|
||||
while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++;
|
||||
if (x == NCCL_MAX_TREE_ARITY) {
|
||||
WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]);
|
||||
return ncclInternalError;
|
||||
}
|
||||
if (r0 != -1) down[x++] = indexes[r0];
|
||||
if (r1 != -1) down[x++] = indexes[r1];
|
||||
tree->down[x] = indexes[d];
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t setTreeDown(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int d0_0, int d0_1, int d1_0, int d1_1) {
|
||||
NCCLCHECK(addRanksDown(tree0->down, indexes, d0_0, d0_1));
|
||||
NCCLCHECK(addRanksDown(tree1->down, indexes, d1_0, d1_1));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Cut the links of `tree` that involve `upRank`:
//  * if the first child (down[0]) is upRank, that child slot is cleared to -1;
//  * if this rank is upRank itself, its parent link is cleared to -1.
// Always returns ncclSuccess.
static ncclResult_t openRing(struct ncclTree* tree, int rank, int upRank) {
  const bool firstChildIsUpRank = (tree->down[0] == upRank);
  if (firstChildIsUpRank) {
    tree->down[0] = -1;
  }
  if (upRank == rank) {
    tree->up = -1;
  }
  return ncclSuccess;
}
|
||||
|
||||
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* treeUpSend, int* treeDnRecv, int* treeDnSend, int* firstRanks) {
|
||||
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* firstRanks, int* treePatterns) {
|
||||
const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node;
|
||||
int* indexesSend, *indexesRecv;
|
||||
NCCLCHECK(ncclCalloc(&indexesSend, nNodes));
|
||||
NCCLCHECK(ncclCalloc(&indexesRecv, nNodes));
|
||||
int* ranksToParent, *ranksToChild0, *ranksToChild1;
|
||||
NCCLCHECK(ncclCalloc(&ranksToParent, nNodes));
|
||||
NCCLCHECK(ncclCalloc(&ranksToChild0, nNodes));
|
||||
NCCLCHECK(ncclCalloc(&ranksToChild1, nNodes));
|
||||
|
||||
// Compute tree depth. Not an exact value but a good approximation in most
|
||||
// cases
|
||||
int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
|
||||
|
||||
int u0, d0_0, d0_1, u1, d1_0, d1_1;
|
||||
NCCLCHECK(ncclGetDtree(nNodes, node, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1));
|
||||
int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
|
||||
NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
|
||||
for (int c=0; c<nChannels; c++) {
|
||||
struct ncclChannel* channel0 = comm->channels+c;
|
||||
struct ncclChannel* channel1 = channel0+nChannels;
|
||||
NCCLCHECK(getIndexes(treeUpSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
|
||||
NCCLCHECK(getIndexes(treeUpRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
|
||||
NCCLCHECK(openRing(&channel0->treeUp, comm->rank, indexesSend[node]));
|
||||
NCCLCHECK(openRing(&channel1->treeUp, comm->rank, indexesSend[node]));
|
||||
int root = indexesSend[node];
|
||||
if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeUp, &channel1->treeUp, indexesRecv, u0, u1));
|
||||
if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeUp, &channel1->treeUp, indexesSend, d0_0, d0_1, d1_0, d1_1));
|
||||
NCCLCHECK(getIndexes(treeDnSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
|
||||
NCCLCHECK(getIndexes(treeDnRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
|
||||
NCCLCHECK(openRing(&channel0->treeDn, comm->rank, u0 == -1 ? root : indexesRecv[node]));
|
||||
NCCLCHECK(openRing(&channel1->treeDn, comm->rank, u1 == -1 ? root : indexesRecv[node]));
|
||||
if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeDn, &channel1->treeDn, indexesRecv, d0_0, d0_1, d1_0, d1_1));
|
||||
if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeDn, &channel1->treeDn, indexesSend, u0, u1));
|
||||
TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c, channel0->treeUp.up, channel0->treeUp.down[0], channel0->treeUp.down[1], channel0->treeUp.down[2]);
|
||||
TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c+nChannels, channel1->treeUp.up, channel1->treeUp.down[0], channel1->treeUp.down[1], channel1->treeUp.down[2]);
|
||||
TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c, channel0->treeDn.up, channel0->treeDn.down[0], channel0->treeDn.down[1], channel0->treeDn.down[2]);
|
||||
TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c+nChannels, channel1->treeDn.up, channel1->treeDn.down[0], channel1->treeDn.down[1], channel1->treeDn.down[2]);
|
||||
channel0->treeUp.depth = channel1->treeUp.depth = depth;
|
||||
NCCLCHECK(getIndexes(treeToParent+c*comm->nRanks, ranksToParent, nNodes, firstRanks));
|
||||
NCCLCHECK(getIndexes(treeToChild0+c*comm->nRanks, ranksToChild0, nNodes, firstRanks));
|
||||
NCCLCHECK(getIndexes(treeToChild1+c*comm->nRanks, ranksToChild1, nNodes, firstRanks));
|
||||
if (comm->rank == ranksToParent[node]) {
|
||||
NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ranksToChild0 : ranksToChild1, t0u));
|
||||
NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ranksToChild0 : ranksToChild1, t1u));
|
||||
}
|
||||
if (comm->rank == ranksToChild0[node]) {
|
||||
NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d0));
|
||||
NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d0));
|
||||
}
|
||||
if (comm->rank == ranksToChild1[node]) {
|
||||
NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d1));
|
||||
NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d1));
|
||||
}
|
||||
if (comm->rank == ranksToParent[node] ||
|
||||
comm->rank == ranksToChild0[node] ||
|
||||
comm->rank == ranksToChild1[node]) {
|
||||
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
|
||||
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
|
||||
}
|
||||
channel0->tree.depth = channel1->tree.depth = depth;
|
||||
}
|
||||
free(indexesSend);
|
||||
free(indexesRecv);
|
||||
free(ranksToParent);
|
||||
free(ranksToChild0);
|
||||
free(ranksToChild1);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -200,13 +178,13 @@ ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
// Set root of collTree to id nranks
|
||||
if (rank == collNetGraph->intra[sendIndex+c*comm->localRanks]) { // is master
|
||||
channel->collTreeUp.up = channel->collTreeDn.up = nranks;
|
||||
channel->collTree.up = nranks;
|
||||
}
|
||||
if (rank == collNetGraph->intra[sendEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
|
||||
channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
|
||||
channel->collTree.down[0] = -1;
|
||||
}
|
||||
channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
|
||||
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTreeUp.up, channel->collTreeUp.down[0]);
|
||||
channel->collTree.depth = depth;
|
||||
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTree.up, channel->collTree.down[0]);
|
||||
}
|
||||
int recvIndex = 0; // recv GPU index is always 0
|
||||
int recvEndIndex = (recvIndex+comm->localRanks-1)%comm->localRanks;
|
||||
@@ -214,13 +192,13 @@ ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclChannel* channel = comm->channels+comm->nChannels/2+c;
|
||||
// Set root of collTree to id nranks
|
||||
if (rank == collNetGraph->intra[recvIndex+c*comm->localRanks]) { // is master
|
||||
channel->collTreeUp.up = channel->collTreeDn.up = nranks;
|
||||
channel->collTree.up = nranks;
|
||||
}
|
||||
if (rank == collNetGraph->intra[recvEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
|
||||
channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
|
||||
channel->collTree.down[0] = -1;
|
||||
}
|
||||
channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
|
||||
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTreeDn.up, channel->collTreeDn.down[0]);
|
||||
channel->collTree.depth = depth;
|
||||
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTree.up, channel->collTree.down[0]);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -237,7 +215,7 @@ int ncclMinNchannels() {
|
||||
if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings();
|
||||
if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels();
|
||||
if (minNchannels > MAXCHANNELS) {
|
||||
WARN("User asked for a minimum of %d channels, limiting to %d\n", minNchannels, MAXCHANNELS);
|
||||
WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS);
|
||||
minNchannels = MAXCHANNELS;
|
||||
}
|
||||
if (minNchannels < 0) minNchannels = 0;
|
||||
@@ -249,41 +227,39 @@ int ncclMaxNchannels() {
|
||||
if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels();
|
||||
if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
|
||||
if (maxNchannels < 1) {
|
||||
WARN("User asked for a maximum of %d channels, setting it to 1\n", maxNchannels);
|
||||
WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels);
|
||||
maxNchannels = 1;
|
||||
}
|
||||
return maxNchannels;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets) {
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets) {
|
||||
// Gather data from all ranks
|
||||
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend;
|
||||
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1;
|
||||
int nranks = comm->nRanks;
|
||||
int nChannels = comm->nChannels;
|
||||
NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeUpRecv, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeUpSend, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeDnRecv, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeDnSend, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeToParent, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeToChild0, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeToChild1, nranks*MAXCHANNELS));
|
||||
for (int i=0; i<nranks; i++) {
|
||||
for (int c=0; c<nChannels;c++) {
|
||||
ringRecv[c*nranks+i] = allTopoRanks[i]->ringRecv[c];
|
||||
ringSend[c*nranks+i] = allTopoRanks[i]->ringSend[c];
|
||||
ringPrev[c*nranks+i] = allTopoRanks[i]->ringPrev[c];
|
||||
ringNext[c*nranks+i] = allTopoRanks[i]->ringNext[c];
|
||||
treeUpRecv[c*nranks+i] = allTopoRanks[i]->treeUpRecv[c];
|
||||
treeUpSend[c*nranks+i] = allTopoRanks[i]->treeUpSend[c];
|
||||
treeDnRecv[c*nranks+i] = allTopoRanks[i]->treeDnRecv[c];
|
||||
treeDnSend[c*nranks+i] = allTopoRanks[i]->treeDnSend[c];
|
||||
treeToParent[c*nranks+i] = allTopoRanks[i]->treeToParent[c];
|
||||
treeToChild0[c*nranks+i] = allTopoRanks[i]->treeToChild0[c];
|
||||
treeToChild1[c*nranks+i] = allTopoRanks[i]->treeToChild1[c];
|
||||
}
|
||||
}
|
||||
|
||||
// Connect rings and trees. This should also duplicate the channels.
|
||||
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks));
|
||||
NCCLCHECK(connectTrees(comm, treeUpRecv, treeUpSend, treeDnRecv, treeDnSend, firstRanks));
|
||||
NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, firstRanks, treePatterns));
|
||||
|
||||
// Duplicate ringPrev/ringNext for ncclBuildRing
|
||||
memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
|
||||
@@ -317,10 +293,9 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl
|
||||
free(ringSend);
|
||||
free(ringPrev);
|
||||
free(ringNext);
|
||||
free(treeUpRecv);
|
||||
free(treeUpSend);
|
||||
free(treeDnRecv);
|
||||
free(treeDnSend);
|
||||
free(treeToParent);
|
||||
free(treeToChild0);
|
||||
free(treeToChild1);
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
+28
-110
@@ -26,88 +26,10 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode*
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
WARN("Could not find node of type %d id %lx\n", t, id);
|
||||
WARN("Could not find node of type %d id %lx", t, id);
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
// [RCCL]
|
||||
// This function traverses only XGMI links (including multi-GPU hops) and builds them into the
|
||||
// topology system, which corresponds to how XGMI hardware operates
|
||||
// For every GPU in `system`, run a breadth-first search that follows LINK_NVL
// links only and records, on each node reached, the path leading back to that GPU.
//
// The base GPU's path to itself is set to zero hops, LOC_WIDTH, PATH_LOC. A path
// stored on a reached node is replaced only when the candidate is wider; its
// width is the minimum link width along the way and its type is PATH_NVL.
// GPUs are allowed as intermediate hops.
//
// Returns ncclSuccess, an allocation/lookup error propagated through NCCLCHECK,
// or ncclInternalError when a link has no matching reverse link.
static ncclResult_t ncclTopoSetXgmi(struct ncclTopoSystem* system)
{
  // Compute paths to GPU g
  for (int g=0; g<system->nodes[GPU].count; g++) {
    struct ncclTopoNode *baseNode = system->nodes[GPU].nodes+g;

    if (baseNode->paths[baseNode->type] == NULL) {
      NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
    }

    // breadth-first search to set all paths to that node in the system
    struct ncclTopoNodeList nodeList;
    struct ncclTopoNodeList nextNodeList;
    nodeList.count = 1; nodeList.list[0] = baseNode;
    nextNodeList.count = 0;
    struct ncclTopoLinkList* basePath;
    NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
    basePath->count = 0;
    basePath->width = LOC_WIDTH;
    basePath->type = PATH_LOC;

    while (nodeList.count) {
      nextNodeList.count = 0;
      for (int n=0; n<nodeList.count; n++) {
        struct ncclTopoNode* node = nodeList.list[n];
        struct ncclTopoLinkList* path;
        NCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
        for (int l=0; l<node->nlinks; l++) {
          struct ncclTopoLink* link = node->links+l;
          struct ncclTopoNode* remNode = link->remNode;

          // Skip non-XGMI links
          if (link->type != LINK_NVL) continue;

          if (remNode->paths[baseNode->type] == NULL) {
            NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count));
          }

          struct ncclTopoLinkList* remPath;
          NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
          float width = std::min(path->width, link->width);
          if (remPath->width < width) {
            // Find reverse link. The result is tracked in a local rather than by
            // inspecting remPath->list[0], which may still hold the first hop of a
            // previously stored (narrower) path and would hide a failed search.
            struct ncclTopoLink* reverseLink = NULL;
            for (int r=0; r<remNode->nlinks; r++) {
              if (remNode->links[r].remNode == node) {
                reverseLink = remNode->links+r;
                break;
              }
            }
            if (reverseLink == NULL) {
              WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
                   remNode->type, remNode->id, remNode->nlinks, node->type, node->id);
              return ncclInternalError;
            }
            remPath->list[0] = reverseLink;
            // Copy the rest of the path
            for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i];
            remPath->count = path->count + 1;
            remPath->width = width;
            remPath->type = PATH_NVL;

            // Add to the list for the next iteration if not already in the list.
            // In this case, GPUs are permitted as intermediate XGMI steps.
            // (A `continue` inside the scan loop would only advance the scan
            // itself, so the membership test is done with an explicit index.)
            int i;
            for (i=0; i<nextNodeList.count; i++) {
              if (nextNodeList.list[i] == remNode) break;
            }
            if (i == nextNodeList.count) {
              nextNodeList.list[nextNodeList.count++] = remNode;
            }
          }
        }
      }
      memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
    }
  }
  return ncclSuccess;
}
|
||||
// [/RCCL]
|
||||
|
||||
|
||||
static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
|
||||
if (baseNode->paths[baseNode->type] == NULL) {
|
||||
NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
|
||||
@@ -139,14 +61,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
|
||||
struct ncclTopoLinkList* remPath;
|
||||
NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
|
||||
float width = std::min(path->width, link->width);
|
||||
|
||||
// [RCCL] Do not let XGMI paths be overwritten (even if PCIe path may be faster)
|
||||
// Unless they are of shorter length
|
||||
// if (remPath->width < width) {
|
||||
bool notXGMI = remPath->type != PATH_NVL;
|
||||
if (remPath->width < width && notXGMI) {
|
||||
// [/RCCL]
|
||||
|
||||
if (remPath->width < width) {
|
||||
// Find reverse link
|
||||
for (int l=0; l<remNode->nlinks; l++) {
|
||||
if (remNode->links[l].remNode == node) {
|
||||
@@ -166,14 +81,13 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
|
||||
|
||||
// Start with path type = link type. PATH and LINK types are supposed to match.
|
||||
// Don't consider LINK_NET as we only care about the NIC->GPU path.
|
||||
int type = link->type == LINK_NET ? 0 : link->type;
|
||||
int type = link->type == LINK_NET ? LINK_LOC : link->type;
|
||||
// Differentiate between one and multiple PCI switches
|
||||
if (type == PATH_PIX && (node->type == PCI || link->remNode->type == PCI) && remPath->count > 3) type = PATH_PXB;
|
||||
if (node->type == PCI && remNode->type == PCI) type = PATH_PXB;
|
||||
// Consider a path going through the CPU as PATH_PHB
|
||||
if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB;
|
||||
// Ignore Power CPU in an NVLink path
|
||||
if (path->type == PATH_NVL && type == PATH_SYS && link->remNode->type == CPU &&
|
||||
link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) type = 0;
|
||||
// Set 1 hop NVLink as NVB
|
||||
//if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;
|
||||
|
||||
remPath->type = std::max(path->type, type);
|
||||
|
||||
@@ -303,7 +217,7 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
|
||||
if (l == -1) {
|
||||
char* str = getenv(levelEnv);
|
||||
if (str) {
|
||||
for (int i=0; i<PATH_NET; i++) {
|
||||
for (int i=0; i<=PATH_SYS; i++) {
|
||||
if (strcmp(str, topoPathTypeStr[i]) == 0) {
|
||||
l = i;
|
||||
break;
|
||||
@@ -325,9 +239,10 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
|
||||
}
|
||||
|
||||
int ncclTopoUserP2pLevel = -1;
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read) {
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) {
|
||||
*p2p = 0;
|
||||
*read = 0;
|
||||
if (read) *read = 0;
|
||||
if (intermediateRank) *intermediateRank = -1;
|
||||
|
||||
// Get GPUs from topology
|
||||
int g1, g2;
|
||||
@@ -337,7 +252,16 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
|
||||
// GPU not found, we can't use p2p.
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
// Set intermediate GPU rank, if routing through an intermediate GPU.
|
||||
struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2;
|
||||
if (path->count == 2) {
|
||||
struct ncclTopoNode* intermediateNode = path->list[0]->remNode;
|
||||
if (intermediateNode->type == GPU && intermediateRank) {
|
||||
*intermediateRank = intermediateNode->gpu.rank;
|
||||
}
|
||||
}
|
||||
|
||||
// In general, use P2P whenever we can.
|
||||
int p2pLevel = PATH_SYS;
|
||||
@@ -358,6 +282,9 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
|
||||
if (model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB;
|
||||
else p2pLevel = PATH_SYS;
|
||||
}
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
|
||||
p2pLevel = PATH_PXB;
|
||||
}
|
||||
|
||||
compare:
|
||||
// Compute the PCI distance and compare with the p2pLevel.
|
||||
@@ -366,7 +293,7 @@ compare:
|
||||
if (path->type == PATH_NVL) {
|
||||
struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2;
|
||||
// Enable P2P Read for Ampere/NVLink only
|
||||
if ((gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
|
||||
if (read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
@@ -392,9 +319,6 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
|
||||
if (read) { // For reads (sends) only enable under certain conditions
|
||||
int gdrReadParam = ncclParamNetGdrRead();
|
||||
if (gdrReadParam == 0) return ncclSuccess;
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
return ncclSuccess;
|
||||
#else
|
||||
if (gdrReadParam < 0) {
|
||||
int nvlink = 0;
|
||||
// Since we don't know whether there are other communicators,
|
||||
@@ -409,7 +333,6 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
|
||||
}
|
||||
if (!nvlink) return ncclSuccess;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Check if we are close enough that it makes sense to enable GDR
|
||||
@@ -445,10 +368,6 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
|
||||
NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system));
|
||||
}
|
||||
|
||||
// [RCCL] Add XGMI-only links between GPUs first before any other paths
|
||||
NCCLCHECK(ncclTopoSetXgmi(system));
|
||||
// [/RCCL]
|
||||
|
||||
// Set direct paths from/to GPUs.
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
// Compute paths to GPU g
|
||||
@@ -456,8 +375,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
|
||||
|
||||
// Update path when we don't want to / can't use GPU Direct P2P
|
||||
for (int p=0; p<system->nodes[GPU].count; p++) {
|
||||
int p2p, read;
|
||||
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, &read));
|
||||
int p2p;
|
||||
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
|
||||
if (p2p == 0) {
|
||||
// Divert all traffic through the CPU
|
||||
int cpu;
|
||||
@@ -565,8 +484,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*
|
||||
// Local rank
|
||||
path = system->nodes[GPU].nodes[peer].paths[GPU]+g;
|
||||
if (path->type == PATH_NVL) {
|
||||
int sm = system->nodes[GPU].nodes[g].gpu.cudaCompCap;
|
||||
double nvlWidth = sm < 70 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
|
||||
float nvlWidth = ncclTopoNVLinkSpeed(system->nodes[GPU].nodes[g].gpu.cudaCompCap);
|
||||
*nChannels = 2*std::max(1, (int)(path->width / nvlWidth));
|
||||
} else {
|
||||
*nChannels = 2;
|
||||
@@ -600,7 +518,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
|
||||
}
|
||||
}
|
||||
|
||||
if (comm->topo->type == RCCL_TOPO_4P2H_ROME) {
|
||||
if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_4P2H_ROME) {
|
||||
// Adjust P2P channels on Rome
|
||||
comm->p2pnChannelsPerPeer = 2;
|
||||
comm->p2pnChannels = 2;
|
||||
@@ -612,7 +530,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
|
||||
}
|
||||
|
||||
// Init channels that weren't used so far
|
||||
for (int c=comm->nChannels; c<comm->p2pnChannels; c++) NCCLCHECK(initChannel(comm, c));
|
||||
for (int c=comm->nChannels; c<std::max(comm->nChannels, comm->p2pnChannels); c++) NCCLCHECK(initChannel(comm, c));
|
||||
|
||||
// We want to spread channels used when there aren't many and progressively
|
||||
// fill the whole space of nChannels. To do so we mirror the bits in the
|
||||
|
||||
@@ -21,7 +21,7 @@ void dumpLine(int* values, int nranks, const char* prefix) {
|
||||
|
||||
ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
|
||||
for (int r=0; r<nrings; r++) {
|
||||
char prefix[30];
|
||||
char prefix[40];
|
||||
/*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
|
||||
dumpLine(prev+r*nranks, nranks, prefix);
|
||||
sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
|
||||
|
||||
@@ -20,8 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#define MAX_ROME_GPUS 8
|
||||
#define MAX_ROME_NICS 2
|
||||
#define MAX_ROME_GPUS 16
|
||||
#define MAX_ROME_NICS 8
|
||||
|
||||
struct rcclRomeModel {
|
||||
int nGpus;
|
||||
@@ -235,6 +235,28 @@ static struct rcclRomeModel rome_model_46 = {
|
||||
.ringBase = "6 5 7 4 1 2 3 0|7 4 6 5 1 0 3 2",
|
||||
};
|
||||
|
||||
// Reference Rome topology model 48: 8 GPUs on 4 CPUs with no NICs.
// gpuNuma places two GPUs on each of the CPU ids 0..3.
// connMatrix holds 64 entries (8 x 8); read row by row, GPUs 0-3 are mutually
// connected and GPUs 4-7 are mutually connected, with no entries set between
// the two groups -- presumably row/column order follows gpuIds; confirm.
// pattern has two digits per CPU ("20" = 2 GPUs, 0 NICs), the same layout
// parseRomeSystem writes into its screening pattern.
static struct rcclRomeModel rome_model_48 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3,
|
||||
.gpuIds = { 0x4a000, 0x50000, 0xa000, 0xf000, 0xcb000, 0xd1000, 0x8a000, 0x90000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
|
||||
.pattern = "20202020",
|
||||
// Four '|'-separated orderings of model GPU indices: 0..7 ascending and descending, each listed twice.
.ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0",
|
||||
};
|
||||
|
||||
// Reference Rome topology model 49: same GPU ids, gpuNuma and connMatrix as
// rome_model_48, plus 4 NICs with one NIC on each of the CPU ids 0..3 (nicNuma).
// pattern has two digits per CPU ("21" = 2 GPUs, 1 NIC), the same layout
// parseRomeSystem writes into its screening pattern.
static struct rcclRomeModel rome_model_49 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 4, .nLinks = 3,
|
||||
.gpuIds = { 0x4a000, 0x50000, 0xa000, 0xf000, 0xcb000, 0xd1000, 0x8a000, 0x90000, },
|
||||
.nicIds = { 0x45000, 0x13000, 0xc6000, 0x85000, },
|
||||
.gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
|
||||
.nicNuma = { 0, 1, 2, 3, },
|
||||
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
|
||||
.pattern = "21212121",
|
||||
// Two '|'-separated orderings; the N0 / N3 tokens presumably name NIC indices at the ring ends -- confirm against the ringBase parser.
.ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel romeTopoModels[] = {
|
||||
rome_model_22,
|
||||
rome_model_25,
|
||||
@@ -254,4 +276,6 @@ static struct rcclRomeModel romeTopoModels[] = {
|
||||
rome_model_44,
|
||||
rome_model_45,
|
||||
rome_model_46,
|
||||
rome_model_48,
|
||||
rome_model_49,
|
||||
};
|
||||
|
||||
+113
-79
@@ -25,9 +25,18 @@ static float getMaxWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu
|
||||
}
|
||||
return maxWidth;
|
||||
}
|
||||
// Returns a per-GPU width figure: the larger of
//   - the sum of link->width over all of gpu's LINK_NVL links, and
//   - the width of a LINK_PCI link of gpu (see the note on pciWidth below).
// Returns 0.0 when gpu has neither kind of link.
// `system` is accepted but not read by this function.
static float getTotalWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
|
||||
float nvlinkWidth = 0.0, pciWidth = 0.0;
|
||||
for (int l=0; l<gpu->nlinks; l++) {
|
||||
struct ncclTopoLink* link = gpu->links+l;
|
||||
if (link->type == LINK_NVL) nvlinkWidth += link->width;  // accumulated across all NVL links
|
||||
if (link->type == LINK_PCI) pciWidth = link->width;  // assigned, not summed: the last PCI link in gpu->links wins
|
||||
}
|
||||
return std::max(pciWidth, nvlinkWidth);
|
||||
}
|
||||
ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
|
||||
system->maxWidth = 0.0;
|
||||
system->type = 0;
|
||||
system->totalWidth = 0.0;
|
||||
int inter = system->nodes[NET].count;
|
||||
if (inter == 0 && system->nodes[GPU].count == 1) {
|
||||
system->maxWidth = LOC_WIDTH;
|
||||
@@ -36,6 +45,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
system->maxWidth = std::max(system->maxWidth, getMaxWidth(system, gpu, inter ? NET : GPU));
|
||||
system->totalWidth = std::max(system->totalWidth, getTotalWidth(system, gpu));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -48,7 +58,7 @@ static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode*
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
WARN("Could not find rev link for %d/%d -> %d/%d\n", node1->type, node1->id, node2->type, node2->id);
|
||||
WARN("Could not find rev link for %d/%ld -> %d/%ld", node1->type, node1->id, node2->type, node2->id);
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
@@ -149,9 +159,6 @@ struct ncclGpuScore {
|
||||
int intraNhops;
|
||||
int intraWidth;
|
||||
int interNhops;
|
||||
// [RCCL]
|
||||
int intraType; // New sort parameter to favor XGMI
|
||||
// [/RCCL]
|
||||
int interPciWidth;
|
||||
int interWidth; // Most important
|
||||
};
|
||||
@@ -162,9 +169,6 @@ static int cmpScore(const void * g1, const void * g2) {
|
||||
int d;
|
||||
if ((d = (s2->interWidth - s1->interWidth))) return d;
|
||||
if ((d = (s2->interPciWidth - s1->interPciWidth))) return d;
|
||||
// [RCCL]
|
||||
if ((d = (s1->intraType - s2->intraType))) return d; // Prefer XGMI over any other types
|
||||
// [/RCCL]
|
||||
if ((d = (s1->interNhops - s2->interNhops))) return d;
|
||||
if ((d = (s2->intraWidth - s1->intraWidth))) return d;
|
||||
if ((d = (s1->intraNhops - s2->intraNhops))) return d;
|
||||
@@ -187,7 +191,7 @@ static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank, int* in
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
WARN("Could not find gpu rank %d\n", rank);
|
||||
WARN("Could not find gpu rank %d", rank);
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
@@ -198,7 +202,7 @@ static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int*
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
WARN("Could not find net id %lx\n", id);
|
||||
WARN("Could not find net id %lx", id);
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
@@ -224,25 +228,11 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
|
||||
for (int i=1; i<ngpus; i++) {
|
||||
int g = (start+i)%ngpus;
|
||||
if (paths[g].count == 0) continue; // There is no path to that GPU
|
||||
// [RCCL] - Prune earlier for performance
|
||||
{
|
||||
if (paths[g].type > graph->typeIntra) continue; // Skip if the intra path type is already slower than the current target
|
||||
if (paths[g].width < graph->speedIntra) continue;
|
||||
if (netPaths)
|
||||
{
|
||||
if (netPaths[g].type > graph->typeInter) continue; // Skip if the inter path type is already slower than the current target
|
||||
if (netPaths[g].width < graph->speedInter) continue;
|
||||
}
|
||||
}
|
||||
// [/RCCL]
|
||||
if (system->nodes[GPU].nodes[g].used & flag) continue;
|
||||
scores[count].g = g;
|
||||
scores[count].startIndex = i;
|
||||
scores[count].intraNhops = paths[g].count;
|
||||
scores[count].intraWidth = paths[g].width;
|
||||
// [RCCL] - Add path type as sort factor
|
||||
scores[count].intraType = paths[g].type;
|
||||
// [/RCCL]
|
||||
if (netPaths) {
|
||||
scores[count].interNhops = netPaths[g].count;
|
||||
scores[count].interPciWidth = gpuPciWidth(system->nodes[GPU].nodes+g);
|
||||
@@ -293,7 +283,6 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time, int type, int index, int g) {
|
||||
const uint64_t flag = 1ULL<<(graph->nChannels);
|
||||
struct ncclTopoNode* gpu;
|
||||
|
||||
NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, 1, &gpu));
|
||||
if (gpu) {
|
||||
gpu->used ^= flag;
|
||||
@@ -304,6 +293,35 @@ ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Counts direct LINK_NVL hops between consecutive entries of graph->intra.
//
// For every channel c in [0, graph->nChannels) and every position i in
// [0, ngpus), takes the entry at position i (g) and the next entry, wrapping
// from the last position back to the first (n). Both are compared against
// gpu.rank, so graph->intra is treated as holding ranks. The count is
// incremented when the GPU node whose rank is g has a one-link GPU path whose
// link is of type LINK_NVL and whose remote node has rank n.
//
// Pairs whose first rank is not found among system->nodes[GPU] contribute
// nothing. Returns the total over all channels.
// NOTE(review): the name says XGMI while the test is on LINK_NVL; presumably
// the same link type on HIP builds -- confirm.
static int ncclTopoCountXGMI(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
int count = 0;
|
||||
for (int c=0; c<graph->nChannels; c++) {
|
||||
for (int i=0; i<ngpus; i++) {
|
||||
int g = graph->intra[ngpus*c+i];  // rank at position i of channel c
|
||||
int n = graph->intra[ngpus*c+((i+1)%ngpus)];  // rank at the following position, wrapping around
|
||||
struct ncclTopoNode *node;
|
||||
int j;
|
||||
// Linear search for the GPU node index whose rank is g; j == ngpus means not found.
for (j=0; j<ngpus; j++)
|
||||
if (system->nodes[GPU].nodes[j].gpu.rank == g) break;
|
||||
if (j<ngpus) {
|
||||
node = system->nodes[GPU].nodes+j;
|
||||
for (int k = 0; k<system->nodes[GPU].count; k++) {
|
||||
if (node->paths[GPU][k].count == 1) {  // only single-link paths are considered
|
||||
struct ncclTopoLink* link = node->paths[GPU][k].list[0];
|
||||
struct ncclTopoNode* remNode = link->remNode;
|
||||
if (remNode->gpu.rank == n) {
|
||||
if (link->type == LINK_NVL)
|
||||
count ++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
|
||||
// 1. Constraint to get the same nChannels between Rings and Trees
|
||||
if (graph->nChannels < graph->minChannels) return ncclSuccess;
|
||||
@@ -317,6 +335,9 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
|
||||
// 3. Less hops (but not at the price of going cross NICs)
|
||||
if (graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
|
||||
|
||||
// 4. Prefer graph with more XGMI connections
|
||||
if (graph->nChannels == refGraph->nChannels
|
||||
&& ncclTopoCountXGMI(system, refGraph) < ncclTopoCountXGMI(system, graph)) *copy = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -352,11 +373,26 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
|
||||
if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
|
||||
|
||||
// Balanced Tree : count half of the bandwidth on first two GPUs
|
||||
int nextBackToNet = -1;
|
||||
float speedInterSave = graph->speedInter;
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
|
||||
// Count half of the bandwidth on each of the first two GPUs
|
||||
if (step == 0) nextBackToNet = 1;
|
||||
else if (net->id != graph->inter[graph->nChannels*2+1]) continue;
|
||||
graph->speedInter /= 2;
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
|
||||
graph->speedInter = speedInterSave;
|
||||
if (net) {
|
||||
graph->inter[graph->nChannels*2+1] = net->id;
|
||||
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, time));
|
||||
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time));
|
||||
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->speedInter /= 2;
|
||||
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
|
||||
graph->speedInter = speedInterSave;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -427,9 +463,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
if (paths[i].count < paths[f].count) f = i;
|
||||
int t = 1 << 10;
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, (f == 0) ? FORCED_ORDER_PCI : 0, &t, NET, n, f));
|
||||
// [RCCL] Even if forced order PCI is found, continue the search instead of ending early
|
||||
// if (t == -1) *time = -1;
|
||||
// [/RCCL]
|
||||
if (t == -1) *time = -1;
|
||||
}
|
||||
|
||||
// Then try the most local GPUs
|
||||
@@ -493,13 +527,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
|
||||
if (system->nodes[NET].count) {
|
||||
if (pattern == NCCL_TOPO_PATTERN_RING) *backToNet = system->nodes[GPU].count-1;
|
||||
else if (pattern == NCCL_TOPO_PATTERN_TREE) *backToNet = 0;
|
||||
else *backToNet = 1;
|
||||
if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1;
|
||||
else *backToFirstRank = -1;
|
||||
else if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) *backToNet = 1;
|
||||
else *backToNet = 0;
|
||||
*backToFirstRank = -1;
|
||||
} else {
|
||||
*backToNet = -1;
|
||||
if (pattern == NCCL_TOPO_PATTERN_RING || pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1;
|
||||
if (pattern == NCCL_TOPO_PATTERN_RING) *backToFirstRank = system->nodes[GPU].count-1;
|
||||
else *backToFirstRank = -1;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -513,14 +546,6 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
|
||||
ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time);
|
||||
} else {
|
||||
// Intra-node only.
|
||||
// [RCCL] - Instead of trying PCI ordering, or replaying, just go straight to searching
|
||||
{
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
// [/RCCL]
|
||||
if (graph->nChannels == 0) {
|
||||
// Try PCI order first
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0));
|
||||
@@ -544,7 +569,7 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
|
||||
/* User defined graph from XML file */
|
||||
/************************************/
|
||||
|
||||
struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "LOC", PATH_LOC }, { NULL, 0 } };
|
||||
struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "NVB", PATH_NVB}, { "LOC", PATH_LOC }, { NULL, 0 } };
|
||||
ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
int* inter = graph->inter+2*c;
|
||||
@@ -839,17 +864,6 @@ static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclT
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Counts the GPUs and NICs whose path to one CPU node has exactly two links.
//
// id      : CPU id, resolved to a CPU node index through ncclTopoIdToIndex.
// g, n    : outputs; both are reset to 0 first, then receive the GPU count
//           and the NIC count respectively.
// nnet    : number of entries of net_map to examine.
// net_map : entries are used directly as indices into system->nodes[NET].nodes.
//
// Returns false (with *g and *n left at 0) when ncclTopoIdToIndex reports
// ncclInternalError, true otherwise.
// NOTE(review): parseRomeSystem passes net_map[i] through
// ncclTopoIdToIndex(system, NET, ...) before indexing, i.e. treats the entries
// as ids rather than indices; confirm which interpretation is intended here.
static bool getGpuNetCount(struct ncclTopoSystem* system, int id, int *g, int *n, int nnet, int *net_map) {
|
||||
*g = 0; *n = 0;
|
||||
int i;
|
||||
if (ncclTopoIdToIndex(system, CPU, id, &i) == ncclInternalError) return false;
|
||||
for (int j = 0; j < nnet; j++)
|
||||
if (system->nodes[NET].nodes[net_map[j]].paths[CPU][i].count == 2) (*n)++;
|
||||
for (int j = 0; j < system->nodes[GPU].count; j++)
|
||||
if (system->nodes[GPU].nodes[j].paths[CPU][i].count == 2) (*g)++;
|
||||
return true;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclGpuIdToIndex(struct ncclTopoSystem* system, int id, int* index) {
|
||||
*index = -1;
|
||||
for (int i=0; i<system->nodes[GPU].count; i++) {
|
||||
@@ -868,12 +882,18 @@ static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRo
|
||||
romeTopo->nNics = 0;
|
||||
romeTopo->nLinks = 0;
|
||||
for (int i = 0; i < romeTopo->nGpus; i ++) {
|
||||
int gpu, n;
|
||||
int gpu, n, m, distance;
|
||||
NCCLCHECK(ncclGpuIdToIndex(system, i, &gpu));
|
||||
romeTopo->gpuIds[i] = system->nodes[GPU].nodes[gpu].id;
|
||||
for (n = 0; n < romeTopo->nCpus; n++)
|
||||
if (system->nodes[GPU].nodes[gpu].paths[CPU][n].count == 2) break;
|
||||
if (n < romeTopo->nCpus) romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[n].id;
|
||||
m = 0;
|
||||
distance = system->nodes[GPU].nodes[gpu].paths[CPU][m].count;
|
||||
for (n = 1; n < romeTopo->nCpus; n++) {
|
||||
if (system->nodes[GPU].nodes[gpu].paths[CPU][n].count < distance) {
|
||||
distance = system->nodes[GPU].nodes[gpu].paths[CPU][n].count;
|
||||
m = n;
|
||||
}
|
||||
}
|
||||
if (m < romeTopo->nCpus) romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[m].id;
|
||||
|
||||
struct ncclTopoNode* node = system->nodes[GPU].nodes+gpu;
|
||||
if (node->paths[GPU] == NULL) continue;
|
||||
@@ -911,24 +931,32 @@ static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRo
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < romeTopo->nNics; i ++) {
|
||||
int net, n, m, distance;
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, NET, net_map[i], &net));
|
||||
m = 0;
|
||||
distance = system->nodes[NET].nodes[net].paths[CPU][m].count;
|
||||
for (n = 0; n < romeTopo->nCpus; n++)
|
||||
if (system->nodes[NET].nodes[net].paths[CPU][n].count < distance) {
|
||||
distance = system->nodes[NET].nodes[net].paths[CPU][n].count;
|
||||
m = n;
|
||||
}
|
||||
if (m < romeTopo->nCpus) romeTopo->nicNuma[i] = system->nodes[CPU].nodes[m].id;
|
||||
else return ncclSuccess;
|
||||
}
|
||||
|
||||
// number of GPUs and NICs on each numa node is used as first screening pattern
|
||||
for (int i = 0; i < romeTopo->nCpus; i++) {
|
||||
int g, n;
|
||||
getGpuNetCount(system, i, &g, &n, romeTopo->nNics, net_map);
|
||||
int g = 0, n = 0;
|
||||
for (int j = 0; j < romeTopo->nGpus; j++)
|
||||
if (romeTopo->gpuNuma[j] == i) g++;
|
||||
for (int j = 0; j < romeTopo->nNics; j++)
|
||||
if (romeTopo->nicNuma[j] == i) n++;
|
||||
pattern[i*2] = '0' + g;
|
||||
pattern[i*2+1] = '0' + n;
|
||||
}
|
||||
pattern[romeTopo->nCpus*2] = 0;
|
||||
|
||||
for (int i = 0; i < romeTopo->nNics; i ++) {
|
||||
int net, n;
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, NET, net_map[i], &net));
|
||||
for (n = 0; n < romeTopo->nCpus; n++)
|
||||
if (system->nodes[NET].nodes[net].paths[CPU][n].count == 2) break;
|
||||
if (n < romeTopo->nCpus) romeTopo->nicNuma[i] = system->nodes[CPU].nodes[n].id;
|
||||
else return ncclSuccess;
|
||||
}
|
||||
|
||||
const char* romeModelFile = getenv("RCCL_DUMP_ROME_MODEL_FILE");
|
||||
if (romeModelFile) {
|
||||
INFO(NCCL_ENV, "RCCL_DUMP_ROME_MODEL_FILE set by environment to %s", romeModelFile);
|
||||
@@ -1064,7 +1092,7 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopo
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
float speedArray[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
|
||||
#else
|
||||
float speedArray[] = { 42.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
|
||||
float speedArray[] = { 42.0, 30.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
|
||||
#endif
|
||||
#define NSPEEDS (sizeof(speedArray)/sizeof(float))
|
||||
|
||||
@@ -1111,11 +1139,20 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
|
||||
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
// TODO: benchmark balanced tree vs split tree
|
||||
//if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
|
||||
#else
|
||||
// SPLIT_TREE works better on older archs.
|
||||
int ccMin;
|
||||
NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
|
||||
if (ccMin < 80 && graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
|
||||
#endif
|
||||
|
||||
struct ncclTopoGraph tmpGraph;
|
||||
memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
|
||||
|
||||
// First try crossnic, then decrease speed and finally increase speedIntra.
|
||||
tmpGraph.pattern = graph->pattern;
|
||||
int pass = 1;
|
||||
int speedIndex = 0;
|
||||
while (speedArray[speedIndex] > system->maxWidth && speedIndex < NSPEEDS-1) speedIndex++;
|
||||
@@ -1130,7 +1167,7 @@ search:
|
||||
|
||||
NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time));
|
||||
#if 0
|
||||
printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : "");
|
||||
printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
|
||||
for (int c=0; c<graph->nChannels; c++) {
|
||||
printf("%2d : ", c);
|
||||
for (int g=0; g<ngpus; g++) {
|
||||
@@ -1140,7 +1177,8 @@ search:
|
||||
}
|
||||
#endif
|
||||
// Optimal solution, stop here
|
||||
if (graph->nChannels == graph->maxChannels && graph->speedInter == system->maxWidth) goto done;
|
||||
if (time == -1) goto done;
|
||||
if (graph->nChannels*graph->speedInter >= system->totalWidth) goto done;
|
||||
|
||||
if (pass == 1) {
|
||||
// First pass, we don't have a solution yet ; try other options
|
||||
@@ -1154,7 +1192,7 @@ search:
|
||||
|
||||
if (time != -1) globalTimeout += time;
|
||||
else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
|
||||
if (globalTimeout < 0) goto done;
|
||||
if (globalTimeout < 0 && graph->nChannels) goto done;
|
||||
|
||||
int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
|
||||
if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
|
||||
@@ -1169,10 +1207,6 @@ search:
|
||||
tmpGraph.typeInter = PATH_PIX;
|
||||
|
||||
// Try a simpler tree
|
||||
if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) {
|
||||
tmpGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
|
||||
goto search;
|
||||
}
|
||||
if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
|
||||
tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
|
||||
goto search;
|
||||
@@ -1222,7 +1256,7 @@ done:
|
||||
}
|
||||
|
||||
if (graph->nChannels == 0 && graph->collNet == 0) {
|
||||
WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern);
|
||||
WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
|
||||
for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
|
||||
graph->inter[0] = graph->inter[1] = 0;
|
||||
graph->speedIntra = graph->speedInter = 0.1;
|
||||
|
||||
+41
-10
@@ -20,18 +20,17 @@
|
||||
#endif
|
||||
#include "xml.h"
|
||||
#include "cpuset.h"
|
||||
#include <numa.h>
|
||||
|
||||
#define BUSID_SIZE (sizeof("0000:00:00.0"))
|
||||
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
|
||||
|
||||
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
const char* topoLinkTypeStr[] = { "LOC", "XGMI", "PCI", "", "", "SYS", "NET" };
|
||||
const char* topoPathTypeStr[] = { "LOC", "XGMI", "PIX", "PXB", "PHB", "SYS", "NET" };
|
||||
const char* topoLinkTypeStr[] = { "LOC", "XGMI", "", "PCI", "", "", "SYS", "NET" };
|
||||
const char* topoPathTypeStr[] = { "LOC", "XGMI", "NVB", "PIX", "PXB", "PHB", "SYS" };
|
||||
#else
|
||||
const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "", "", "SYS", "NET" };
|
||||
const char* topoPathTypeStr[] = { "LOC", "NVL", "PIX", "PXB", "PHB", "SYS", "NET" };
|
||||
const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "SYS", "NET" };
|
||||
const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PHB", "SYS" };
|
||||
#endif
|
||||
|
||||
/******************************************************************/
|
||||
@@ -83,6 +82,9 @@ static ncclResult_t ncclTopoGetInterCpuWidth(struct ncclTopoNode* cpu, float* wi
|
||||
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
|
||||
*width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_WIDTH : QPI_WIDTH;
|
||||
}
|
||||
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
|
||||
*width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_WIDTH : ZPI_WIDTH;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -105,7 +107,7 @@ ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode*
|
||||
|
||||
ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
|
||||
if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
|
||||
WARN("Error : tried to create too many nodes of type %d\n", type);
|
||||
WARN("Error : tried to create too many nodes of type %d", type);
|
||||
return ncclInternalError;
|
||||
}
|
||||
struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
|
||||
@@ -226,7 +228,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
|
||||
INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f ===", s->maxWidth);
|
||||
INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f totalWidth %2.1f ===", s->maxWidth, s->totalWidth);
|
||||
char line[1024];
|
||||
for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0));
|
||||
INFO(NCCL_GRAPH, "==========================================");
|
||||
@@ -380,7 +382,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
|
||||
}
|
||||
|
||||
struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } };
|
||||
struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { NULL, 0 } };
|
||||
struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { "CentaurHauls", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { " Shanghai ", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { NULL, 0 } };
|
||||
|
||||
ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
|
||||
int numaId;
|
||||
@@ -403,6 +405,11 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
|
||||
NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
|
||||
NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
|
||||
cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
|
||||
} else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
|
||||
int familyId, modelId;
|
||||
NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
|
||||
NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
|
||||
if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG;
|
||||
}
|
||||
if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) {
|
||||
int familyId, modelId;
|
||||
@@ -487,7 +494,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
|
||||
NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
|
||||
NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
|
||||
if (gpu == NULL) {
|
||||
WARN("Add NVLink error : could not find GPU %lx\n", pBusId);
|
||||
WARN("Add NVLink error : could not find GPU %lx", pBusId);
|
||||
return ncclInternalError;
|
||||
}
|
||||
int count;
|
||||
@@ -515,7 +522,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
|
||||
}
|
||||
}
|
||||
if (remote) {
|
||||
int nvlSpeed = gpu->gpu.cudaCompCap == 60 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
|
||||
float nvlSpeed = ncclTopoNVLinkSpeed(gpu->gpu.cudaCompCap);
|
||||
NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed));
|
||||
if (remote->type != GPU) {
|
||||
NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed));
|
||||
@@ -600,6 +607,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
struct ncclXmlNode* node;
|
||||
NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
|
||||
if (node == NULL) continue;
|
||||
NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
|
||||
NCCLCHECK(xmlSetAttrInt(node, "rank", r));
|
||||
NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
|
||||
}
|
||||
@@ -614,6 +622,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
NCCLCHECK(collNetGetProperties(n, &props));
|
||||
struct ncclXmlNode* netNode;
|
||||
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
|
||||
@@ -631,6 +640,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
NCCLCHECK(ncclNetGetProperties(n, &props));
|
||||
struct ncclXmlNode* netNode;
|
||||
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
|
||||
@@ -639,6 +649,9 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
|
||||
}
|
||||
|
||||
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
|
||||
NCCLCHECK(ncclTopoTrimXml(xml));
|
||||
|
||||
xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
|
||||
if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
|
||||
INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
|
||||
@@ -747,3 +760,21 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank) {
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Writes the number of NET nodes recorded in the topology system into *count.
// Always returns ncclSuccess.
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) {
  const int netNodeCount = system->nodes[NET].count;
  *count = netNodeCount;
  return ncclSuccess;
}
|
||||
|
||||
// Reports the lowest and highest cudaCompCap found among the GPU nodes of the system.
// Either output pointer may be NULL when the caller does not need that bound.
// Returns ncclInternalError when the system has no GPU node, ncclSuccess otherwise.
ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax) {
  const int nGpus = system->nodes[GPU].count;
  if (nGpus == 0) return ncclInternalError;

  // Seed both bounds with the first GPU, then widen them with the remaining ones.
  int lowest = system->nodes[GPU].nodes[0].gpu.cudaCompCap;
  int highest = lowest;
  for (int g = 1; g < nGpus; g++) {
    const int cc = system->nodes[GPU].nodes[g].gpu.cudaCompCap;
    if (cc < lowest) lowest = cc;
    if (cc > highest) highest = cc;
  }

  if (ccMin != NULL) *ccMin = lowest;
  if (ccMax != NULL) *ccMax = highest;
  return ncclSuccess;
}
|
||||
|
||||
+30
-13
@@ -13,15 +13,19 @@
|
||||
#include <sched.h>
|
||||
|
||||
#define LOC_WIDTH 5000.0
|
||||
#define PASCAL_NVLINK_WIDTH 18.0
|
||||
#define VOLTA_NVLINK_WIDTH 21.0
|
||||
#define SM60_NVLINK_WIDTH 18.0
|
||||
#define SM70_NVLINK_WIDTH 21.0
|
||||
#define SM80_NVLINK_WIDTH 21.0
|
||||
#define SM86_NVLINK_WIDTH 12.0
|
||||
#define PCI_WIDTH 12.0 // PCI Gen3 x16
|
||||
#define QPI_WIDTH 6.0
|
||||
#define SKL_QPI_WIDTH 9.0
|
||||
#define ZPI_WIDTH 6.0
|
||||
#define YONGFENG_ZPI_WIDTH 9.0
|
||||
#define P9_WIDTH 32.0
|
||||
#define ARM_WIDTH 6.0
|
||||
#define NET_WIDTH 12.0 // 100Gbit
|
||||
#define VEGA_XGMI_WIDTH 20.0
|
||||
#define VEGA_XGMI_WIDTH 24.0
|
||||
|
||||
// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
|
||||
// to GPU traffic consumes more PCI bandwidth.
|
||||
@@ -40,20 +44,21 @@ extern const char* topoNodeTypeStr[];
|
||||
// We want link types and path types to match as much as possible
|
||||
#define LINK_LOC 0
|
||||
#define LINK_NVL 1
|
||||
#define LINK_PCI 2
|
||||
// Skipping 3 for PATH_PXB
|
||||
// Skipping 4 for PATH_PHB
|
||||
#define LINK_SYS 5
|
||||
#define LINK_NET 6
|
||||
// Skipping 2 for PATH_NVB
|
||||
#define LINK_PCI 3
|
||||
// Skipping 4 for PATH_PXB
|
||||
// Skipping 5 for PATH_PHB
|
||||
#define LINK_SYS 6
|
||||
#define LINK_NET 7
|
||||
extern const char* topoLinkTypeStr[];
|
||||
|
||||
#define PATH_LOC 0
|
||||
#define PATH_NVL 1
|
||||
#define PATH_PIX 2
|
||||
#define PATH_PXB 3
|
||||
#define PATH_PHB 4
|
||||
#define PATH_SYS 5
|
||||
#define PATH_NET 6
|
||||
#define PATH_NVB 2
|
||||
#define PATH_PIX 3
|
||||
#define PATH_PXB 4
|
||||
#define PATH_PHB 5
|
||||
#define PATH_SYS 6
|
||||
extern const char* topoPathTypeStr[];
|
||||
|
||||
struct ncclTopoNode;
|
||||
@@ -125,6 +130,7 @@ struct ncclTopoNodeSet {
|
||||
struct ncclTopoSystem {
|
||||
struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
|
||||
float maxWidth;
|
||||
float totalWidth;
|
||||
int type;
|
||||
};
|
||||
|
||||
@@ -141,6 +147,8 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
|
||||
ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels);
|
||||
ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml);
|
||||
|
||||
ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax);
|
||||
|
||||
static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) {
|
||||
*index = -1;
|
||||
for (int i=0; i<system->nodes[type].count; i++) {
|
||||
@@ -163,4 +171,13 @@ static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank,
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
// Returns NVLink speed in GB/s
|
||||
static float ncclTopoNVLinkSpeed(int cudaCompCap) {
|
||||
return
|
||||
cudaCompCap == 86 ? SM86_NVLINK_WIDTH :
|
||||
cudaCompCap >= 80 ? SM80_NVLINK_WIDTH :
|
||||
cudaCompCap >= 70 ? SM70_NVLINK_WIDTH :
|
||||
cudaCompCap >= 60 ? SM60_NVLINK_WIDTH :
|
||||
SM80_NVLINK_WIDTH;
|
||||
}
|
||||
#endif
|
||||
|
||||
+28
-25
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -28,7 +28,7 @@
|
||||
* / \ / \ / \ \
|
||||
* 1 3 5 7 9 11 13
|
||||
*/
|
||||
ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
|
||||
ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) {
|
||||
int up, down0, down1;
|
||||
int bit;
|
||||
for (bit=1; bit<nranks; bit<<=1) {
|
||||
@@ -37,13 +37,16 @@ ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
|
||||
|
||||
if (rank == 0) {
|
||||
*u = -1;
|
||||
*d0 = nranks > 1 ? bit >> 1 : -1;
|
||||
*d1 = -1;
|
||||
*d0 = -1;
|
||||
// Child rank is > 0 so it has to be our child 1, not 0.
|
||||
*d1 = nranks > 1 ? bit >> 1 : -1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
up = (rank ^ bit) | (bit << 1);
|
||||
// If our rank is smaller than the parent's, we are its first child; otherwise we are its second.
|
||||
if (up >= nranks) up = (rank ^ bit);
|
||||
*parentChildType = (rank < up) ? 0 : 1;
|
||||
*u = up;
|
||||
|
||||
int lowbit = bit >> 1;
|
||||
@@ -62,42 +65,42 @@ ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
|
||||
}
|
||||
|
||||
/* Build a double binary tree. Take the previous tree for the first tree.
|
||||
* For the second tree, we use a mirror tree (if nranks is odd)
|
||||
* For the second tree, we use a mirror tree (if nranks is even)
|
||||
*
|
||||
* 8---------0---------5
|
||||
* ______/ \______ _____/ \______
|
||||
* 4 12 1 9
|
||||
* / \ / \ / \
|
||||
* 2 6 10 3 7 10
|
||||
* / \ / \ / \ / \ / \ / \
|
||||
* 1 3 5 7 9 11 2 4 6 8 11 12
|
||||
* 0---------------8 3----------------11
|
||||
* ______/ \ / \______
|
||||
* 4 \ / 7
|
||||
* / \ \ / / \
|
||||
* 2 6 10 1 5 9
|
||||
* / \ / \ / \ / \ / \ / \
|
||||
* 1 3 5 7 9 11 0 2 4 6 8 10
|
||||
*
|
||||
* or shift it by one rank (if nranks is even)
|
||||
* or shift it by one rank (if nranks is odd).
|
||||
*
|
||||
* 8---------0--------------9
|
||||
* ______/ \ ______/ \
|
||||
* 4 \ 5 \
|
||||
* / \ \ / \ \
|
||||
* 2 6 10 3 7 11
|
||||
* / \ / \ / \ / \ / \ / \
|
||||
* 1 3 5 7 9 11 2 4 6 8 10 1
|
||||
* 0---------------8 1---------------9
|
||||
* ______/ \______ ______/ \______
|
||||
* 4 12 5 0
|
||||
* / \ / / \ /
|
||||
* 2 6 10 3 7 11
|
||||
* / \ / \ / \ / \ / \ / \
|
||||
* 1 3 5 7 9 11 2 4 6 8 10 12
|
||||
*/
|
||||
ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* s1, int* d1_0, int* d1_1) {
|
||||
ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) {
|
||||
// First tree ... use a btree
|
||||
ncclGetBtree(nranks, rank, s0, d0_0, d0_1);
|
||||
ncclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0);
|
||||
// Second tree ... mirror or shift
|
||||
if (nranks % 2 == 0) {
|
||||
if (nranks % 2 == 1) {
|
||||
// shift
|
||||
int shiftrank = (rank-1+nranks) % nranks;
|
||||
int u, d0, d1;
|
||||
ncclGetBtree(nranks, shiftrank, &u, &d0, &d1);
|
||||
ncclGetBtree(nranks, shiftrank, &u, &d0, &d1, parentChildType1);
|
||||
*s1 = u == -1 ? -1 : (u+1) % nranks;
|
||||
*d1_0 = d0 == -1 ? -1 : (d0+1) % nranks;
|
||||
*d1_1 = d1 == -1 ? -1 : (d1+1) % nranks;
|
||||
} else {
|
||||
// mirror
|
||||
int u, d0, d1;
|
||||
ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1);
|
||||
ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1, parentChildType1);
|
||||
*s1 = u == -1 ? -1 : nranks-1-u;
|
||||
*d1_0 = d0 == -1 ? -1 : nranks-1-d0;
|
||||
*d1_1 = d1 == -1 ? -1 : nranks-1-d1;
|
||||
|
||||
+67
-21
@@ -71,45 +71,66 @@ static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
|
||||
};
|
||||
|
||||
// LL128 max BW (per channel) for the different collectives
|
||||
// ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce
|
||||
static const double ll128MaxBwPerCh[NCCL_NUM_FUNCTIONS] = { 18.8, 12.0, 18.3, 15.2, 16.7 };
|
||||
// ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce
|
||||
static const double ll128MaxBwPerCh[NCCL_NUM_FUNCTIONS] = { 18.8, 12.0, 18.3, 15.2, 16.9 };
|
||||
static const double llMaxBws[2][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0} };
|
||||
static const double perChMaxTreeBws[2][3] = { /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 22.5, 16.0} };
|
||||
|
||||
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
|
||||
int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_MAX_NTHREADS;
|
||||
int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
|
||||
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
|
||||
#else
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
|
||||
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
|
||||
#endif
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] =
|
||||
getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
|
||||
|
||||
if (comm->nRanks <= 1) return ncclSuccess;
|
||||
int nNodes = comm->nNodes;
|
||||
int nRanks = comm->nRanks;
|
||||
if (nRanks <= 1) return ncclSuccess;
|
||||
|
||||
int compCap80 = minCompCap == 80 && maxCompCap == 80 ? 1 : 0;
|
||||
float ppn = (float)comm->nRanks / comm->nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
|
||||
int cpuArch, cpuVendor, cpuModel;
|
||||
NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
|
||||
int index2 = nNodes <= 2 ? nNodes-1 : 2;
|
||||
// LL: for single node, we look at GPU type; for multi-node, we look at CPU type
|
||||
int index1 = nNodes == 1 ? compCap80 : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
|
||||
double llMaxBw = llMaxBws[index1][index2];
|
||||
double perChMaxTreeBw = perChMaxTreeBws[compCap80][index2];
|
||||
float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
|
||||
|
||||
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph };
|
||||
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = comm->nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
|
||||
|
||||
for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
|
||||
int nsteps = coll == ncclCollAllReduce ? 2*(comm->nRanks-1) :
|
||||
coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nRanks-1 :
|
||||
comm->nRanks;
|
||||
int nInterSteps = coll == ncclCollAllReduce ? 2*(comm->nNodes-1) :
|
||||
coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nNodes-1 :
|
||||
comm->nNodes;
|
||||
int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
|
||||
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
|
||||
nRanks;
|
||||
int nInterSteps = coll == ncclFuncAllReduce ? 2*(nNodes-1) :
|
||||
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 :
|
||||
nNodes;
|
||||
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
||||
if (coll != ncclCollAllReduce && a != NCCL_ALGO_RING) continue;
|
||||
if (coll != ncclFuncAllReduce && a != NCCL_ALGO_RING) continue;
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
float speed = comm->nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
|
||||
float speed = nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
|
||||
float busBw = graphs[a]->nChannels * speed;
|
||||
|
||||
// Various model refinements
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/5.0;
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
|
||||
double maxTreeBw = comm->nNodes > 2 ?
|
||||
@@ -118,21 +139,29 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.27, comm->nNodes > 1 ? 70.0 : 90.0);
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/2.3;
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (comm->nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels*7.0/9.0);
|
||||
#else
|
||||
if (compCap80) busBw = std::min(busBw, 235.0f);
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
|
||||
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
|
||||
#endif
|
||||
if (a == NCCL_ALGO_COLLNET) busBw *= .9;
|
||||
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/6.0; // Take into account that GDR read is disabled on both sides
|
||||
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0; // CollNet does not support LL128
|
||||
|
||||
// Convert bus BW to algorithm BW
|
||||
float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * comm->nRanks) / nsteps;
|
||||
float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps;
|
||||
comm->bandwidths[coll][a][p] = busBw * ratio;
|
||||
|
||||
comm->latencies[coll][a][p] = baseLat[a][p];
|
||||
float intraLat = hwLat[intraHw[a]][a][p];
|
||||
float interLat = hwLat[NCCL_HW_NET][a][p];
|
||||
if (comm->nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
|
||||
if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
|
||||
if (a == NCCL_ALGO_RING) {
|
||||
float lat = hwLat[hw[a]][a][p];
|
||||
if ((coll == ncclCollReduce || coll == ncclCollBroadcast)) {
|
||||
if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
|
||||
if (ringGraph->sameChannels) {
|
||||
comm->latencies[coll][a][p] += lat;
|
||||
} else {
|
||||
@@ -144,10 +173,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
}
|
||||
} else if (a == NCCL_ALGO_TREE) {
|
||||
comm->latencies[coll][a][p] +=
|
||||
2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat);
|
||||
2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
|
||||
} else {
|
||||
comm->latencies[coll][a][p] +=
|
||||
2 * (comm->nRanks/comm->nNodes-1) * intraLat + interLat;
|
||||
2 * (nRanks/nNodes-1) * intraLat + interLat;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -168,6 +197,15 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
|
||||
NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
|
||||
}
|
||||
// Disable CollNet if it is not supported
|
||||
if (comm->collNetSupport == 0) {
|
||||
algoEnable[NCCL_ALGO_COLLNET] = 0;
|
||||
// If user has hard set NCCL_ALGO=COLLNET, ignore it
|
||||
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0) {
|
||||
algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1;
|
||||
if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET");
|
||||
}
|
||||
}
|
||||
|
||||
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
int pEnable = protoEnable[p];
|
||||
@@ -178,7 +216,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
}
|
||||
if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
|
||||
// Only disable algo for Allreduce since others only have one
|
||||
if (c == ncclCollAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
|
||||
if (c == ncclFuncAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
|
||||
}
|
||||
|
||||
if (comm->rank == 0) {
|
||||
@@ -214,7 +252,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD;
|
||||
comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
|
||||
}
|
||||
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= comm->nRanks;
|
||||
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks;
|
||||
|
||||
// Override defaults with user env
|
||||
char* str = getenv("NCCL_THREAD_THRESHOLDS");
|
||||
@@ -263,8 +301,16 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
|
||||
*time = -1.0; return ncclSuccess;
|
||||
}
|
||||
int logSize = log2i(info->nBytes>>6);
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
if (algorithm == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[protocol][logSize];
|
||||
else if (algorithm == NCCL_ALGO_RING && logSize < 22) bw *= ringCorrectionFactor[protocol][logSize];
|
||||
#else
|
||||
if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
|
||||
if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
|
||||
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
|
||||
&& info->coll == ncclFuncAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) lat *= 1.9; // Plateau effect of ring
|
||||
#endif
|
||||
*time = lat + (info->nBytes) / (1000 * bw);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
+39
-17
@@ -71,7 +71,7 @@ ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) {
|
||||
if (c == '=') {
|
||||
ptr[o] = '\0';
|
||||
if (value == NULL) {
|
||||
WARN("XML Parse : Unexpected value with name %s\n", ptr);
|
||||
WARN("XML Parse : Unexpected value with name %s", ptr);
|
||||
return ncclInternalError;
|
||||
}
|
||||
return xmlGetValue(file, value, last);
|
||||
@@ -137,7 +137,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
|
||||
// Re-read the name, we got '/' in the first call
|
||||
NCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
|
||||
if (c != '>') {
|
||||
WARN("XML Parse error : unexpected trailing %c in closing tag %s\n", c, node->name);
|
||||
WARN("XML Parse error : unexpected trailing %c in closing tag %s", c, node->name);
|
||||
return ncclInternalError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -150,7 +150,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
|
||||
while (c == ' ') {
|
||||
NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c));
|
||||
if (a == MAX_ATTR_COUNT) {
|
||||
INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)\n", MAX_ATTR_COUNT);
|
||||
INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)", MAX_ATTR_COUNT);
|
||||
// Actually we need to still consume the extra attributes so we have an extra one.
|
||||
} else a++;
|
||||
}
|
||||
@@ -178,7 +178,7 @@ ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* hea
|
||||
if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess;
|
||||
while (1) {
|
||||
if (xml->maxIndex == MAX_NODES) {
|
||||
WARN("Error : XML parser is limited to 1024 nodes\n");
|
||||
WARN("Error : XML parser is limited to 1024 nodes");
|
||||
return ncclInternalError;
|
||||
}
|
||||
struct ncclXmlNode* node = xml->nodes+xml->maxIndex;
|
||||
@@ -373,7 +373,7 @@ ncclResult_t ncclTopoSetAttrFromSys(struct ncclXmlNode* pciNode, const char* pat
|
||||
char strValue[MAX_STR_LEN];
|
||||
NCCLCHECK(ncclTopoGetStrFromSys(path, fileName, strValue));
|
||||
if (strValue[0] != '\0') { NCCLCHECK(xmlSetAttr(pciNode, attrName, strValue)); }
|
||||
TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s\n", path, fileName, attrName, strValue);
|
||||
TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s", path, fileName, attrName, strValue);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -579,7 +579,6 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
|
||||
NCCLCHECK(xmlGetAttrIndex(gpuNode, "dev", &index));
|
||||
if (index == -1) {
|
||||
if (nvmlDev == NULL) {
|
||||
//WARN("No NVML, trying to use CUDA instead");
|
||||
const char* busId;
|
||||
NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
|
||||
if (busId == NULL || hipDeviceGetByPCIBusId(&dev, busId) != hipSuccess) dev = -1;
|
||||
@@ -662,7 +661,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
|
||||
int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : 12;
|
||||
|
||||
if (maxNvLinks > 0 && nvmlDev == NULL) {
|
||||
WARN("No NVML device handle. Skipping nvlink detection.\n");
|
||||
WARN("No NVML device handle. Skipping nvlink detection.");
|
||||
maxNvLinks = 0;
|
||||
}
|
||||
|
||||
@@ -721,6 +720,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
|
||||
char* path;
|
||||
NCCLCHECK(getPciPath(busId, &path));
|
||||
NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass"));
|
||||
free(path);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -732,10 +732,14 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl
|
||||
struct ncclXmlNode* node;
|
||||
NCCLCHECK(ncclTopoGetPciNode(xml, busId, &node));
|
||||
NCCLCHECK(ncclTopoGetXmlFromSys(node, xml));
|
||||
NCCLCHECK(wrapNvmlSymbols());
|
||||
NCCLCHECK(wrapNvmlInit());
|
||||
nvmlDevice_t nvmlDev;
|
||||
if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
|
||||
nvmlDevice_t nvmlDev = NULL;
|
||||
static int nvmlInit = 0;
|
||||
if (nvmlInit == 0) {
|
||||
nvmlInit = (wrapNvmlSymbols() != ncclSuccess || wrapNvmlInit() != ncclSuccess) ? 2 : 1;
|
||||
}
|
||||
if (nvmlInit == 1) {
|
||||
if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
|
||||
}
|
||||
NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode));
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -778,12 +782,8 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
|
||||
for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--);
|
||||
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
strcpy(busId, pciSysPath+offset+1);
|
||||
NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", busId));
|
||||
if (parent == NULL) {
|
||||
NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent));
|
||||
NCCLCHECK(xmlSetAttr(parent, "busid", busId));
|
||||
NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
|
||||
}
|
||||
NCCLCHECK(ncclTopoGetPciNode(xml, busId, &parent));
|
||||
NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
|
||||
} else {
|
||||
// Virtual NIC, no PCI device, attach to first CPU
|
||||
NCCLCHECK(xmlFindTag(xml, "cpu", &parent));
|
||||
@@ -802,6 +802,28 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Recursively trims the XML tree rooted at `node`.
// A node carrying keep="1" is preserved as-is (with its whole subtree); the
// "keep" marker itself is stripped. Any other node has its children trimmed
// first and is then removed if no child survived.
ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node) {
  const char* keepAttr;
  NCCLCHECK(xmlGetAttr(node, "keep", &keepAttr));

  const bool marked = (keepAttr != NULL) && (strcmp(keepAttr, "1") == 0);
  if (marked) {
    NCCLCHECK(xmlUnsetAttr(node, "keep"));
    return ncclSuccess;
  }

  // Work on a snapshot of the child list: removing a child rewrites
  // node->subs and node->nSubs while we are still iterating.
  struct ncclXmlNode* children[MAX_SUBS];
  const int nChildren = node->nSubs;
  memcpy(children, node->subs, node->nSubs*sizeof(struct ncclXmlNode*));
  for (int c = 0; c < nChildren; c++) {
    NCCLCHECK(ncclTopoTrimXmlRec(children[c]));
  }

  // Nothing left underneath and not marked itself: drop this node too.
  if (node->nSubs == 0) {
    NCCLCHECK(xmlRemoveNode(node));
  }
  return ncclSuccess;
}
|
||||
// Trims the XML topology, starting the recursive walk at the first entry of
// xml->nodes (presumably the root node of the tree -- confirm against the loader).
ncclResult_t ncclTopoTrimXml(struct ncclXml* xml) {
  struct ncclXmlNode* first = xml->nodes;
  NCCLCHECK(ncclTopoTrimXmlRec(first));
  return ncclSuccess;
}
|
||||
|
||||
/**************************************************/
|
||||
/* Parser rules for the user-defined graph search */
|
||||
/**************************************************/
|
||||
|
||||
+40
-5
@@ -8,7 +8,7 @@
|
||||
#define XML_H_
|
||||
|
||||
// A few constraints to make the implementation easy
|
||||
#define MAX_STR_LEN 256
|
||||
#define MAX_STR_LEN 255
|
||||
#define MAX_ATTR_COUNT 16
|
||||
#define MAX_SUBS 32
|
||||
#define MAX_NODES 1024
|
||||
@@ -19,10 +19,10 @@
|
||||
#define NODE_TYPE_SINGLE 3
|
||||
|
||||
struct ncclXmlNode {
|
||||
char name[MAX_STR_LEN];
|
||||
char name[MAX_STR_LEN+1];
|
||||
struct {
|
||||
char key[MAX_STR_LEN];
|
||||
char value[MAX_STR_LEN];
|
||||
char key[MAX_STR_LEN+1];
|
||||
char value[MAX_STR_LEN+1];
|
||||
} attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params
|
||||
int nAttrs;
|
||||
int type;
|
||||
@@ -47,6 +47,9 @@ ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXm
|
||||
ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode);
|
||||
ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode);
|
||||
|
||||
/* Remove unneeded parts */
|
||||
ncclResult_t ncclTopoTrimXml(struct ncclXml* xml);
|
||||
|
||||
/**************/
|
||||
/* XML Struct */
|
||||
/* Functions */
|
||||
@@ -56,7 +59,7 @@ static ncclResult_t xmlGetAttrIndex(struct ncclXmlNode* node, const char* attrNa
|
||||
*index = -1;
|
||||
const int nAttrs = node->nAttrs;
|
||||
for (int a=0; a<nAttrs; a++) {
|
||||
if (strncmp(node->attrs[a].key, attrName, MAX_STR_LEN-1) == 0) {
|
||||
if (strncmp(node->attrs[a].key, attrName, MAX_STR_LEN) == 0) {
|
||||
*index = a;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -127,8 +130,10 @@ static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, c
|
||||
if (index == -1) {
|
||||
index = node->nAttrs++;
|
||||
strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
|
||||
node->attrs[index].key[MAX_STR_LEN] = '\0';
|
||||
}
|
||||
strncpy(node->attrs[index].value, value, MAX_STR_LEN);
|
||||
node->attrs[index].value[MAX_STR_LEN] = '\0';
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -138,8 +143,10 @@ static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* attrName
|
||||
if (index == -1) {
|
||||
index = node->nAttrs++;
|
||||
strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
|
||||
node->attrs[index].key[MAX_STR_LEN] = '\0';
|
||||
}
|
||||
snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
|
||||
node->attrs[index].value[MAX_STR_LEN] = '\0';
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -149,8 +156,22 @@ static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrNa
|
||||
if (index == -1) {
|
||||
index = node->nAttrs++;
|
||||
strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
|
||||
node->attrs[index].key[MAX_STR_LEN] = '\0';
|
||||
}
|
||||
snprintf(node->attrs[index].value, MAX_STR_LEN, "%g", value);
|
||||
node->attrs[index].value[MAX_STR_LEN] = '\0';
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Remove the attribute named attrName from node, if it is present.
//
// The attribute slot is located with xmlGetAttrIndex(), which leaves the
// index at -1 when no key matches; in that case nothing is modified.
// Otherwise every attribute stored after the matching slot is copied one
// slot towards the front (key and value), and the attribute count is
// reduced by one.
//
// Returns ncclSuccess on both the "removed" and the "not present" paths.
// NOTE(review): NCCLCHECK is defined elsewhere; presumably it returns early
// with the error code if the lookup fails - confirm.
static ncclResult_t xmlUnsetAttr(struct ncclXmlNode* node, const char* attrName) {
  int found;
  NCCLCHECK(xmlGetAttrIndex(node, attrName, &found));
  if (found != -1) {
    const int last = node->nAttrs - 1;
    // Close the gap: slot dst receives the contents of slot dst+1.
    // strcpy relies on keys/values being NUL-terminated strings.
    for (int dst = found; dst < last; ++dst) {
      strcpy(node->attrs[dst].key, node->attrs[dst+1].key);
      strcpy(node->attrs[dst].value, node->attrs[dst+1].value);
    }
    node->nAttrs = last;
  }
  return ncclSuccess;
}
|
||||
|
||||
@@ -199,6 +220,20 @@ static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent,
|
||||
s->parent = parent;
|
||||
if (parent) parent->subs[parent->nSubs++] = s;
|
||||
strncpy(s->name, subName, MAX_STR_LEN);
|
||||
s->name[MAX_STR_LEN] = '\0';
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Detach node from the XML tree.
//
// The node itself is only marked as unused (type set to NODE_TYPE_NONE).
// When it has a parent, the parent's subs[] array is compacted: every child
// pointer located after the removed node is moved one slot towards the
// front, and the parent's child count is decremented.
//
// Note that the child count is decremented whenever a parent exists, even
// if node was not encountered in the parent's subs[] array.
//
// Always returns ncclSuccess.
static ncclResult_t xmlRemoveNode(struct ncclXmlNode* node) {
  node->type = NODE_TYPE_NONE;

  struct ncclXmlNode* owner = node->parent;
  if (owner != NULL) {
    int found = 0;  // becomes 1 once node has been seen in owner->subs
    for (int idx = 0; idx < owner->nSubs; ++idx) {
      if (owner->subs[idx] == node) {
        found = 1;
      } else if (found) {
        // Past the removed entry: slide this child one slot down.
        owner->subs[idx-1] = owner->subs[idx];
      }
    }
    owner->nSubs -= 1;
  }
  return ncclSuccess;
}
|
||||
|
||||
|
||||
+120
-80
@@ -35,7 +35,6 @@ struct ncclInitArgs {
|
||||
};
|
||||
struct ncclCollArgs {
|
||||
ncclComm_t comm;
|
||||
int connect;
|
||||
};
|
||||
|
||||
enum ncclAsyncFuncType {
|
||||
@@ -110,6 +109,7 @@ ncclResult_t ncclAsyncColl(ncclComm_t comm) {
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupStart);
|
||||
ncclResult_t ncclGroupStart() {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
if (ncclGroupMode == 0) {
|
||||
memset(ncclGroupArgs, 0, sizeof(struct ncclAsyncArgs)*MAX_ASYNC_OPS);
|
||||
}
|
||||
@@ -118,7 +118,7 @@ ncclResult_t ncclGroupStart() {
|
||||
}
|
||||
|
||||
static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes, void* recvbuff, ssize_t sendbytes, const void* sendbuff) {
|
||||
struct ncclInfo info = { ncclCollSendRecv, "SendRecv",
|
||||
struct ncclInfo info = { ncclFuncSendRecv, "SendRecv",
|
||||
sendbuff, recvbuff, (size_t)std::max<ssize_t>(sendbytes,recvbytes), ncclInt8, ncclSum, -1, comm, comm->userStream, /* Args */
|
||||
1, 1 };
|
||||
info.delta = delta;
|
||||
@@ -126,26 +126,32 @@ static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int chann
|
||||
info.sendbytes = sendbytes;
|
||||
info.recvbytes = recvbytes;
|
||||
if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage;
|
||||
NCCLCHECK(ncclSaveKernel(&info));
|
||||
NCCLCHECK(ncclSaveP2pKernel(&info));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
void* ncclAsyncThreadPreconnect(void* args_) {
|
||||
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
|
||||
CUDACHECKTHREAD(hipSetDevice(args->coll.comm->cudaDev));
|
||||
for (int c=0; c<args->coll.comm->p2pnChannels; c++) {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
struct ncclP2PConnect* connect = &comm->p2plist.connect;
|
||||
NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, channel, connect->nrecv[c], connect->recv+c*comm->nRanks, connect->nsend[c], connect->send+c*comm->nRanks));
|
||||
connect->nrecv[c] = 0;
|
||||
connect->nsend[c] = 0;
|
||||
}
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
CUDACHECKTHREAD(hipSetDevice(comm->cudaDev));
|
||||
NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL));
|
||||
return args;
|
||||
}
|
||||
|
||||
static size_t getP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) {
|
||||
size_t size = std::max(minSize, DIVUP(totalSize, minChannels));
|
||||
int nChannels = minChannels;
|
||||
while (size > maxSize && nChannels <= maxChannels/2) {
|
||||
nChannels *= 2;
|
||||
size = DIVUP(totalSize, nChannels);
|
||||
}
|
||||
ALIGN_SIZE(size, minSize);
|
||||
return size;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupEnd);
|
||||
ncclResult_t ncclGroupEnd() {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
if (ncclGroupMode == 0) {
|
||||
WARN("ncclGroupEnd: not in a group call.");
|
||||
return ncclInvalidUsage;
|
||||
@@ -186,29 +192,21 @@ ncclResult_t ncclGroupEnd() {
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
struct ncclP2Plist* p2plist = &args->coll.comm->p2plist;
|
||||
if (p2plist->count != 0) {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
args->coll.connect = 0;
|
||||
for (int c=0; c<comm->p2pnChannels; c++)
|
||||
args->coll.connect += comm->p2plist.connect.nsend[c] + comm->p2plist.connect.nrecv[c];
|
||||
if (args->coll.connect) {
|
||||
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
|
||||
}
|
||||
}
|
||||
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
|
||||
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL && (args->coll.connect)) {
|
||||
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
|
||||
int err = pthread_join(ncclGroupThreads[i], NULL);
|
||||
if (err != 0) {
|
||||
WARN("Error waiting for pthread_join : %s\n", strerror(errno));
|
||||
WARN("Error waiting for pthread_join : %s", strerror(errno));
|
||||
return ncclSystemError;
|
||||
}
|
||||
NCCLCHECKGOTO(args->ret, ret, end);
|
||||
args->coll.comm->connect = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -218,56 +216,102 @@ ncclResult_t ncclGroupEnd() {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
int rank = comm->rank;
|
||||
int nRanks = comm->nRanks;
|
||||
struct ncclP2Plist* p2plist = &args->coll.comm->p2plist;
|
||||
if (p2plist->count) {
|
||||
for (int delta=0; delta<nRanks; delta++) {
|
||||
struct ncclP2Plist* p2pSends = comm->p2pSends;
|
||||
struct ncclP2Plist* p2pRecvs = comm->p2pRecvs;
|
||||
|
||||
// Compute how much to split operations
|
||||
// Natural step size matching buffer steps.
|
||||
ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
|
||||
// Try to use all channels
|
||||
int nChannelsMax = comm->p2pnChannelsPerPeer;
|
||||
int nChannelsMin = nChannelsMax;
|
||||
// Try to use all channels, but one channel per operation.
|
||||
while (nChannelsMin*comm->nRanks > std::max(comm->nChannels, comm->p2pnChannels) && nChannelsMin > 1) nChannelsMin /= 2;
|
||||
// Avoid overloading channels with 8+ operations as we lose the sync warp, hence a bit of bandwidth.
|
||||
while (nChannelsMax*comm->nRanks > std::max(comm->nChannels, comm->p2pnChannels)*4 && nChannelsMax > 1) nChannelsMax /= 2;
|
||||
|
||||
while (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
|
||||
// schedule delta 0, +1, -1, +2, -2, ...
|
||||
// also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
|
||||
for (int d=0; d<=nRanks/4; d++) {
|
||||
int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, (nRanks-(nRanks/2-d))%nRanks };
|
||||
int index = 0;
|
||||
int delta = deltas[index];
|
||||
sched_delta:
|
||||
uint32_t from = (rank+nRanks-delta)%nRanks;
|
||||
uint32_t to = (rank+delta)%nRanks;
|
||||
struct ncclP2Pinfo* recv = p2pRecvs[from].head;
|
||||
struct ncclP2Pinfo* send = p2pSends[to].head;
|
||||
if (recv != NULL || send != NULL) {
|
||||
ssize_t totRecvBytes = -1, totSendBytes = -1;
|
||||
if (recv != NULL) totRecvBytes = recv->nbytes;
|
||||
if (send != NULL) totSendBytes = send->nbytes;
|
||||
ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
|
||||
ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
|
||||
|
||||
// Compute how much to split operations
|
||||
// Natural step size matching buffer steps.
|
||||
ssize_t stepSize = 4*comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
|
||||
// Split each operation on p2pnChannelsPerPeer max.
|
||||
ssize_t recvChunkSize = DIVUP(p2plist->peerlist[from].recvbytes, comm->p2pnChannelsPerPeer);
|
||||
ssize_t sendChunkSize = DIVUP(p2plist->peerlist[to].sendbytes, comm->p2pnChannelsPerPeer);
|
||||
recvChunkSize = std::max((ssize_t)1, DIVUP(recvChunkSize, stepSize)) * stepSize;
|
||||
sendChunkSize = std::max((ssize_t)1, DIVUP(sendChunkSize, stepSize)) * stepSize;
|
||||
|
||||
ssize_t sendOffset = 0;
|
||||
ssize_t recvOffset = 0;
|
||||
int remaining = 1;
|
||||
int chunk = 0;
|
||||
while (remaining) {
|
||||
int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
|
||||
remaining = 0;
|
||||
ssize_t recvbytes = p2plist->peerlist[from].recvbytes-recvOffset;
|
||||
ssize_t sendbytes = p2plist->peerlist[to].sendbytes-sendOffset;
|
||||
if (recvbytes > recvChunkSize) { remaining = 1; recvbytes = recvChunkSize; } else p2plist->peerlist[from].recvbytes = -1;
|
||||
if (sendbytes > sendChunkSize) { remaining = 1; sendbytes = sendChunkSize; } else p2plist->peerlist[to].sendbytes = -1;
|
||||
if (sendbytes >= 0 || recvbytes >= 0) {
|
||||
NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
|
||||
recvbytes, ((char*)(p2plist->peerlist[from].recvbuff)) + recvOffset,
|
||||
sendbytes, ((const char*)(p2plist->peerlist[to].sendbuff)) + sendOffset), ret, end);
|
||||
ssize_t sendOffset = 0;
|
||||
ssize_t recvOffset = 0;
|
||||
int sendRemaining = 1, recvRemaining = 1;
|
||||
int chunk = 0;
|
||||
do {
|
||||
int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
|
||||
ssize_t recvbytes = totRecvBytes-recvOffset;
|
||||
ssize_t sendbytes = totSendBytes-sendOffset;
|
||||
if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
|
||||
if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
|
||||
// 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
|
||||
// (total size == 0), otherwise set size to -1 so that the kernel skips the operation.
|
||||
if (sendbytes == 0 && totSendBytes != 0) sendbytes = -1;
|
||||
if (recvbytes == 0 && totRecvBytes != 0) recvbytes = -1;
|
||||
if (sendbytes >= 0 || recvbytes >= 0) {
|
||||
NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
|
||||
recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
|
||||
sendbytes, send ? ((const char*)(send->buff)) + sendOffset : NULL), ret, group_cleanup);
|
||||
}
|
||||
recvOffset += recvChunkSize;
|
||||
sendOffset += sendChunkSize;
|
||||
chunk++;
|
||||
} while (sendRemaining || recvRemaining);
|
||||
if (recv) {
|
||||
NCCLCHECKGOTO(dequeueP2pInfo(p2pRecvs+from), ret, group_cleanup);
|
||||
comm->p2pRecvCount--;
|
||||
}
|
||||
recvOffset += recvChunkSize;
|
||||
sendOffset += sendChunkSize;
|
||||
chunk++;
|
||||
if (send) {
|
||||
NCCLCHECKGOTO(dequeueP2pInfo(p2pSends+to), ret, group_cleanup);
|
||||
comm->p2pSendCount--;
|
||||
}
|
||||
}
|
||||
index++;
|
||||
if (index == 1 && deltas[1] == deltas[0]) index++;
|
||||
if (index == 2 && deltas[2] == deltas[0]) index++;
|
||||
if (index == 3 && deltas[3] == deltas[2]) index++;
|
||||
if (index == 3 && deltas[3] == deltas[1]) index++;
|
||||
if (index < 4) {
|
||||
delta = deltas[index];
|
||||
goto sched_delta;
|
||||
}
|
||||
}
|
||||
p2plist->count = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Collectives are done in three steps :
|
||||
* 0. Save kernels previously enqueued. Compute channel, algo, proto, etc.
|
||||
* 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative]
|
||||
* 2. Barrier Wait. No CUDA call is permitted
|
||||
* 3. Enqueue Events. CUDA event wait/enqueue.
|
||||
* This is needed because step 2 cannot call any CUDA primitive, otherwise if
|
||||
* hipFree happens between 1 and 3, it could block that CUDA call and
|
||||
* cudaFree happens between 1 and 3, it could block that CUDA call and
|
||||
* prevent some ranks from launching their network threads, which would
|
||||
* prevent the NCCL call from completing, blocking the hipFree call.
|
||||
* prevent the NCCL call from completing, blocking the cudaFree call.
|
||||
*/
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
ncclComm_t comm = args->coll.comm;
|
||||
NCCLCHECKGOTO(ncclSaveCommKernels(comm), ret, group_cleanup);
|
||||
}
|
||||
}
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
@@ -304,32 +348,28 @@ group_cleanup:
|
||||
*args->init.newcomm = NULL;
|
||||
} else {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
for (int i=0; i<channel->collCount; i++) {
|
||||
channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
|
||||
// Reset aggregation counters
|
||||
comm->asyncOpCount = 0;
|
||||
comm->asyncTotalSize = 0;
|
||||
// Dequeue p2p lists
|
||||
if (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
|
||||
struct ncclP2Plist* p2pSends = comm->p2pSends;
|
||||
struct ncclP2Plist* p2pRecvs = comm->p2pRecvs;
|
||||
for (int peer=0; peer<comm->nRanks; peer++) {
|
||||
while (p2pSends[peer].head != NULL) dequeueP2pInfo(p2pSends+peer);
|
||||
while (p2pRecvs[peer].head != NULL) dequeueP2pInfo(p2pRecvs+peer);
|
||||
}
|
||||
channel->collFifoTail = channel->collStart;
|
||||
channel->collCount = 0;
|
||||
comm->p2pSendCount = comm->p2pRecvCount = 0;
|
||||
}
|
||||
/* Cancel all proxy ops : mark them as ncclProxyOpNone and they should be freed later on */
|
||||
/* Free all proxy ops in state->nextOps */
|
||||
struct ncclProxyState* state = &comm->proxyState;
|
||||
struct ncclProxyArgs *op, *start;
|
||||
pthread_mutex_lock(&state->mutex);
|
||||
op = start = state->ops;
|
||||
while (op) {
|
||||
if (op->opCount >= comm->lastOpCount) op->state = ncclProxyOpNone;
|
||||
struct ncclProxyArgs* peerOp = op->nextPeer;
|
||||
while (peerOp) {
|
||||
if (peerOp->opCount >= comm->lastOpCount) peerOp->state = ncclProxyOpNone;
|
||||
peerOp = peerOp->nextPeer;
|
||||
}
|
||||
op = op->next;
|
||||
if (op == start) break;
|
||||
pthread_mutex_lock(&state->poolMutex);
|
||||
for (struct ncclProxyArgs *op = state->nextOps; op; op = op->next) {
|
||||
op->next = state->pool;
|
||||
state->pool = op;
|
||||
}
|
||||
comm->opCount = comm->lastOpCount;
|
||||
pthread_cond_signal(&state->cond);
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
pthread_mutex_unlock(&state->poolMutex);
|
||||
state->nextOps = NULL;
|
||||
|
||||
comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
|
||||
comm->userStreamSet = false;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -16,6 +16,8 @@ ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commSt
|
||||
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
|
||||
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
|
||||
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
|
||||
ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr);
|
||||
ncclResult_t bootstrapRemFree(int id, int rank, void* commState);
|
||||
ncclResult_t bootstrapClose(void* commState);
|
||||
ncclResult_t bootstrapAbort(void* commState);
|
||||
#endif
|
||||
|
||||
@@ -24,7 +24,7 @@ static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, voi
|
||||
static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
|
||||
NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t collNetFlush(void* collComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclCollNet->flush(collComm, data, size, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetIflush(void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; }
|
||||
static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
|
||||
|
||||
@@ -8,63 +8,67 @@
|
||||
#ifndef NCCL_COLLECTIVES_H_
|
||||
#define NCCL_COLLECTIVES_H_
|
||||
|
||||
#define FUNC_INDEX_P2P (4+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps)
|
||||
#define FUNC_INDEX(coll, redop, dtype, al, pr) ((coll >= NCCL_NUM_FUNCTIONS) \
|
||||
? (coll-NCCL_NUM_FUNCTIONS+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps) \
|
||||
: ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)))
|
||||
#define FUNC_INDEX_P2P (NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps)
|
||||
#define FUNC_INDEX_A2A (FUNC_INDEX_P2P+1)
|
||||
#define FUNC_INDEX_A2AV (FUNC_INDEX_P2P+2)
|
||||
|
||||
#define NCCL_COLL_NAME(coll, op, dtype) \
|
||||
coll##_##op##_##dtype
|
||||
#define FUNC_INDEX(func, redop, ncclType, al, pr) ((func >= NCCL_NUM_FUNCTIONS) \
|
||||
? (func-NCCL_NUM_FUNCTIONS+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps) \
|
||||
: ((((((func)*ncclNumOps + (redop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)))
|
||||
|
||||
#define NCCL_KERN_NAME(coll, op, dtype) \
|
||||
coll##Kernel_##op##_##dtype
|
||||
#define NCCL_FUNC_NAME(func, algo, proto, redop, type) \
|
||||
ncclFunction_##func##_##algo##_##proto##_##redop##_##type
|
||||
|
||||
#define NCCL_KERN_NAME(func, algo, proto, redop, type) \
|
||||
ncclKernel_##func##_##algo##_##proto##_##redop##_##type
|
||||
|
||||
#define NCCL_IMPL_NAME(func, algo, proto) \
|
||||
nccl##func##algo##proto
|
||||
|
||||
/* Declare all collective operations */
|
||||
#define DECL_COLL5(coll, op, dtype) \
|
||||
extern __device__ __attribute__((noinline)) void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
|
||||
extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclDevComm* comm); \
|
||||
#define DECL5(func, algo, proto, redop, type) \
|
||||
extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, redop, type)(struct ncclWorkElem* args); \
|
||||
extern __global__ void NCCL_KERN_NAME(func, algo, proto, redop, type)(struct ncclWorkElem first); \
|
||||
|
||||
#define DECL_COLL4(coll, op, dtype) \
|
||||
DECL_COLL5(coll, op, dtype) \
|
||||
DECL_COLL5(coll##LL, op, dtype) \
|
||||
DECL_COLL5(coll##LL128, op, dtype)
|
||||
#define DECL4(func, algo, redop, type) \
|
||||
DECL5(func, algo, SIMPLE, redop, type) \
|
||||
DECL5(func, algo, LL, redop, type) \
|
||||
DECL5(func, algo, LL128, redop, type)
|
||||
|
||||
#define DECL_COLL3(coll, op, dtype) \
|
||||
DECL_COLL4(coll##Ring, op, dtype) \
|
||||
DECL_COLL4(coll##Tree, op, dtype) \
|
||||
DECL_COLL4(coll##CollNet, op, dtype)
|
||||
#define DECL3(func, redop, type) \
|
||||
DECL4(func, RING, redop, type) \
|
||||
DECL4(func, TREE, redop, type) \
|
||||
DECL4(func, COLLNET, redop, type)
|
||||
|
||||
#define DECL_COLL2(coll, op) \
|
||||
DECL_COLL3(coll, op, i8) \
|
||||
DECL_COLL3(coll, op, u8) \
|
||||
DECL_COLL3(coll, op, i32) \
|
||||
DECL_COLL3(coll, op, u32) \
|
||||
DECL_COLL3(coll, op, i64) \
|
||||
DECL_COLL3(coll, op, u64) \
|
||||
DECL_COLL3(coll, op, f16) \
|
||||
DECL_COLL3(coll, op, f32) \
|
||||
DECL_COLL3(coll, op, f64) \
|
||||
DECL_COLL3(coll, op, b16)
|
||||
#define DECL2(func, redop) \
|
||||
DECL3(func, redop, int8_t) \
|
||||
DECL3(func, redop, uint8_t) \
|
||||
DECL3(func, redop, int32_t) \
|
||||
DECL3(func, redop, uint32_t) \
|
||||
DECL3(func, redop, int64_t) \
|
||||
DECL3(func, redop, uint64_t) \
|
||||
DECL3(func, redop, half) \
|
||||
DECL3(func, redop, float) \
|
||||
DECL3(func, redop, double) \
|
||||
DECL3(func, redop, rccl_bfloat16)
|
||||
|
||||
#define DECL_COLL(coll) \
|
||||
DECL_COLL2(coll, sum) \
|
||||
DECL_COLL2(coll, prod) \
|
||||
DECL_COLL2(coll, min) \
|
||||
DECL_COLL2(coll, max)
|
||||
#define DECL(func) \
|
||||
DECL2(func, Sum) \
|
||||
DECL2(func, Prod) \
|
||||
DECL2(func, Min) \
|
||||
DECL2(func, Max)
|
||||
|
||||
#define DECL_ALL_COLLS \
|
||||
DECL_COLL2(ncclBroadcast, copy) \
|
||||
DECL_COLL(ncclReduce) \
|
||||
DECL_COLL2(ncclAllGather, copy) \
|
||||
DECL_COLL(ncclReduceScatter) \
|
||||
DECL_COLL(ncclAllReduce) \
|
||||
DECL_COLL5(ncclGather, copy, i8) \
|
||||
DECL_COLL5(ncclScatter, copy, i8) \
|
||||
DECL_COLL5(ncclAllToAll, copy, i8) \
|
||||
DECL_COLL5(ncclAllToAllv, copy, i8) \
|
||||
DECL_COLL5(ncclSendRecv, copy, i8) \
|
||||
#define DECL_ALL \
|
||||
DECL2(Broadcast, Sum) \
|
||||
DECL(Reduce) \
|
||||
DECL2(AllGather, Sum) \
|
||||
DECL(ReduceScatter) \
|
||||
DECL(AllReduce) \
|
||||
DECL5(SendRecv, RING, SIMPLE, Sum, int8_t) \
|
||||
DECL5(AllToAll, RING, SIMPLE, Sum, int8_t) \
|
||||
DECL5(AllToAllv, RING, SIMPLE, Sum, int8_t) \
|
||||
|
||||
DECL_ALL_COLLS
|
||||
DECL_ALL
|
||||
|
||||
// CHUNKSIZE must be a multiple of SLICESIZE
|
||||
//#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
|
||||
@@ -92,5 +96,4 @@ DECL_ALL_COLLS
|
||||
#define ALLTOALL_CHUNKSTEPS 4
|
||||
#define ALLTOALLV_SLICESTEPS 4
|
||||
#define ALLTOALLV_CHUNKSTEPS 4
|
||||
|
||||
#endif
|
||||
|
||||
@@ -55,8 +55,8 @@ struct ncclRecvMem {
|
||||
struct {
|
||||
uint64_t tail;
|
||||
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
||||
char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
||||
int sizesFifo[NCCL_STEPS];
|
||||
void* ptrsFifo[NCCL_STEPS];
|
||||
};
|
||||
char pad4[MEM_ALIGN];
|
||||
};
|
||||
@@ -70,6 +70,10 @@ struct ncclComm {
|
||||
struct ncclTopoSystem* topo;
|
||||
|
||||
void* bootstrap;
|
||||
// Bitmasks for ncclTransportP2pSetup
|
||||
int connect;
|
||||
uint32_t* connectSend;
|
||||
uint32_t* connectRecv;
|
||||
|
||||
int rank; // my rank in the communicator
|
||||
int nRanks; // number of GPUs in communicator
|
||||
@@ -134,8 +138,8 @@ struct ncclComm {
|
||||
int* intraCudaDevs;
|
||||
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
|
||||
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
|
||||
struct ncclDevComm* args;
|
||||
struct ncclDevComm** argsptr;
|
||||
struct ncclWorkElem args;
|
||||
void* argsptr;
|
||||
|
||||
// Global proxy thread
|
||||
pthread_t proxyThread;
|
||||
@@ -143,8 +147,17 @@ struct ncclComm {
|
||||
|
||||
// Whether this communicator uses collNet
|
||||
int collNetSupport;
|
||||
|
||||
// Store info of async operations
|
||||
struct ncclInfo* asyncOps;
|
||||
int asyncOpCount;
|
||||
size_t asyncTotalSize;
|
||||
|
||||
// Lists of async p2p operations queued under group semantics
|
||||
struct ncclP2Plist p2plist;
|
||||
struct ncclP2Plist* p2pSends;
|
||||
struct ncclP2Plist* p2pRecvs;
|
||||
int p2pSendCount;
|
||||
int p2pRecvCount;
|
||||
|
||||
// [RCCL]
|
||||
bool alltoallDisable; // RCCL AllToAll/Scatter/Gather API
|
||||
|
||||
@@ -57,5 +57,6 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
|
||||
#include "alloc.h"
|
||||
#include "utils.h"
|
||||
#include "param.h"
|
||||
#include "nvtx_stub.h"
|
||||
|
||||
#endif // end include guard
|
||||
|
||||
@@ -19,7 +19,7 @@ static int hexToInt(char c) {
|
||||
|
||||
#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
|
||||
|
||||
ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
|
||||
static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
|
||||
uint32_t cpumasks[CPU_SET_N_U32];
|
||||
int m = CPU_SET_N_U32-1;
|
||||
cpumasks[m] = 0;
|
||||
@@ -42,7 +42,7 @@ ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
|
||||
static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
|
||||
int c = 0;
|
||||
uint8_t* m8 = (uint8_t*)mask;
|
||||
for (int o=sizeof(cpu_set_t)-1; o>=0; o--) {
|
||||
|
||||
@@ -25,7 +25,7 @@ extern pthread_mutex_t ncclDebugOutputLock;
|
||||
extern FILE *ncclDebugFile;
|
||||
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
|
||||
|
||||
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
|
||||
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
|
||||
|
||||
// Let code temporarily downgrade WARN into INFO
|
||||
extern thread_local int ncclDebugNoWarn;
|
||||
|
||||
+62
-64
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -27,8 +27,8 @@
|
||||
|
||||
|
||||
#define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now
|
||||
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollGather, ncclCollScatter, ncclCollAllToAll, ncclCollAllToAllv, ncclCollSendRecv} ncclFunc_t;
|
||||
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+4];
|
||||
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncAllToAll, ncclFuncAllToAllv } ncclFunc_t;
|
||||
extern const char* ncclFuncStr[];
|
||||
|
||||
#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
|
||||
#define NCCL_ALGO_TREE 0
|
||||
@@ -64,6 +64,7 @@ union ncclLLFifoLine {
|
||||
#define WARP_SIZE 64
|
||||
#define MAXCHANNELS 32
|
||||
#define NCCL_MAX_NTHREADS 256
|
||||
#define NCCL_SIMPLE_MAX_NTHREADS NCCL_MAX_NTHREADS
|
||||
#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
|
||||
#define NCCL_LL_LINES_PER_THREAD 8
|
||||
#ifdef TEST_LL_CLEANUP
|
||||
@@ -77,7 +78,7 @@ union ncclLLFifoLine {
|
||||
// Make sure the clean mask will last for at least NCCL_NSTEPS
|
||||
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
|
||||
|
||||
#define NCCL_LL128_LINESIZE 64
|
||||
#define NCCL_LL128_LINESIZE 128
|
||||
#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
|
||||
#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)
|
||||
|
||||
@@ -88,15 +89,12 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
|
||||
// to 3 dests. Use 70% for reduce and 30% for bcast.
|
||||
#define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32)
|
||||
|
||||
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
|
||||
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 2
|
||||
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
|
||||
|
||||
#define NCCL_DIRECT_GPU 0x01
|
||||
#define NCCL_DIRECT_NIC 0x10
|
||||
|
||||
#define MAXBARRIERS 2
|
||||
#define MAXWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
|
||||
|
||||
struct ncclConnInfo {
|
||||
// Regular comm mechanism
|
||||
char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
|
||||
@@ -104,9 +102,11 @@ struct ncclConnInfo {
|
||||
uint64_t *head; // Local for send, remote for recv
|
||||
|
||||
int direct; // Direct communication
|
||||
int shared; // Buffers are shared
|
||||
void **ptrExchange; // Pointer exchange for direct communication
|
||||
|
||||
int *fifo; // Size fifo for proxy
|
||||
int *sizesFifo; // Sizes fifo from GPU to proxy
|
||||
void* *ptrsFifo; // Buffer fifo from proxy to GPU
|
||||
|
||||
uint64_t step; // Keep where we are
|
||||
uint64_t llLastCleaning;
|
||||
@@ -115,7 +115,7 @@ struct ncclConnInfo {
|
||||
// allows software to explicitly initiate a flush read to HDP memory. See more
|
||||
// descriptions in primitives.h.
|
||||
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
|
||||
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
uint32_t* curr_hdp_reg; // Current GPU's HDP register
|
||||
};
|
||||
|
||||
struct ncclConnector {
|
||||
@@ -156,79 +156,69 @@ struct ncclDevComm;
|
||||
|
||||
#pragma pack(push) /* push current alignment to stack */
|
||||
#pragma pack(4) /* set alignment to 4 bytes boundary */
|
||||
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
|
||||
/* to make sure reads to host from the CUDA kernel are aligned. */
|
||||
/* Make sure to adjust padding at the end of ncclColl. */
|
||||
struct CollectiveArgs {
|
||||
struct ncclDevComm* comm;
|
||||
uint64_t opCount;
|
||||
#define NCCL_MAX_WORK_ELEMENTS 2
|
||||
#define NCCL_MAX_GROUPS (NCCL_MAX_WORK_ELEMENTS*2)
|
||||
|
||||
/* ncclWork is to be a power of two, currently 8x64 bytes, */
|
||||
/* to make sure reads to host from the CUDA kernel are aligned. */
|
||||
/* Make sure to adjust padding at the end of ncclWorkElem. */
|
||||
struct ncclWorkElem {
|
||||
// Header
|
||||
struct ncclDevComm* comm;
|
||||
uint16_t nThreads;
|
||||
uint16_t funcIndex;
|
||||
uint16_t index;
|
||||
uint16_t active;
|
||||
|
||||
// local and remote input, output, and buffer
|
||||
const void * sendbuff;
|
||||
void * recvbuff;
|
||||
|
||||
// Op-specific fields. Make sure the common part stays the
|
||||
// same on all structs of the union
|
||||
uint64_t opCount;
|
||||
// Op-specific fields.
|
||||
union {
|
||||
struct {
|
||||
uint16_t nThreads;
|
||||
} common;
|
||||
struct {
|
||||
uint16_t nThreads;
|
||||
uint8_t bid;
|
||||
uint8_t nChannels;
|
||||
uint32_t root;
|
||||
size_t count;
|
||||
size_t lastChunkSize;
|
||||
uint32_t root;
|
||||
uint8_t bid;
|
||||
uint8_t nChannels;
|
||||
} coll;
|
||||
struct {
|
||||
uint16_t nThreads;
|
||||
uint16_t unused;
|
||||
int32_t delta;
|
||||
size_t sendCount;
|
||||
size_t recvCount;
|
||||
int32_t delta;
|
||||
uint16_t nThreads;
|
||||
} p2p;
|
||||
struct {
|
||||
uint16_t nThreads;
|
||||
size_t count;
|
||||
uint8_t bid;
|
||||
uint8_t nChannels;
|
||||
size_t count;
|
||||
size_t* extra;
|
||||
} a2av;
|
||||
// [RCCL] Clique-based arguments
|
||||
// NOTE: Follows same field structure as coll
|
||||
// because nChannels is accessed from "coll" struct.
|
||||
struct {
|
||||
uint16_t nThreads;
|
||||
size_t count;
|
||||
cliqueDevicePtrs_t* ptrs;
|
||||
uint32_t unused;
|
||||
uint8_t bid;
|
||||
uint8_t nChannels;
|
||||
size_t count;
|
||||
int verbose;
|
||||
cliqueDevicePtrs_t* ptrs;
|
||||
} clique;
|
||||
// [/RCCL]
|
||||
uint64_t align[3];
|
||||
};
|
||||
};
|
||||
|
||||
struct ncclColl {
|
||||
union {
|
||||
struct {
|
||||
struct CollectiveArgs args;
|
||||
uint16_t funcIndex;
|
||||
uint16_t nextIndex;
|
||||
uint8_t active;
|
||||
};
|
||||
int data[0x10];
|
||||
};
|
||||
struct ncclWork {
|
||||
struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
|
||||
};
|
||||
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
|
||||
static_assert(sizeof(struct ncclWorkElem) == (0x10*sizeof(int)), "ncclWorkElem must have a pow2 size");
|
||||
|
||||
struct ncclChannel {
|
||||
union {
|
||||
struct {
|
||||
struct ncclRing ring;
|
||||
struct ncclTree treeUp;
|
||||
struct ncclTree treeDn;
|
||||
struct ncclTree collTreeUp;
|
||||
struct ncclTree collTreeDn;
|
||||
struct ncclTree tree;
|
||||
struct ncclTree collTree;
|
||||
|
||||
int id;
|
||||
|
||||
@@ -237,16 +227,11 @@ struct ncclChannel {
|
||||
struct ncclPeer* devPeers;
|
||||
|
||||
// Operation list for aggregation
|
||||
struct ncclColl* collectives;
|
||||
size_t* collectivesExtra;
|
||||
int collStart;
|
||||
int collCount;
|
||||
int collFifoHead; // Only used by GPU
|
||||
int collFifoTail; // Only used by CPU
|
||||
struct ncclWork* workFifo;
|
||||
int workCount;
|
||||
uint64_t workFifoTail; // Only used by CPU
|
||||
size_t* a2avParams;
|
||||
|
||||
uint32_t* sync;
|
||||
uint64_t* barrier;
|
||||
uint64_t* barrier_next;
|
||||
#ifdef ENABLE_PROFILING
|
||||
struct timeval tvs;
|
||||
uint64_t sizes;
|
||||
@@ -304,9 +289,11 @@ struct ncclProf {
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
typedef enum {
|
||||
ncclCollTraceNotReady,
|
||||
ncclCollTraceKernelLaunchType,
|
||||
ncclCollTraceCollEndType,
|
||||
ncclCollTraceAbortType
|
||||
ncclCollTraceAbortType,
|
||||
ncclCollTraceDataType
|
||||
} ncclCollTraceDataType_t;
|
||||
|
||||
struct ncclCollTrace {
|
||||
@@ -316,11 +303,22 @@ struct ncclCollTrace {
|
||||
uint32_t data_0;
|
||||
uint64_t timeStamp;
|
||||
uint64_t opCount;
|
||||
uint64_t data_1;
|
||||
union {
|
||||
uint64_t data_1;
|
||||
struct {
|
||||
uint16_t nThreads;
|
||||
uint8_t bid;
|
||||
uint8_t nChannels;
|
||||
} coll;
|
||||
struct {
|
||||
uint16_t nThreads;
|
||||
uint16_t delta;
|
||||
} p2p;
|
||||
};
|
||||
};
|
||||
static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must have a pow2 size");
|
||||
|
||||
#define COLLTRACE_NUM_ITEMS 1024
|
||||
#define COLLTRACE_NUM_ITEMS 8192
|
||||
#endif
|
||||
|
||||
struct ncclDevComm {
|
||||
|
||||
@@ -19,5 +19,7 @@ ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm);
|
||||
ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm);
|
||||
ncclResult_t ncclEnqueueEvents(struct ncclComm* comm);
|
||||
ncclResult_t ncclSaveKernel(struct ncclInfo* info);
|
||||
ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info);
|
||||
ncclResult_t ncclSaveCommKernels(struct ncclComm* comm);
|
||||
|
||||
#endif // End include guard
|
||||
|
||||
@@ -29,7 +29,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
|
||||
|
||||
// Query topology
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* net);
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read);
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
|
||||
|
||||
// Set CPU affinity
|
||||
@@ -40,20 +40,23 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
|
||||
#define NCCL_TOPO_CPU_ARCH_ARM 3
|
||||
#define NCCL_TOPO_CPU_VENDOR_INTEL 1
|
||||
#define NCCL_TOPO_CPU_VENDOR_AMD 2
|
||||
#define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3
|
||||
#define NCCL_TOPO_CPU_TYPE_BDW 1
|
||||
#define NCCL_TOPO_CPU_TYPE_SKL 2
|
||||
#define NCCL_TOPO_CPU_TYPE_ZEN 3
|
||||
#define NCCL_TOPO_CPU_TYPE_ROME 4
|
||||
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
|
||||
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
|
||||
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
|
||||
|
||||
#define NCCL_TOPO_MAX_NODES 256
|
||||
|
||||
// Init search. Needs to be done before calling ncclTopoCompute
|
||||
ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
|
||||
|
||||
#define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction
|
||||
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Split tree (send/recv from different ranks) flowing in both directions
|
||||
#define NCCL_TOPO_PATTERN_TREE 3 // Simple tree (send/recv from same rank) flowing in both directions
|
||||
#define NCCL_TOPO_PATTERN_BALANCED_TREE 1 // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU)
|
||||
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU)
|
||||
#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU
|
||||
#define NCCL_TOPO_PATTERN_RING 4 // Ring
|
||||
struct ncclTopoGraph {
|
||||
// Input / output
|
||||
@@ -84,17 +87,16 @@ struct ncclTopoRanks {
|
||||
int ringSend[MAXCHANNELS];
|
||||
int ringPrev[MAXCHANNELS];
|
||||
int ringNext[MAXCHANNELS];
|
||||
int treeUpRecv[MAXCHANNELS];
|
||||
int treeUpSend[MAXCHANNELS];
|
||||
int treeDnRecv[MAXCHANNELS];
|
||||
int treeDnSend[MAXCHANNELS];
|
||||
int treeToParent[MAXCHANNELS];
|
||||
int treeToChild0[MAXCHANNELS];
|
||||
int treeToChild1[MAXCHANNELS];
|
||||
};
|
||||
|
||||
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
|
||||
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
|
||||
struct ncclTopoRanks* topoRanks);
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
|
||||
struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets);
|
||||
|
||||
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
|
||||
|
||||
@@ -15,6 +15,9 @@
|
||||
#define NCCL_PTR_HOST 0x1
|
||||
#define NCCL_PTR_CUDA 0x2
|
||||
|
||||
// Maximum number of requests per comm object
|
||||
#define NCCL_NET_MAX_REQUESTS 8
|
||||
|
||||
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
|
||||
@@ -29,9 +32,9 @@ typedef struct {
|
||||
int speed; // Port speed in Mbps.
|
||||
int port; // Port number.
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
}ncclNetProperties_v3_t;
|
||||
}ncclNetProperties_v4_t;
|
||||
|
||||
typedef ncclNetProperties_v3_t ncclNetProperties_t;
|
||||
typedef ncclNetProperties_v4_t ncclNetProperties_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
@@ -41,7 +44,7 @@ typedef struct {
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
@@ -62,7 +65,7 @@ typedef struct {
|
||||
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
|
||||
ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
@@ -70,11 +73,11 @@ typedef struct {
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclNet_v3_t;
|
||||
} ncclNet_v4_t;
|
||||
|
||||
typedef ncclNet_v3_t ncclNet_t;
|
||||
typedef ncclNet_v4_t ncclNet_t;
|
||||
|
||||
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v3
|
||||
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v4
|
||||
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
@@ -85,7 +88,7 @@ typedef struct {
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
@@ -105,17 +108,17 @@ typedef struct {
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*flush)(void* collComm, void* data, int size, void* mhandle);
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclCollNet_v3_t;
|
||||
} ncclCollNet_v4_t;
|
||||
|
||||
typedef ncclCollNet_v3_t ncclCollNet_t;
|
||||
typedef ncclCollNet_v4_t ncclCollNet_t;
|
||||
|
||||
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v3
|
||||
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v4
|
||||
|
||||
#endif // end include guard
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -25,7 +25,7 @@ static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, voi
|
||||
static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclNet->flush(recvComm, data, size, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIflush(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, data, size, mhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -45,14 +45,6 @@ static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index)
|
||||
NVMLCHECK(nvmlDeviceGetIndex(device, index));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
|
||||
NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrapNvmlDeviceGetHandleByPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
|
||||
NVMLCHECK(nvmlDeviceGetPciInfo(device, pci));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
|
||||
NVMLCHECK(nvmlDeviceGetNvLinkState(device, link, isActive));
|
||||
return ncclSuccess;
|
||||
@@ -66,10 +58,6 @@ static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsig
|
||||
NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
|
||||
NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
|
||||
NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor));
|
||||
return ncclSuccess;
|
||||
@@ -150,12 +138,10 @@ ncclResult_t wrapNvmlShutdown(void);
|
||||
ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
|
||||
ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
|
||||
ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
|
||||
ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
|
||||
ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
|
||||
ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
|
||||
ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
|
||||
nvmlNvLinkCapability_t capability, unsigned int *capResult);
|
||||
ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
|
||||
ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
|
||||
|
||||
#endif // NVML_DIRECT
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_NVTX_H_
|
||||
#define NCCL_NVTX_H_
|
||||
|
||||
#include "nvtx3.hpp"
|
||||
|
||||
struct nccl_domain{static constexpr char const* name{"NCCL"};};
|
||||
|
||||
#endif
|
||||
Fichier diff supprimé car celui-ci est trop grand
Voir la Diff
Fichier diff supprimé car celui-ci est trop grand
Voir la Diff
@@ -0,0 +1,141 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "nvToolsExt.h"
|
||||
|
||||
#include "cuda.h"
|
||||
|
||||
#ifndef NVTOOLSEXT_CUDA_V3
|
||||
#define NVTOOLSEXT_CUDA_V3
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* ========================================================================= */
|
||||
/** \name Functions for CUDA Resource Naming
|
||||
*/
|
||||
/** \addtogroup RESOURCE_NAMING
|
||||
* \section RESOURCE_NAMING_CUDA CUDA Resource Naming
|
||||
*
|
||||
* This section covers the API functions that allow annotating CUDA resources
|
||||
* with user-provided names.
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief Used to build a non-colliding value for resource types separated class
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
#define NVTX_RESOURCE_CLASS_CUDA 4
|
||||
/** \endcond */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Resource types for CUDA
|
||||
*/
|
||||
typedef enum nvtxResourceCUDAType_t
|
||||
{
|
||||
NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
|
||||
NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
|
||||
NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
|
||||
NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */
|
||||
} nvtxResourceCUDAType_t;
|
||||
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates a CUDA device.
|
||||
*
|
||||
* Allows the user to associate a CUDA device with a user-provided name.
|
||||
*
|
||||
* \param device - The handle of the CUDA device to name.
|
||||
* \param name - The name of the CUDA device.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates a CUDA context.
|
||||
*
|
||||
* Allows the user to associate a CUDA context with a user-provided name.
|
||||
*
|
||||
* \param context - The handle of the CUDA context to name.
|
||||
* \param name - The name of the CUDA context.
|
||||
*
|
||||
* \par Example:
|
||||
* \code
|
||||
* CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
|
||||
* if ( CUDA_SUCCESS != status )
|
||||
* goto Error;
|
||||
* nvtxNameCuContext(cuContext, "CTX_NAME");
|
||||
* \endcode
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates a CUDA stream.
|
||||
*
|
||||
* Allows the user to associate a CUDA stream with a user-provided name.
|
||||
*
|
||||
* \param stream - The handle of the CUDA stream to name.
|
||||
* \param name - The name of the CUDA stream.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates a CUDA event.
|
||||
*
|
||||
* Allows the user to associate a CUDA event with a user-provided name.
|
||||
*
|
||||
* \param event - The handle of the CUDA event to name.
|
||||
* \param name - The name of the CUDA event.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/** @} */ /* END RESOURCE_NAMING */
|
||||
|
||||
/* ========================================================================= */
|
||||
#ifdef UNICODE
|
||||
#define nvtxNameCuDevice nvtxNameCuDeviceW
|
||||
#define nvtxNameCuContext nvtxNameCuContextW
|
||||
#define nvtxNameCuStream nvtxNameCuStreamW
|
||||
#define nvtxNameCuEvent nvtxNameCuEventW
|
||||
#else
|
||||
#define nvtxNameCuDevice nvtxNameCuDeviceA
|
||||
#define nvtxNameCuContext nvtxNameCuContextA
|
||||
#define nvtxNameCuStream nvtxNameCuStreamA
|
||||
#define nvtxNameCuEvent nvtxNameCuEventA
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#ifndef NVTX_NO_IMPL
|
||||
#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot be included directly */
|
||||
#include "nvtxDetail/nvtxImplCuda_v3.h"
|
||||
#undef NVTX_IMPL_GUARD_CUDA
|
||||
#endif /*NVTX_NO_IMPL*/
|
||||
|
||||
#endif /* NVTOOLSEXT_CUDA_V3 */
|
||||
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "nvToolsExt.h"
|
||||
|
||||
#include "cuda.h"
|
||||
#include "driver_types.h"
|
||||
|
||||
#ifndef NVTOOLSEXT_CUDART_V3
|
||||
#define NVTOOLSEXT_CUDART_V3
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* ========================================================================= */
|
||||
/** \name Functions for CUDA Resource Naming
|
||||
*/
|
||||
/** \addtogroup RESOURCE_NAMING
|
||||
* \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming
|
||||
*
|
||||
* This section covers the API functions that allow annotating CUDA resources
|
||||
* with user-provided names.
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief Used to build a non-colliding value for resource types separated class
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
#define NVTX_RESOURCE_CLASS_CUDART 5
|
||||
/** \endcond */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Resource types for CUDART
|
||||
*/
|
||||
typedef enum nvtxResourceCUDARTType_t
|
||||
{
|
||||
NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */
|
||||
NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */
|
||||
NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */
|
||||
} nvtxResourceCUDARTType_t;
|
||||
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates a CUDA device.
|
||||
*
|
||||
* Allows the user to associate a CUDA device with a user-provided name.
|
||||
*
|
||||
* \param device - The id of the CUDA device to name.
|
||||
* \param name - The name of the CUDA device.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates a CUDA stream.
|
||||
*
|
||||
* Allows the user to associate a CUDA stream with a user-provided name.
|
||||
*
|
||||
* \param stream - The handle of the CUDA stream to name.
|
||||
* \param name - The name of the CUDA stream.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates a CUDA event.
|
||||
*
|
||||
* Allows the user to associate a CUDA event with a user-provided name.
|
||||
*
|
||||
* \param event - The handle of the CUDA event to name.
|
||||
* \param name - The name of the CUDA event.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/** @} */ /* END RESOURCE_NAMING */
|
||||
|
||||
/* ========================================================================= */
|
||||
#ifdef UNICODE
|
||||
#define nvtxNameCudaDevice nvtxNameCudaDeviceW
|
||||
#define nvtxNameCudaStream nvtxNameCudaStreamW
|
||||
#define nvtxNameCudaEvent nvtxNameCudaEventW
|
||||
#else
|
||||
#define nvtxNameCudaDevice nvtxNameCudaDeviceA
|
||||
#define nvtxNameCudaStream nvtxNameCudaStreamA
|
||||
#define nvtxNameCudaEvent nvtxNameCudaEventA
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#ifndef NVTX_NO_IMPL
|
||||
#define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot be included directly */
|
||||
#include "nvtxDetail/nvtxImplCudaRt_v3.h"
|
||||
#undef NVTX_IMPL_GUARD_CUDART
|
||||
#endif /*NVTX_NO_IMPL*/
|
||||
|
||||
#endif /* NVTOOLSEXT_CUDART_V3 */
|
||||
@@ -0,0 +1,191 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "nvToolsExt.h"
|
||||
|
||||
#include <CL/cl.h>
|
||||
|
||||
#ifndef NVTOOLSEXT_OPENCL_V3
|
||||
#define NVTOOLSEXT_OPENCL_V3
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* ========================================================================= */
|
||||
/** \name Functions for OpenCL Resource Naming
|
||||
*/
|
||||
/** \addtogroup RESOURCE_NAMING
|
||||
* \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
|
||||
*
|
||||
* This section covers the API functions that allow annotating OpenCL resources
|
||||
* with user-provided names.
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* \cond SHOW_HIDDEN
|
||||
* \brief Used to build a non-colliding value for resource types separated class
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
#define NVTX_RESOURCE_CLASS_OPENCL 6
|
||||
/** \endcond */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Resource types for OpenCL
|
||||
*/
|
||||
typedef enum nvtxResourceOpenCLType_t
|
||||
{
|
||||
NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1),
|
||||
NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2),
|
||||
NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3),
|
||||
NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4),
|
||||
NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5),
|
||||
NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6),
|
||||
NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7),
|
||||
} nvtxResourceOpenCLType_t;
|
||||
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates an OpenCL device.
|
||||
*
|
||||
* Allows the user to associate an OpenCL device with a user-provided name.
|
||||
*
|
||||
* \param device - The handle of the OpenCL device to name.
|
||||
* \param name - The name of the OpenCL device.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates an OpenCL context.
|
||||
*
|
||||
* Allows the user to associate an OpenCL context with a user-provided name.
|
||||
*
|
||||
* \param context - The handle of the OpenCL context to name.
|
||||
* \param name - The name of the OpenCL context.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates an OpenCL command queue.
|
||||
*
|
||||
* Allows the user to associate an OpenCL command queue with a user-provided name.
|
||||
*
|
||||
* \param command_queue - The handle of the OpenCL command queue to name.
|
||||
* \param name - The name of the OpenCL command queue.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates an OpenCL memory object.
|
||||
*
|
||||
* Allows the user to associate an OpenCL memory object with a user-provided name.
|
||||
*
|
||||
* \param memobj - The handle of the OpenCL memory object to name.
|
||||
* \param name - The name of the OpenCL memory object.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates an OpenCL sampler.
|
||||
*
|
||||
* Allows the user to associate an OpenCL sampler with a user-provided name.
|
||||
*
|
||||
* \param sampler - The handle of the OpenCL sampler to name.
|
||||
* \param name - The name of the OpenCL sampler.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates an OpenCL program.
|
||||
*
|
||||
* Allows the user to associate an OpenCL program with a user-provided name.
|
||||
*
|
||||
* \param program - The handle of the OpenCL program to name.
|
||||
* \param name - The name of the OpenCL program.
|
||||
*
|
||||
* \code
|
||||
* cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
|
||||
* (const char **) &cSourceCL, &program_length, &ciErrNum);
|
||||
* shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
* nvtxNameClProgram(cpProgram, L"PROGRAM_NAME");
|
||||
* \endcode
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Annotates an OpenCL event.
|
||||
*
|
||||
* Allows the user to associate an OpenCL event with a user-provided name.
|
||||
*
|
||||
* \param evnt - The handle of the OpenCL event to name.
|
||||
* \param name - The name of the OpenCL event.
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/** @} */ /* END RESOURCE_NAMING */
|
||||
|
||||
/* ========================================================================= */
|
||||
/*
 * Character-set neutral aliases for the OpenCL naming functions.
 * Each unsuffixed name maps to the wide-character (W) entry point when
 * UNICODE is defined and to the narrow-character (A) entry point otherwise.
 */
#if defined(UNICODE)
    #define nvtxNameClDevice        nvtxNameClDeviceW
    #define nvtxNameClContext       nvtxNameClContextW
    #define nvtxNameClCommandQueue  nvtxNameClCommandQueueW
    #define nvtxNameClMemObject     nvtxNameClMemObjectW
    #define nvtxNameClSampler       nvtxNameClSamplerW
    #define nvtxNameClProgram       nvtxNameClProgramW
    #define nvtxNameClEvent         nvtxNameClEventW
#else /* !defined(UNICODE) */
    #define nvtxNameClDevice        nvtxNameClDeviceA
    #define nvtxNameClContext       nvtxNameClContextA
    #define nvtxNameClCommandQueue  nvtxNameClCommandQueueA
    #define nvtxNameClMemObject     nvtxNameClMemObjectA
    #define nvtxNameClSampler       nvtxNameClSamplerA
    #define nvtxNameClProgram       nvtxNameClProgramA
    #define nvtxNameClEvent         nvtxNameClEventA
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#ifndef NVTX_NO_IMPL
|
||||
#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot be included directly */
|
||||
#include "nvtxDetail/nvtxImplOpenCL_v3.h"
|
||||
#undef NVTX_IMPL_GUARD_OPENCL
|
||||
#endif /*NVTX_NO_IMPL*/
|
||||
|
||||
#endif /* NVTOOLSEXT_OPENCL_V3 */
|
||||
@@ -0,0 +1,382 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#include "nvToolsExt.h"
|
||||
|
||||
#ifndef NVTOOLSEXT_SYNC_V3
|
||||
#define NVTOOLSEXT_SYNC_V3
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* \cond SHOW_HIDDEN
 * \brief Size in bytes of nvtxSyncUserAttributes_v0, cast to uint16_t.
 * \version \NVTX_VERSION_2
 */
#define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ((uint16_t)sizeof(nvtxSyncUserAttributes_v0))
/** \endcond */
|
||||
|
||||
|
||||
/**
|
||||
* \page PAGE_SYNCHRONIZATION Synchronization
|
||||
*
|
||||
* This section covers a subset of the API that allows users to track additional
|
||||
* synchronization details of their application. Naming OS synchronization primitives
|
||||
* may allow users to better understand the data collected by traced synchronization
|
||||
* APIs. Additionally, a user defined synchronization object can allow the users to
|
||||
* tell the tools when the user is building their own synchronization system
|
||||
* that does not rely on the OS to provide behaviors and instead uses techniques like
|
||||
* atomic operations and spinlocks.
|
||||
*
|
||||
* See module \ref SYNCHRONIZATION for details.
|
||||
*
|
||||
* \par Example:
|
||||
* \code
|
||||
* class MyMutex
|
||||
* {
|
||||
* volatile long bLocked;
|
||||
* nvtxSyncUser_t hSync;
|
||||
* public:
|
||||
* MyMutex(const char* name, nvtxDomainHandle_t d){
|
||||
* bLocked = 0;
|
||||
*
|
||||
* nvtxSyncUserAttributes_t attribs = { 0 };
|
||||
* attribs.version = NVTX_VERSION;
|
||||
* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
|
||||
* attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
|
||||
* attribs.message.ascii = name;
|
||||
* hSync = nvtxDomainSyncUserCreate(d, &attribs);
|
||||
* }
|
||||
*
|
||||
* ~MyMutex() {
|
||||
* nvtxDomainSyncUserDestroy(hSync);
|
||||
* }
|
||||
*
|
||||
* bool Lock() {
|
||||
* nvtxDomainSyncUserAcquireStart(hSync);
|
||||
* bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic
|
||||
|
||||
* if (acquired) {
|
||||
* nvtxDomainSyncUserAcquireSuccess(hSync);
|
||||
* }
|
||||
* else {
|
||||
* nvtxDomainSyncUserAcquireFailed(hSync);
|
||||
* }
|
||||
* return acquired;
|
||||
* }
|
||||
|
||||
* void Unlock() {
|
||||
* nvtxDomainSyncUserReleasing(hSync);
|
||||
* bLocked = false;
|
||||
* }
|
||||
* };
|
||||
* \endcode
|
||||
*
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* \cond SHOW_HIDDEN
 * \brief Resource class identifiers used to build non-colliding resource
 *        type values, with one class per group of synchronization APIs.
 * \version \NVTX_VERSION_2
 */
#define NVTX_RESOURCE_CLASS_SYNC_OS       2 /**< Synchronization objects that are OS specific. */
#define NVTX_RESOURCE_CLASS_SYNC_PTHREAD  3 /**< Synchronization objects that are from the POSIX Threads API (pthread). */
/** \endcond */
|
||||
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \defgroup SYNCHRONIZATION Synchronization
|
||||
* See page \ref PAGE_SYNCHRONIZATION.
|
||||
* @{
|
||||
*/
|
||||
|
||||
/** \brief Resource type values for OSs with POSIX Thread API support
|
||||
*/
|
||||
typedef enum nvtxResourceSyncPosixThreadType_t
|
||||
{
|
||||
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 1), /* pthread_mutex_t */
|
||||
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_CONDITION = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 2), /* pthread_cond_t */
|
||||
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_RWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 3), /* pthread_rwlock_t */
|
||||
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_BARRIER = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 4), /* pthread_barrier_t */
|
||||
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 5), /* pthread_spinlock_t */
|
||||
NVTX_RESOURCE_TYPE_SYNC_PTHREAD_ONCE = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 6) /* pthread_once_t */
|
||||
} nvtxResourceSyncPosixThreadType_t;
|
||||
|
||||
/** \brief Resource type values for Windows OSs
|
||||
*/
|
||||
typedef enum nvtxResourceSyncWindowsType_t
|
||||
{
|
||||
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
|
||||
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
|
||||
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_EVENT = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
|
||||
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_CRITICAL_SECTION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
|
||||
NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SRWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5)
|
||||
} nvtxResourceSyncWindowsType_t;
|
||||
|
||||
/** \brief Resource type values for Linux and Linux derived OSs such as Android
|
||||
* \sa
|
||||
* ::nvtxResourceSyncPosixThreadType_t
|
||||
*/
|
||||
typedef enum nvtxResourceSyncLinuxType_t
|
||||
{
|
||||
NVTX_RESOURCE_TYPE_SYNC_LINUX_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
|
||||
NVTX_RESOURCE_TYPE_SYNC_LINUX_FUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
|
||||
NVTX_RESOURCE_TYPE_SYNC_LINUX_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
|
||||
NVTX_RESOURCE_TYPE_SYNC_LINUX_COMPLETION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
|
||||
NVTX_RESOURCE_TYPE_SYNC_LINUX_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5),
|
||||
NVTX_RESOURCE_TYPE_SYNC_LINUX_SEQLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 6),
|
||||
NVTX_RESOURCE_TYPE_SYNC_LINUX_RCU = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 7)
|
||||
} nvtxResourceSyncLinuxType_t;
|
||||
|
||||
/** \brief Resource type values for Android, which reuses the Linux values.
 *
 * This is a plain alias of the Linux enumeration; no additional values are
 * introduced.
 *
 * \sa
 * ::nvtxResourceSyncLinuxType_t
 * ::nvtxResourceSyncPosixThreadType_t
 */
typedef enum nvtxResourceSyncLinuxType_t nvtxResourceSyncAndroidType_t;
|
||||
|
||||
/** \brief User Defined Synchronization Object Handle.
 * \anchor SYNCUSER_HANDLE_STRUCTURE
 *
 * Opaque handle type used to reference a user defined synchronization
 * object. The tools return a pointer through the API (see
 * ::nvtxDomainSyncUserCreate) and the application holds it on their behalf
 * in order to reference the object in later calls.
 */
typedef struct nvtxSyncUser* nvtxSyncUser_t;
|
||||
|
||||
/** \brief User Defined Synchronization Object Attributes Structure.
|
||||
* \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE
|
||||
*
|
||||
* This structure is used to describe the attributes of a user defined synchronization
|
||||
* object. The layout of the structure is defined by a specific version of the tools
|
||||
* extension library and can change between different versions of the Tools Extension
|
||||
* library.
|
||||
*
|
||||
* \par Initializing the Attributes
|
||||
*
|
||||
* The caller should always perform the following three tasks when using
|
||||
* attributes:
|
||||
* <ul>
|
||||
* <li>Zero the structure
|
||||
* <li>Set the version field
|
||||
* <li>Set the size field
|
||||
* </ul>
|
||||
*
|
||||
* Zeroing the structure sets all the event attributes types and values
|
||||
* to the default value.
|
||||
*
|
||||
* The version and size field are used by the Tools Extension
|
||||
* implementation to handle multiple versions of the attributes structure.
|
||||
*
|
||||
* It is recommended that the caller use one of the following two methods
|
||||
* to initialize the event attributes structure:
|
||||
*
|
||||
* \par Method 1: Initializing nvtxSyncUserAttributes_t for future compatibility
|
||||
* \code
|
||||
* nvtxSyncUserAttributes_t attribs = {0};
|
||||
* attribs.version = NVTX_VERSION;
|
||||
* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
|
||||
* \endcode
|
||||
*
|
||||
* \par Method 2: Initializing nvtxSyncUserAttributes_t for a specific version
|
||||
* \code
|
||||
* nvtxSyncUserAttributes_t attribs = {0};
|
||||
* attribs.version = 1;
|
||||
* attribs.size = (uint16_t)(sizeof(nvtxSyncUserAttributes_t));
|
||||
* \endcode
|
||||
*
|
||||
* If the caller uses Method 1 it is critical that the entire binary
|
||||
* layout of the structure be configured to 0 so that all fields
|
||||
* are initialized to the default value.
|
||||
*
|
||||
* The caller should either use both NVTX_VERSION and
|
||||
* NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
|
||||
* and a versioned type (Method 2). Using a mix of the two methods
|
||||
* will likely cause either source level incompatibility or binary
|
||||
* incompatibility in the future.
|
||||
*
|
||||
* \par Settings Attribute Types and Values
|
||||
*
|
||||
*
|
||||
* \par Example:
|
||||
* \code
|
||||
* // Initialize
|
||||
* nvtxSyncUserAttributes_t attribs = {0};
|
||||
* attribs.version = NVTX_VERSION;
|
||||
* attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
|
||||
*
|
||||
* // Configure the Attributes
|
||||
* attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
|
||||
* attribs.message.ascii = "Example";
|
||||
* \endcode
|
||||
*
|
||||
* \sa
|
||||
* ::nvtxDomainSyncUserCreate
|
||||
*/
|
||||
typedef struct nvtxSyncUserAttributes_v0
|
||||
{
|
||||
/**
|
||||
* \brief Version flag of the structure.
|
||||
*
|
||||
* Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
|
||||
* supported in this header file. This can optionally be overridden to
|
||||
* another version of the tools extension library.
|
||||
*/
|
||||
uint16_t version;
|
||||
|
||||
/**
|
||||
* \brief Size of the structure.
|
||||
*
|
||||
* Needs to be set to the size in bytes of the event attribute
|
||||
* structure used to specify the event.
|
||||
*/
|
||||
uint16_t size;
|
||||
|
||||
/** \brief Message type specified in this attribute structure.
|
||||
*
|
||||
* Defines the message format of the attribute structure's \ref nvtxSyncUserAttributes_v0::message
|
||||
* "message" field.
|
||||
*
|
||||
* Default Value is NVTX_MESSAGE_UNKNOWN
|
||||
*/
|
||||
int32_t messageType; /* nvtxMessageType_t */
|
||||
|
||||
/** \brief Message assigned to this attribute structure.
|
||||
*
|
||||
* The text message that is attached to an event.
|
||||
*/
|
||||
nvtxMessageValue_t message;
|
||||
|
||||
} nvtxSyncUserAttributes_v0;
|
||||
|
||||
typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Create a user defined synchronization object
|
||||
* This is used to track non-OS synchronization working with spinlocks and atomics
|
||||
*
|
||||
* \param domain - Domain to own the resource
|
||||
* \param attribs - A structure to assign multiple attributes to the object.
|
||||
*
|
||||
* \return A handle that represents the newly created user defined synchronization object.
|
||||
*
|
||||
* \sa
|
||||
* ::nvtxDomainSyncUserCreate
|
||||
* ::nvtxDomainSyncUserDestroy
|
||||
* ::nvtxDomainSyncUserAcquireStart
|
||||
* ::nvtxDomainSyncUserAcquireFailed
|
||||
* ::nvtxDomainSyncUserAcquireSuccess
|
||||
* ::nvtxDomainSyncUserReleasing
|
||||
*
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Destroy a user defined synchronization object
|
||||
* This is used to track non-OS synchronization working with spinlocks and atomics
|
||||
*
|
||||
* \param handle - A handle to the object to operate on.
|
||||
*
|
||||
* \sa
|
||||
* ::nvtxDomainSyncUserCreate
|
||||
* ::nvtxDomainSyncUserDestroy
|
||||
* ::nvtxDomainSyncUserAcquireStart
|
||||
* ::nvtxDomainSyncUserAcquireFailed
|
||||
* ::nvtxDomainSyncUserAcquireSuccess
|
||||
* ::nvtxDomainSyncUserReleasing
|
||||
*
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle);
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Signal to tools that an attempt to acquire a user defined synchronization object
|
||||
*
|
||||
* \param handle - A handle to the object to operate on.
|
||||
*
|
||||
* \sa
|
||||
* ::nvtxDomainSyncUserCreate
|
||||
* ::nvtxDomainSyncUserDestroy
|
||||
* ::nvtxDomainSyncUserAcquireStart
|
||||
* ::nvtxDomainSyncUserAcquireFailed
|
||||
* ::nvtxDomainSyncUserAcquireSuccess
|
||||
* ::nvtxDomainSyncUserReleasing
|
||||
*
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle);
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Signals to tools that acquiring a user defined synchronization
 *         object failed.
 *
 * This should be called after \ref nvtxDomainSyncUserAcquireStart.
 *
 * \param handle - Handle of the object to operate on.
 *
 * \sa
 * ::nvtxDomainSyncUserCreate
 * ::nvtxDomainSyncUserDestroy
 * ::nvtxDomainSyncUserAcquireStart
 * ::nvtxDomainSyncUserAcquireFailed
 * ::nvtxDomainSyncUserAcquireSuccess
 * ::nvtxDomainSyncUserReleasing
 *
 * \version \NVTX_VERSION_2
 */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(
    nvtxSyncUser_t handle);
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Signals to tools that acquiring a user defined synchronization
 *         object succeeded.
 *
 * This should be called after \ref nvtxDomainSyncUserAcquireStart.
 *
 * \param handle - Handle of the object to operate on.
 *
 * \sa
 * ::nvtxDomainSyncUserCreate
 * ::nvtxDomainSyncUserDestroy
 * ::nvtxDomainSyncUserAcquireStart
 * ::nvtxDomainSyncUserAcquireFailed
 * ::nvtxDomainSyncUserAcquireSuccess
 * ::nvtxDomainSyncUserReleasing
 *
 * \version \NVTX_VERSION_2
 */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(
    nvtxSyncUser_t handle);
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/** \brief Signal to tools of releasing a reservation on user defined synchronization object
|
||||
* This should be called after \ref nvtxDomainSyncUserAcquireSuccess.
|
||||
*
|
||||
* \param handle - A handle to the object to operate on.
|
||||
*
|
||||
* \sa
|
||||
* ::nvtxDomainSyncUserCreate
|
||||
* ::nvtxDomainSyncUserDestroy
|
||||
* ::nvtxDomainSyncUserAcquireStart
|
||||
* ::nvtxDomainSyncUserAcquireFailed
|
||||
* ::nvtxDomainSyncUserAcquireSuccess
|
||||
* ::nvtxDomainSyncUserReleasing
|
||||
*
|
||||
* \version \NVTX_VERSION_2
|
||||
*/
|
||||
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle);
|
||||
|
||||
|
||||
/** @} */ /*END defgroup*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#ifndef NVTX_NO_IMPL
|
||||
#define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot be included directly */
|
||||
#include "nvtxDetail/nvtxImplSync_v3.h"
|
||||
#undef NVTX_IMPL_GUARD_SYNC
|
||||
#endif /*NVTX_NO_IMPL*/
|
||||
|
||||
#endif /* NVTOOLSEXT_SYNC_V3 */
|
||||
@@ -0,0 +1,438 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_IMPL_GUARD
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
/* ---- Include required platform headers ---- */
|
||||
|
||||
#if defined(_WIN32)
|
||||
|
||||
#include <Windows.h>
|
||||
|
||||
#else
|
||||
#include <unistd.h>
|
||||
|
||||
#if defined(__ANDROID__)
|
||||
#include <android/api-level.h>
|
||||
#endif
|
||||
|
||||
#if defined(__linux__) || defined(__CYGWIN__)
|
||||
#include <sched.h>
|
||||
#endif
|
||||
|
||||
#include <limits.h>
|
||||
#include <dlfcn.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <pthread.h>
|
||||
#include <stdlib.h>
|
||||
#include <wchar.h>
|
||||
|
||||
#endif
|
||||
|
||||
/* ---- Define macros used in this file ---- */
|
||||
|
||||
/*
 * Values stored in nvtxGlobals_t::initState.
 * FRESH is the value the globals are statically initialized with; STARTED
 * and COMPLETE presumably mark initialization in progress and finished
 * (the code that sets them lives outside this header section).
 */
#define NVTX_INIT_STATE_FRESH     0
#define NVTX_INIT_STATE_STARTED   1
#define NVTX_INIT_STATE_COMPLETE  2
|
||||
|
||||
/*
 * Diagnostic logging macros.
 *
 * With NVTX_DEBUG_PRINT defined, NVTX_ERR and NVTX_INFO forward their
 * printf-style arguments to __android_log_print (tag "NVTOOLSEXT") on
 * Android, or to fprintf(stderr, ...) with an "NVTX_ERROR: " or
 * "NVTX_INFO: " prefix elsewhere. Without NVTX_DEBUG_PRINT both macros
 * expand to nothing.
 *
 * Every variant expands to a single expression with no trailing semicolon:
 * the caller supplies the ';'. This keeps the macros safe inside unbraced
 * if/else statements and makes all three configurations behave alike.
 */
#ifdef NVTX_DEBUG_PRINT
#ifdef __ANDROID__
#include <android/log.h>
#define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__)
#define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__)
#else
#include <stdio.h>
/* The prefix is joined to the format string by string-literal
 * concatenation, so the first argument must be a string literal. */
#define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
#define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
#endif
#else /* !defined(NVTX_DEBUG_PRINT) */
#define NVTX_ERR(...)
#define NVTX_INFO(...)
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC visibility push(hidden)
|
||||
#endif
|
||||
|
||||
/* ---- Forward declare all functions referenced in globals ---- */
|
||||
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
|
||||
NvtxCallbackModule module,
|
||||
NvtxFunctionTable* out_table,
|
||||
unsigned int* out_size);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(
|
||||
uint32_t version);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(
|
||||
uint32_t exportTableId);
|
||||
|
||||
#include "nvtxInitDecls.h"
|
||||
|
||||
/* ---- Define all globals ---- */
|
||||
|
||||
typedef struct nvtxGlobals_t
|
||||
{
|
||||
volatile unsigned int initState;
|
||||
NvtxExportTableCallbacks etblCallbacks;
|
||||
NvtxExportTableVersionInfo etblVersionInfo;
|
||||
|
||||
/* Implementation function pointers */
|
||||
nvtxMarkEx_impl_fntype nvtxMarkEx_impl_fnptr;
|
||||
nvtxMarkA_impl_fntype nvtxMarkA_impl_fnptr;
|
||||
nvtxMarkW_impl_fntype nvtxMarkW_impl_fnptr;
|
||||
nvtxRangeStartEx_impl_fntype nvtxRangeStartEx_impl_fnptr;
|
||||
nvtxRangeStartA_impl_fntype nvtxRangeStartA_impl_fnptr;
|
||||
nvtxRangeStartW_impl_fntype nvtxRangeStartW_impl_fnptr;
|
||||
nvtxRangeEnd_impl_fntype nvtxRangeEnd_impl_fnptr;
|
||||
nvtxRangePushEx_impl_fntype nvtxRangePushEx_impl_fnptr;
|
||||
nvtxRangePushA_impl_fntype nvtxRangePushA_impl_fnptr;
|
||||
nvtxRangePushW_impl_fntype nvtxRangePushW_impl_fnptr;
|
||||
nvtxRangePop_impl_fntype nvtxRangePop_impl_fnptr;
|
||||
nvtxNameCategoryA_impl_fntype nvtxNameCategoryA_impl_fnptr;
|
||||
nvtxNameCategoryW_impl_fntype nvtxNameCategoryW_impl_fnptr;
|
||||
nvtxNameOsThreadA_impl_fntype nvtxNameOsThreadA_impl_fnptr;
|
||||
nvtxNameOsThreadW_impl_fntype nvtxNameOsThreadW_impl_fnptr;
|
||||
|
||||
nvtxNameCuDeviceA_fakeimpl_fntype nvtxNameCuDeviceA_impl_fnptr;
|
||||
nvtxNameCuDeviceW_fakeimpl_fntype nvtxNameCuDeviceW_impl_fnptr;
|
||||
nvtxNameCuContextA_fakeimpl_fntype nvtxNameCuContextA_impl_fnptr;
|
||||
nvtxNameCuContextW_fakeimpl_fntype nvtxNameCuContextW_impl_fnptr;
|
||||
nvtxNameCuStreamA_fakeimpl_fntype nvtxNameCuStreamA_impl_fnptr;
|
||||
nvtxNameCuStreamW_fakeimpl_fntype nvtxNameCuStreamW_impl_fnptr;
|
||||
nvtxNameCuEventA_fakeimpl_fntype nvtxNameCuEventA_impl_fnptr;
|
||||
nvtxNameCuEventW_fakeimpl_fntype nvtxNameCuEventW_impl_fnptr;
|
||||
|
||||
nvtxNameClDeviceA_fakeimpl_fntype nvtxNameClDeviceA_impl_fnptr;
|
||||
nvtxNameClDeviceW_fakeimpl_fntype nvtxNameClDeviceW_impl_fnptr;
|
||||
nvtxNameClContextA_fakeimpl_fntype nvtxNameClContextA_impl_fnptr;
|
||||
nvtxNameClContextW_fakeimpl_fntype nvtxNameClContextW_impl_fnptr;
|
||||
nvtxNameClCommandQueueA_fakeimpl_fntype nvtxNameClCommandQueueA_impl_fnptr;
|
||||
nvtxNameClCommandQueueW_fakeimpl_fntype nvtxNameClCommandQueueW_impl_fnptr;
|
||||
nvtxNameClMemObjectA_fakeimpl_fntype nvtxNameClMemObjectA_impl_fnptr;
|
||||
nvtxNameClMemObjectW_fakeimpl_fntype nvtxNameClMemObjectW_impl_fnptr;
|
||||
nvtxNameClSamplerA_fakeimpl_fntype nvtxNameClSamplerA_impl_fnptr;
|
||||
nvtxNameClSamplerW_fakeimpl_fntype nvtxNameClSamplerW_impl_fnptr;
|
||||
nvtxNameClProgramA_fakeimpl_fntype nvtxNameClProgramA_impl_fnptr;
|
||||
nvtxNameClProgramW_fakeimpl_fntype nvtxNameClProgramW_impl_fnptr;
|
||||
nvtxNameClEventA_fakeimpl_fntype nvtxNameClEventA_impl_fnptr;
|
||||
nvtxNameClEventW_fakeimpl_fntype nvtxNameClEventW_impl_fnptr;
|
||||
|
||||
nvtxNameCudaDeviceA_impl_fntype nvtxNameCudaDeviceA_impl_fnptr;
|
||||
nvtxNameCudaDeviceW_impl_fntype nvtxNameCudaDeviceW_impl_fnptr;
|
||||
nvtxNameCudaStreamA_fakeimpl_fntype nvtxNameCudaStreamA_impl_fnptr;
|
||||
nvtxNameCudaStreamW_fakeimpl_fntype nvtxNameCudaStreamW_impl_fnptr;
|
||||
nvtxNameCudaEventA_fakeimpl_fntype nvtxNameCudaEventA_impl_fnptr;
|
||||
nvtxNameCudaEventW_fakeimpl_fntype nvtxNameCudaEventW_impl_fnptr;
|
||||
|
||||
nvtxDomainMarkEx_impl_fntype nvtxDomainMarkEx_impl_fnptr;
|
||||
nvtxDomainRangeStartEx_impl_fntype nvtxDomainRangeStartEx_impl_fnptr;
|
||||
nvtxDomainRangeEnd_impl_fntype nvtxDomainRangeEnd_impl_fnptr;
|
||||
nvtxDomainRangePushEx_impl_fntype nvtxDomainRangePushEx_impl_fnptr;
|
||||
nvtxDomainRangePop_impl_fntype nvtxDomainRangePop_impl_fnptr;
|
||||
nvtxDomainResourceCreate_impl_fntype nvtxDomainResourceCreate_impl_fnptr;
|
||||
nvtxDomainResourceDestroy_impl_fntype nvtxDomainResourceDestroy_impl_fnptr;
|
||||
nvtxDomainNameCategoryA_impl_fntype nvtxDomainNameCategoryA_impl_fnptr;
|
||||
nvtxDomainNameCategoryW_impl_fntype nvtxDomainNameCategoryW_impl_fnptr;
|
||||
nvtxDomainRegisterStringA_impl_fntype nvtxDomainRegisterStringA_impl_fnptr;
|
||||
nvtxDomainRegisterStringW_impl_fntype nvtxDomainRegisterStringW_impl_fnptr;
|
||||
nvtxDomainCreateA_impl_fntype nvtxDomainCreateA_impl_fnptr;
|
||||
nvtxDomainCreateW_impl_fntype nvtxDomainCreateW_impl_fnptr;
|
||||
nvtxDomainDestroy_impl_fntype nvtxDomainDestroy_impl_fnptr;
|
||||
nvtxInitialize_impl_fntype nvtxInitialize_impl_fnptr;
|
||||
|
||||
nvtxDomainSyncUserCreate_impl_fntype nvtxDomainSyncUserCreate_impl_fnptr;
|
||||
nvtxDomainSyncUserDestroy_impl_fntype nvtxDomainSyncUserDestroy_impl_fnptr;
|
||||
nvtxDomainSyncUserAcquireStart_impl_fntype nvtxDomainSyncUserAcquireStart_impl_fnptr;
|
||||
nvtxDomainSyncUserAcquireFailed_impl_fntype nvtxDomainSyncUserAcquireFailed_impl_fnptr;
|
||||
nvtxDomainSyncUserAcquireSuccess_impl_fntype nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
|
||||
nvtxDomainSyncUserReleasing_impl_fntype nvtxDomainSyncUserReleasing_impl_fnptr;
|
||||
|
||||
/* Tables of function pointers -- Extra null added to the end to ensure
|
||||
* a crash instead of silent corruption if a tool reads off the end. */
|
||||
NvtxFunctionPointer* functionTable_CORE [NVTX_CBID_CORE_SIZE + 1];
|
||||
NvtxFunctionPointer* functionTable_CUDA [NVTX_CBID_CUDA_SIZE + 1];
|
||||
NvtxFunctionPointer* functionTable_OPENCL[NVTX_CBID_OPENCL_SIZE + 1];
|
||||
NvtxFunctionPointer* functionTable_CUDART[NVTX_CBID_CUDART_SIZE + 1];
|
||||
NvtxFunctionPointer* functionTable_CORE2 [NVTX_CBID_CORE2_SIZE + 1];
|
||||
NvtxFunctionPointer* functionTable_SYNC [NVTX_CBID_SYNC_SIZE + 1];
|
||||
} nvtxGlobals_t;
|
||||
|
||||
NVTX_LINKONCE_DEFINE_GLOBAL nvtxGlobals_t NVTX_VERSIONED_IDENTIFIER(nvtxGlobals) =
|
||||
{
|
||||
NVTX_INIT_STATE_FRESH,
|
||||
|
||||
{
|
||||
sizeof(NvtxExportTableCallbacks),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)
|
||||
},
|
||||
{
|
||||
sizeof(NvtxExportTableVersionInfo),
|
||||
NVTX_VERSION,
|
||||
0,
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)
|
||||
},
|
||||
|
||||
/* Implementation function pointers */
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init),
|
||||
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init),
|
||||
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init),
|
||||
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init),
|
||||
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init),
|
||||
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init),
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init),
|
||||
|
||||
/* Tables of function pointers */
|
||||
{
|
||||
0,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr,
|
||||
0
|
||||
},
|
||||
{
|
||||
0,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr,
|
||||
0
|
||||
},
|
||||
{
|
||||
0,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr,
|
||||
0
|
||||
},
|
||||
{
|
||||
0,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr,
|
||||
0
|
||||
},
|
||||
{
|
||||
0,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr,
|
||||
0
|
||||
},
|
||||
{
|
||||
0,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr,
|
||||
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr,
|
||||
0
|
||||
}
|
||||
};
|
||||
|
||||
/* ---- Define static inline implementations of core API functions ---- */
|
||||
|
||||
#include "nvtxImplCore.h"
|
||||
|
||||
/* ---- Define implementations of export table functions ---- */
|
||||
|
||||
/* Look up the function-pointer table for a callback module.
 *
 * module    - which module's table to fetch (one of the NVTX_CB_MODULE_* values).
 * out_table - if non-null, receives the selected table.
 * out_size  - if non-null, receives the table's element count minus one.
 *             NOTE(review): the "- 1" presumably discounts one of the 0
 *             sentinel entries in the table initializers -- confirm.
 *
 * Returns 1 when the module is recognized.  Returns 0 for any other module
 * value, in which case neither output is written. */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
    NvtxCallbackModule module,
    NvtxFunctionTable* out_table,
    unsigned int* out_size)
{
    NvtxFunctionTable selected = (NvtxFunctionTable)0;
    unsigned int tableBytes = 0;

    switch (module)
    {
        case NVTX_CB_MODULE_CORE:
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE;
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE);
            break;

        case NVTX_CB_MODULE_CUDA:
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA;
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA);
            break;

        case NVTX_CB_MODULE_OPENCL:
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL;
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL);
            break;

        case NVTX_CB_MODULE_CUDART:
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART;
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART);
            break;

        case NVTX_CB_MODULE_CORE2:
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2;
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2);
            break;

        case NVTX_CB_MODULE_SYNC:
            selected   = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC;
            tableBytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC);
            break;

        default:
            /* Unknown module: report failure without touching the outputs. */
            return 0;
    }

    if (out_size != 0)
    {
        *out_size = (tableBytes / (unsigned int)sizeof(NvtxFunctionPointer*)) - 1;
    }

    if (out_table != 0)
    {
        *out_table = selected;
    }

    return 1;
}
|
||||
|
||||
/* Return the address of the export table identified by exportTableId.
 * Recognized ids are NVTX_ETID_CALLBACKS and NVTX_ETID_VERSIONINFO; any
 * other id yields a null pointer. */
NVTX_LINKONCE_DEFINE_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(uint32_t exportTableId)
{
    const void* exportTable = 0;

    switch (exportTableId)
    {
        case NVTX_ETID_CALLBACKS:
            exportTable = &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblCallbacks;
            break;

        case NVTX_ETID_VERSIONINFO:
            exportTable = &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblVersionInfo;
            break;

        default:
            /* Unrecognized id: leave the result null. */
            break;
    }

    return exportTable;
}
|
||||
|
||||
/* Export-table hook that takes an NVTX version number.  This implementation
 * deliberately does nothing with it: the hook is reserved for custom
 * implementations to resolve problems with tools. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(uint32_t version)
{
    /* Reference the parameter so it is not reported as unused. */
    (void)version;
}
|
||||
|
||||
/* ---- Define implementations of init versions of all API functions ---- */
|
||||
|
||||
#include "nvtxInitDefs.h"
|
||||
|
||||
/* ---- Define implementations of initialization functions ---- */
|
||||
|
||||
#include "nvtxInit.h"
|
||||
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC visibility pop
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
@@ -0,0 +1,307 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
/* Forward to the nvtxMarkEx implementation stored in the NVTX globals, if
 * one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxMarkEx(const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxMarkEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr;
    if (impl)
    {
        impl(eventAttrib);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxMarkA implementation stored in the NVTX globals, if
 * one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxMarkA(const char* message)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxMarkA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr;
    if (impl)
    {
        impl(message);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxMarkW implementation stored in the NVTX globals, if
 * one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxMarkW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxMarkW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr;
    if (impl)
    {
        impl(message);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxRangeStartEx implementation stored in the NVTX globals
 * and return its result.  Returns (nvtxRangeId_t)0 when no implementation
 * is set or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartEx(const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxRangeStartEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr;
    if (impl)
    {
        return impl(eventAttrib);
    }
#endif /* NVTX_DISABLE */
    return (nvtxRangeId_t)0;
}
|
||||
|
||||
/* Forward to the nvtxRangeStartA implementation stored in the NVTX globals
 * and return its result.  Returns (nvtxRangeId_t)0 when no implementation
 * is set or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartA(const char* message)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxRangeStartA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr;
    if (impl)
    {
        return impl(message);
    }
#endif /* NVTX_DISABLE */
    return (nvtxRangeId_t)0;
}
|
||||
|
||||
/* Forward to the nvtxRangeStartW implementation stored in the NVTX globals
 * and return its result.  Returns (nvtxRangeId_t)0 when no implementation
 * is set or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxRangeStartW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr;
    if (impl)
    {
        return impl(message);
    }
#endif /* NVTX_DISABLE */
    return (nvtxRangeId_t)0;
}
|
||||
|
||||
/* Forward to the nvtxRangeEnd implementation stored in the NVTX globals, if
 * one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxRangeEnd(nvtxRangeId_t id)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxRangeEnd_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr;
    if (impl)
    {
        impl(id);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxRangePushEx implementation stored in the NVTX globals
 * and return its result.  Returns NVTX_NO_PUSH_POP_TRACKING when no
 * implementation is set or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxRangePushEx(const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxRangePushEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr;
    if (impl)
    {
        return impl(eventAttrib);
    }
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
|
||||
|
||||
/* Forward to the nvtxRangePushA implementation stored in the NVTX globals
 * and return its result.  Returns NVTX_NO_PUSH_POP_TRACKING when no
 * implementation is set or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxRangePushA(const char* message)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxRangePushA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr;
    if (impl)
    {
        return impl(message);
    }
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
|
||||
|
||||
/* Forward to the nvtxRangePushW implementation stored in the NVTX globals
 * and return its result.  Returns NVTX_NO_PUSH_POP_TRACKING when no
 * implementation is set or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxRangePushW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxRangePushW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr;
    if (impl)
    {
        return impl(message);
    }
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
|
||||
|
||||
/* Forward to the nvtxRangePop implementation stored in the NVTX globals and
 * return its result.  Returns NVTX_NO_PUSH_POP_TRACKING when no
 * implementation is set or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxRangePop(void)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxRangePop_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr;
    if (impl)
    {
        return impl();
    }
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
|
||||
|
||||
/* Forward to the nvtxNameCategoryA implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCategoryA(uint32_t category, const char* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxNameCategoryA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr;
    if (impl)
    {
        impl(category, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameCategoryW implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxNameCategoryW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr;
    if (impl)
    {
        impl(category, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameOsThreadA implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadA(uint32_t threadId, const char* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxNameOsThreadA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr;
    if (impl)
    {
        impl(threadId, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameOsThreadW implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadW(uint32_t threadId, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxNameOsThreadW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr;
    if (impl)
    {
        impl(threadId, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxDomainMarkEx implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxDomainMarkEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr;
    if (impl)
    {
        impl(domain, eventAttrib);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxDomainRangeStartEx implementation stored in the NVTX
 * globals and return its result.  Returns (nvtxRangeId_t)0 when no
 * implementation is set or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxDomainRangeStartEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxDomainRangeStartEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr;
    if (impl)
    {
        return impl(domain, eventAttrib);
    }
#endif /* NVTX_DISABLE */
    return (nvtxRangeId_t)0;
}
|
||||
|
||||
/* Forward to the nvtxDomainRangeEnd implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxDomainRangeEnd_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr;
    if (impl)
    {
        impl(domain, id);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxDomainRangePushEx implementation stored in the NVTX
 * globals and return its result.  Returns NVTX_NO_PUSH_POP_TRACKING when no
 * implementation is set or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxDomainRangePushEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxDomainRangePushEx_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr;
    if (impl)
    {
        return impl(domain, eventAttrib);
    }
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
|
||||
|
||||
/* Forward to the nvtxDomainRangePop implementation stored in the NVTX
 * globals and return its result.  Returns NVTX_NO_PUSH_POP_TRACKING when no
 * implementation is set or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC int NVTX_API nvtxDomainRangePop(nvtxDomainHandle_t domain)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxDomainRangePop_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr;
    if (impl)
    {
        return impl(domain);
    }
#endif /* NVTX_DISABLE */
    return (int)NVTX_NO_PUSH_POP_TRACKING;
}
|
||||
|
||||
/* Forward to the nvtxDomainResourceCreate implementation stored in the NVTX
 * globals and return its result.  Returns (nvtxResourceHandle_t)0 when no
 * implementation is set or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxResourceHandle_t NVTX_API nvtxDomainResourceCreate(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxDomainResourceCreate_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr;
    if (impl)
    {
        return impl(domain, attribs);
    }
#endif /* NVTX_DISABLE */
    return (nvtxResourceHandle_t)0;
}
|
||||
|
||||
/* Forward to the nvtxDomainResourceDestroy implementation stored in the
 * NVTX globals, if one is set.  The body is empty when NVTX_DISABLE is
 * defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainResourceDestroy(nvtxResourceHandle_t resource)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxDomainResourceDestroy_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr;
    if (impl)
    {
        impl(resource);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxDomainNameCategoryA implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category, const char* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxDomainNameCategoryA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr;
    if (impl)
    {
        impl(domain, category, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxDomainNameCategoryW implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxDomainNameCategoryW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr;
    if (impl)
    {
        impl(domain, category, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxDomainRegisterStringA implementation stored in the
 * NVTX globals and return its result.  Returns (nvtxStringHandle_t)0 when
 * no implementation is set or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringA(nvtxDomainHandle_t domain, const char* string)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxDomainRegisterStringA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr;
    if (impl)
    {
        return impl(domain, string);
    }
#endif /* NVTX_DISABLE */
    return (nvtxStringHandle_t)0;
}
|
||||
|
||||
/* Forward to the nvtxDomainRegisterStringW implementation stored in the
 * NVTX globals and return its result.  Returns (nvtxStringHandle_t)0 when
 * no implementation is set or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringW(nvtxDomainHandle_t domain, const wchar_t* string)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxDomainRegisterStringW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr;
    if (impl)
    {
        return impl(domain, string);
    }
#endif /* NVTX_DISABLE */
    return (nvtxStringHandle_t)0;
}
|
||||
|
||||
/* Forward to the nvtxDomainCreateA implementation stored in the NVTX
 * globals and return its result.  Returns (nvtxDomainHandle_t)0 when no
 * implementation is set or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateA(const char* message)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxDomainCreateA_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr;
    if (impl)
    {
        return impl(message);
    }
#endif /* NVTX_DISABLE */
    return (nvtxDomainHandle_t)0;
}
|
||||
|
||||
/* Forward to the nvtxDomainCreateW implementation stored in the NVTX
 * globals and return its result.  Returns (nvtxDomainHandle_t)0 when no
 * implementation is set or when NVTX_DISABLE is defined. */
NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateW(const wchar_t* message)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxDomainCreateW_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr;
    if (impl)
    {
        return impl(message);
    }
#endif /* NVTX_DISABLE */
    return (nvtxDomainHandle_t)0;
}
|
||||
|
||||
/* Forward to the nvtxDomainDestroy implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxDomainDestroy_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr;
    if (impl)
    {
        impl(domain);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxInitialize implementation stored in the NVTX globals,
 * if one is set, passing `reserved` through unchanged.  The body is empty
 * when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved)
{
#ifndef NVTX_DISABLE
    /* Read the slot once so the null test and the call use the same value. */
    nvtxInitialize_impl_fntype impl = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr;
    if (impl)
    {
        impl(reserved);
    }
#endif /* NVTX_DISABLE */
}
|
||||
@@ -0,0 +1,81 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_IMPL_GUARD_CUDART
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExtCudaRt.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* Typed function-pointer signatures for the CUDA runtime naming entry points.
 * The wrappers below cast the matching nvtxGlobals *_impl_fnptr slot to these
 * types before calling it.  "A" variants take a const char* name, "W"
 * variants a const wchar_t* name. */
typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name);
|
||||
|
||||
/* Forward to the nvtxNameCudaDeviceA implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameCudaDeviceA_impl_fntype impl = (nvtxNameCudaDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr;
    if (impl)
    {
        impl(device, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameCudaDeviceW implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameCudaDeviceW_impl_fntype impl = (nvtxNameCudaDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr;
    if (impl)
    {
        impl(device, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameCudaStreamA implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameCudaStreamA_impl_fntype impl = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
    if (impl)
    {
        impl(stream, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameCudaStreamW implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameCudaStreamW_impl_fntype impl = (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
    if (impl)
    {
        impl(stream, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameCudaEventA implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameCudaEventA_impl_fntype impl = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
    if (impl)
    {
        impl(event, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameCudaEventW implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameCudaEventW_impl_fntype impl = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
    if (impl)
    {
        impl(event, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
|
||||
@@ -0,0 +1,102 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_IMPL_GUARD_CUDA
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* Typed function-pointer signatures for the CUDA driver naming entry points.
 * The wrappers below cast the matching nvtxGlobals *_impl_fnptr slot to these
 * types before calling it.  "A" variants take a const char* name, "W"
 * variants a const wchar_t* name. */
typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name);
|
||||
|
||||
/* Forward to the nvtxNameCuDeviceA implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameCuDeviceA_impl_fntype impl = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
    if (impl)
    {
        impl(device, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameCuDeviceW implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameCuDeviceW_impl_fntype impl = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
    if (impl)
    {
        impl(device, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameCuContextA implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameCuContextA_impl_fntype impl = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
    if (impl)
    {
        impl(context, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameCuContextW implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameCuContextW_impl_fntype impl = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
    if (impl)
    {
        impl(context, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameCuStreamA implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameCuStreamA_impl_fntype impl = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
    if (impl)
    {
        impl(stream, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameCuStreamW implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameCuStreamW_impl_fntype impl = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
    if (impl)
    {
        impl(stream, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameCuEventA implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameCuEventA_impl_fntype impl = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
    if (impl)
    {
        impl(event, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameCuEventW implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameCuEventW_impl_fntype impl = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
    if (impl)
    {
        impl(event, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
|
||||
@@ -0,0 +1,161 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_IMPL_GUARD_OPENCL
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* Typed function-pointer signatures for the OpenCL naming entry points.
 * The wrappers below cast the matching nvtxGlobals *_impl_fnptr slot to these
 * types before calling it.  "A" variants take a const char* name, "W"
 * variants a const wchar_t* name. */
typedef void (NVTX_API * nvtxNameClDeviceA_impl_fntype)(cl_device_id device, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameClDeviceW_impl_fntype)(cl_device_id device, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameClContextA_impl_fntype)(cl_context context, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameClContextW_impl_fntype)(cl_context context, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameClCommandQueueA_impl_fntype)(cl_command_queue command_queue, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameClCommandQueueW_impl_fntype)(cl_command_queue command_queue, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameClMemObjectA_impl_fntype)(cl_mem memobj, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameClMemObjectW_impl_fntype)(cl_mem memobj, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameClSamplerA_impl_fntype)(cl_sampler sampler, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameClSamplerW_impl_fntype)(cl_sampler sampler, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameClProgramA_impl_fntype)(cl_program program, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameClProgramW_impl_fntype)(cl_program program, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameClEventA_impl_fntype)(cl_event evnt, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameClEventW_impl_fntype)(cl_event evnt, const wchar_t* name);
|
||||
|
||||
/* Forward to the nvtxNameClDeviceA implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameClDeviceA_impl_fntype impl = (nvtxNameClDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr;
    if (impl)
    {
        impl(device, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameClDeviceW implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameClDeviceW_impl_fntype impl = (nvtxNameClDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr;
    if (impl)
    {
        impl(device, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameClContextA implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameClContextA_impl_fntype impl = (nvtxNameClContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr;
    if (impl)
    {
        impl(context, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameClContextW implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameClContextW_impl_fntype impl = (nvtxNameClContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr;
    if (impl)
    {
        impl(context, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameClCommandQueueA implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameClCommandQueueA_impl_fntype impl = (nvtxNameClCommandQueueA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr;
    if (impl)
    {
        impl(command_queue, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameClCommandQueueW implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameClCommandQueueW_impl_fntype impl = (nvtxNameClCommandQueueW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr;
    if (impl)
    {
        impl(command_queue, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameClMemObjectA implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameClMemObjectA_impl_fntype impl = (nvtxNameClMemObjectA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr;
    if (impl)
    {
        impl(memobj, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forward to the nvtxNameClMemObjectW implementation stored in the NVTX
 * globals, if one is set.  The body is empty when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    /* Read the slot once, cast to its typed signature, then test and call. */
    nvtxNameClMemObjectW_impl_fntype impl = (nvtxNameClMemObjectW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr;
    if (impl)
    {
        impl(memobj, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forwards (sampler, name) to the nvtxNameClSamplerA implementation pointer
 * held in the NVTX globals. A null pointer makes the call a no-op, and the
 * whole body is compiled out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name)
{
#ifndef NVTX_DISABLE
    const nvtxNameClSamplerA_impl_fntype impl =
        (nvtxNameClSamplerA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr;
    if (impl)
    {
        impl(sampler, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Wide-character counterpart of nvtxNameClSamplerA: forwards (sampler, name)
 * to the nvtxNameClSamplerW implementation pointer held in the NVTX globals.
 * A null pointer makes the call a no-op; NVTX_DISABLE compiles the body out. */
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    const nvtxNameClSamplerW_impl_fntype impl =
        (nvtxNameClSamplerW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr;
    if (impl)
    {
        impl(sampler, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forwards (program, name) to the nvtxNameClProgramA implementation pointer
 * held in the NVTX globals. A null pointer makes the call a no-op, and the
 * whole body is compiled out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name)
{
#ifndef NVTX_DISABLE
    const nvtxNameClProgramA_impl_fntype impl =
        (nvtxNameClProgramA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr;
    if (impl)
    {
        impl(program, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Wide-character counterpart of nvtxNameClProgramA: forwards (program, name)
 * to the nvtxNameClProgramW implementation pointer held in the NVTX globals.
 * A null pointer makes the call a no-op; NVTX_DISABLE compiles the body out. */
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    const nvtxNameClProgramW_impl_fntype impl =
        (nvtxNameClProgramW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr;
    if (impl)
    {
        impl(program, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forwards (evnt, name) to the nvtxNameClEventA implementation pointer held in
 * the NVTX globals. A null pointer makes the call a no-op, and the whole body
 * is compiled out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name)
{
#ifndef NVTX_DISABLE
    const nvtxNameClEventA_impl_fntype impl =
        (nvtxNameClEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr;
    if (impl)
    {
        impl(evnt, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Wide-character counterpart of nvtxNameClEventA: forwards (evnt, name) to the
 * nvtxNameClEventW implementation pointer held in the NVTX globals. A null
 * pointer makes the call a no-op; NVTX_DISABLE compiles the body out. */
NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name)
{
#ifndef NVTX_DISABLE
    const nvtxNameClEventW_impl_fntype impl =
        (nvtxNameClEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr;
    if (impl)
    {
        impl(evnt, name);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_IMPL_GUARD_SYNC
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
|
||||
typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle);
|
||||
typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle);
|
||||
typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle);
|
||||
typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle);
|
||||
typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle);
|
||||
|
||||
/* Forwards (domain, attribs) to the nvtxDomainSyncUserCreate implementation
 * pointer held in the NVTX globals and returns whatever it returns.
 * Returns a zero nvtxSyncUser_t when that pointer is null or when NVTX_DISABLE
 * is defined. */
NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs)
{
#ifndef NVTX_DISABLE
    const nvtxDomainSyncUserCreate_impl_fntype impl =
        (nvtxDomainSyncUserCreate_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
    if (impl)
    {
        return impl(domain, attribs);
    }
#endif /* NVTX_DISABLE */
    /* Reached when no implementation is registered or NVTX is disabled. */
    return (nvtxSyncUser_t)0;
}
|
||||
|
||||
/* Forwards handle to the nvtxDomainSyncUserDestroy implementation pointer held
 * in the NVTX globals. A null pointer makes the call a no-op, and the whole
 * body is compiled out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    const nvtxDomainSyncUserDestroy_impl_fntype impl =
        (nvtxDomainSyncUserDestroy_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
    if (impl)
    {
        impl(handle);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forwards handle to the nvtxDomainSyncUserAcquireStart implementation pointer
 * held in the NVTX globals. A null pointer makes the call a no-op, and the
 * whole body is compiled out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    const nvtxDomainSyncUserAcquireStart_impl_fntype impl =
        (nvtxDomainSyncUserAcquireStart_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;
    if (impl)
    {
        impl(handle);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forwards handle to the nvtxDomainSyncUserAcquireFailed implementation pointer
 * held in the NVTX globals. A null pointer makes the call a no-op, and the
 * whole body is compiled out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    const nvtxDomainSyncUserAcquireFailed_impl_fntype impl =
        (nvtxDomainSyncUserAcquireFailed_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;
    if (impl)
    {
        impl(handle);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forwards handle to the nvtxDomainSyncUserAcquireSuccess implementation
 * pointer held in the NVTX globals. A null pointer makes the call a no-op, and
 * the whole body is compiled out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    const nvtxDomainSyncUserAcquireSuccess_impl_fntype impl =
        (nvtxDomainSyncUserAcquireSuccess_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
    if (impl)
    {
        impl(handle);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
/* Forwards handle to the nvtxDomainSyncUserReleasing implementation pointer
 * held in the NVTX globals. A null pointer makes the call a no-op, and the
 * whole body is compiled out when NVTX_DISABLE is defined. */
NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle)
{
#ifndef NVTX_DISABLE
    const nvtxDomainSyncUserReleasing_impl_fntype impl =
        (nvtxDomainSyncUserReleasing_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;
    if (impl)
    {
        impl(handle);
    }
#endif /* NVTX_DISABLE */
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
@@ -0,0 +1,312 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_IMPL_GUARD
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
/* ---- Platform-independent helper definitions and functions ---- */
|
||||
|
||||
/* Prefer macros over inline functions to reduce symbol resolution at link time */
|
||||
|
||||
/* Per-platform building blocks used by the NVTX initialization code:
 *   NVTX_PATHCHAR / NVTX_STR / NVTX_GETENV  - path character type, matching
 *                                             string literal form, env lookup
 *   NVTX_BUFSIZE                            - size of a path buffer
 *   NVTX_DLLHANDLE / NVTX_DLLOPEN /
 *   NVTX_DLLFUNC / NVTX_DLLCLOSE            - dynamic library handling
 *   NVTX_YIELD / NVTX_MEMBAR                - yield the thread / memory barrier
 *   NVTX_ATOMIC_WRITE_32(address, value)    - atomically store value
 *   NVTX_ATOMIC_CAS_32(old, address, exchange, comparand)
 *       - if *address == comparand, store exchange; in either case assign the
 *         value *address held before the operation to 'old'.
 *
 * On the GCC branch the two atomic macros expand to more than one statement,
 * so they must only be used as standalone statements inside a braced block. */
#if defined(_WIN32)
#define NVTX_PATHCHAR   wchar_t
#define NVTX_STR(x)     L##x
#define NVTX_GETENV     _wgetenv
#define NVTX_BUFSIZE    MAX_PATH
#define NVTX_DLLHANDLE  HMODULE
#define NVTX_DLLOPEN(x) LoadLibraryW(x)
#define NVTX_DLLFUNC    GetProcAddress
#define NVTX_DLLCLOSE   FreeLibrary
#define NVTX_YIELD()    SwitchToThread()
#define NVTX_MEMBAR()   MemoryBarrier()
#define NVTX_ATOMIC_WRITE_32(address, value) InterlockedExchange((volatile LONG*)address, value)
/* InterlockedCompareExchange takes (destination, exchange, comparand). */
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) old = InterlockedCompareExchange((volatile LONG*)address, exchange, comparand)
#elif defined(__GNUC__)
#define NVTX_PATHCHAR   char
#define NVTX_STR(x)     x
#define NVTX_GETENV     getenv
#define NVTX_BUFSIZE    PATH_MAX
#define NVTX_DLLHANDLE  void*
#define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
#define NVTX_DLLFUNC    dlsym
#define NVTX_DLLCLOSE   dlclose
#define NVTX_YIELD()    sched_yield()
#define NVTX_MEMBAR()   __sync_synchronize()
/* Ensure full memory barrier for atomics, to match Windows functions */
#define NVTX_ATOMIC_WRITE_32(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value)
/* __sync_val_compare_and_swap takes (ptr, oldval, newval): the expected value
 * comes BEFORE the replacement, the reverse of InterlockedCompareExchange.
 * The comparand must therefore be passed second and the exchange value third;
 * otherwise the swap never happens when *address equals the comparand. */
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, comparand, exchange)
#else
#error The library does not support your configuration!
#endif
|
||||
|
||||
/* Define this to 1 for platforms where pre-injected libraries can be discovered. */
|
||||
#if defined(_WIN32)
|
||||
/* TODO */
|
||||
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
|
||||
#else
|
||||
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
|
||||
#endif
|
||||
|
||||
/* Define this to 1 for platforms that support environment variables */
|
||||
/* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */
|
||||
/* Try: #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */
|
||||
#define NVTX_SUPPORT_ENV_VARS 1
|
||||
|
||||
/* Define this to 1 for platforms that support dynamic/shared libraries */
|
||||
#define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1
|
||||
|
||||
/* Injection libraries implementing InitializeInjectionNvtx2 may be statically linked,
|
||||
* and this will override any dynamic injection. Useful for platforms where dynamic
|
||||
* injection is not available. Since weak symbols not explicitly marked extern are
|
||||
* guaranteed to be initialized to zero if no definitions are found by the linker, the
|
||||
* dynamic injection process proceeds normally if pfnInitializeInjectionNvtx2 is 0. */
|
||||
#if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__)
|
||||
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1
|
||||
/* To statically inject an NVTX library, define InitializeInjectionNvtx2_fnptr as a normal
 * symbol (not weak) pointing to the implementation of InitializeInjectionNvtx2 (which
 * does not need to be named "InitializeInjectionNvtx2" as is necessary in a dynamic
 * injection library). */
|
||||
__attribute__((weak)) NvtxInitializeInjectionNvtxFunc_t InitializeInjectionNvtx2_fnptr;
|
||||
#else
|
||||
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0
|
||||
#endif
|
||||
|
||||
/* This function tries to find or load an NVTX injection library and get the
|
||||
* address of its InitializeInjection2 function. If such a function pointer
|
||||
* is found, it is called, and passed the address of this NVTX instance's
|
||||
* nvtxGetExportTable function, so the injection can attach to this instance.
|
||||
* If the initialization fails for any reason, any dynamic library loaded will
|
||||
* be freed, and all NVTX implementation functions will be set to no-ops. If
|
||||
* initialization succeeds, NVTX functions not attached to the tool will be set
|
||||
* to no-ops. This is implemented as one function instead of several small
|
||||
* functions to minimize the number of weak symbols the linker must resolve.
|
||||
* Order of search is:
|
||||
* - Pre-injected library exporting InitializeInjectionNvtx2
|
||||
* - Loadable library exporting InitializeInjectionNvtx2
|
||||
* - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64)
|
||||
* - On Android, libNvtxInjection??.so within the package (?? is 32 or 64)
|
||||
* - Statically-linked injection library defining InitializeInjectionNvtx2_fnptr
|
||||
*/
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void);
|
||||
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void)
|
||||
{
|
||||
const char* const initFuncName = "InitializeInjectionNvtx2";
|
||||
NvtxInitializeInjectionNvtxFunc_t init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)0;
|
||||
NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0;
|
||||
int entryPointStatus = 0;
|
||||
|
||||
#if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY
|
||||
/* Use POSIX global symbol chain to query for init function from any module */
|
||||
init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(0, initFuncName);
|
||||
#endif
|
||||
|
||||
#if NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY
|
||||
/* Try discovering dynamic injection library to load */
|
||||
if (!init_fnptr)
|
||||
{
|
||||
#if NVTX_SUPPORT_ENV_VARS
|
||||
/* If env var NVTX_INJECTION64_PATH is set, it should contain the path
|
||||
* to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */
|
||||
const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4)
|
||||
? NVTX_STR("NVTX_INJECTION32_PATH")
|
||||
: NVTX_STR("NVTX_INJECTION64_PATH");
|
||||
#endif /* NVTX_SUPPORT_ENV_VARS */
|
||||
NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE];
|
||||
const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0;
|
||||
|
||||
/* Refer to this variable explicitly in case all references to it are #if'ed out */
|
||||
(void)injectionLibraryPathBuf;
|
||||
|
||||
#if NVTX_SUPPORT_ENV_VARS
|
||||
/* Disable the warning for getenv & _wgetenv -- this usage is safe because
|
||||
* these functions are not called again before using the returned value. */
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning( push )
|
||||
#pragma warning( disable : 4996 )
|
||||
#endif
|
||||
injectionLibraryPath = NVTX_GETENV(nvtxEnvVarName);
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning( pop )
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__ANDROID__)
|
||||
if (!injectionLibraryPath)
|
||||
{
|
||||
const char *bits = (sizeof(void*) == 4) ? "32" : "64";
|
||||
char cmdlineBuf[32];
|
||||
char pkgName[PATH_MAX];
|
||||
int count;
|
||||
int pid;
|
||||
FILE *fp;
|
||||
size_t bytesRead;
|
||||
size_t pos;
|
||||
|
||||
pid = (int)getpid();
|
||||
count = snprintf(cmdlineBuf, sizeof(cmdlineBuf), "/proc/%d/cmdline", pid);
|
||||
if (count <= 0 || count >= (int)sizeof(cmdlineBuf))
|
||||
{
|
||||
NVTX_ERR("Path buffer too small for: /proc/%d/cmdline\n", pid);
|
||||
return NVTX_ERR_INIT_ACCESS_LIBRARY;
|
||||
}
|
||||
|
||||
fp = fopen(cmdlineBuf, "r");
|
||||
if (!fp)
|
||||
{
|
||||
NVTX_ERR("File couldn't be opened: %s\n", cmdlineBuf);
|
||||
return NVTX_ERR_INIT_ACCESS_LIBRARY;
|
||||
}
|
||||
|
||||
bytesRead = fread(pkgName, 1, sizeof(pkgName) - 1, fp);
|
||||
fclose(fp);
|
||||
if (bytesRead == 0)
|
||||
{
|
||||
NVTX_ERR("Package name couldn't be read from file: %s\n", cmdlineBuf);
|
||||
return NVTX_ERR_INIT_ACCESS_LIBRARY;
|
||||
}
|
||||
|
||||
pkgName[bytesRead] = 0;
|
||||
|
||||
/* String can contain colon as a process separator. In this case the package name is before the colon. */
|
||||
pos = 0;
|
||||
while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0')
|
||||
{
|
||||
++pos;
|
||||
}
|
||||
pkgName[pos] = 0;
|
||||
|
||||
count = snprintf(injectionLibraryPathBuf, NVTX_BUFSIZE, "/data/data/%s/files/libNvtxInjection%s.so", pkgName, bits);
|
||||
if (count <= 0 || count >= NVTX_BUFSIZE)
|
||||
{
|
||||
NVTX_ERR("Path buffer too small for: /data/data/%s/files/libNvtxInjection%s.so\n", pkgName, bits);
|
||||
return NVTX_ERR_INIT_ACCESS_LIBRARY;
|
||||
}
|
||||
|
||||
/* On Android, verify path is accessible due to aggressive file access restrictions. */
|
||||
/* For dlopen, if the filename contains a leading slash, then it is interpreted as a */
|
||||
/* relative or absolute pathname; otherwise it will follow the rules in ld.so. */
|
||||
if (injectionLibraryPathBuf[0] == '/')
|
||||
{
|
||||
#if (__ANDROID_API__ < 21)
|
||||
int access_err = access(injectionLibraryPathBuf, F_OK | R_OK);
|
||||
#else
|
||||
int access_err = faccessat(AT_FDCWD, injectionLibraryPathBuf, F_OK | R_OK, 0);
|
||||
#endif
|
||||
if (access_err != 0)
|
||||
{
|
||||
NVTX_ERR("Injection library path wasn't accessible [code=%s] [path=%s]\n", strerror(errno), injectionLibraryPathBuf);
|
||||
return NVTX_ERR_INIT_ACCESS_LIBRARY;
|
||||
}
|
||||
}
|
||||
injectionLibraryPath = injectionLibraryPathBuf;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* At this point, injectionLibraryPath is specified if a dynamic
|
||||
* injection library was specified by a tool. */
|
||||
if (injectionLibraryPath)
|
||||
{
|
||||
/* Load the injection library */
|
||||
injectionLibraryHandle = NVTX_DLLOPEN(injectionLibraryPath);
|
||||
if (!injectionLibraryHandle)
|
||||
{
|
||||
NVTX_ERR("Failed to load injection library\n");
|
||||
return NVTX_ERR_INIT_LOAD_LIBRARY;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Attempt to get the injection library's entry-point */
|
||||
init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName);
|
||||
if (!init_fnptr)
|
||||
{
|
||||
NVTX_DLLCLOSE(injectionLibraryHandle);
|
||||
NVTX_ERR("Failed to get address of function InitializeInjectionNvtx2 from injection library\n");
|
||||
return NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY
|
||||
if (!init_fnptr)
|
||||
{
|
||||
/* Check weakly-defined function pointer. A statically-linked injection can define this as
|
||||
* a normal symbol and it will take precedence over a dynamic injection. */
|
||||
if (InitializeInjectionNvtx2_fnptr)
|
||||
{
|
||||
init_fnptr = InitializeInjectionNvtx2_fnptr;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* At this point, if init_fnptr is not set, then no tool has specified
|
||||
* an NVTX injection library -- return non-success result so all NVTX
|
||||
* API functions will be set to no-ops. */
|
||||
if (!init_fnptr)
|
||||
{
|
||||
return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE;
|
||||
}
|
||||
|
||||
/* Invoke injection library's initialization function. If it returns
|
||||
* 0 (failure) and a dynamic injection was loaded, unload it. */
|
||||
entryPointStatus = init_fnptr(NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable));
|
||||
if (entryPointStatus == 0)
|
||||
{
|
||||
NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n");
|
||||
if (injectionLibraryHandle)
|
||||
{
|
||||
NVTX_DLLCLOSE(injectionLibraryHandle);
|
||||
}
|
||||
return NVTX_ERR_INIT_FAILED_LIBRARY_ENTRY_POINT;
|
||||
}
|
||||
|
||||
return NVTX_SUCCESS;
|
||||
}
|
||||
|
||||
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void)
|
||||
{
|
||||
unsigned int old;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState == NVTX_INIT_STATE_COMPLETE)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
NVTX_ATOMIC_CAS_32(
|
||||
old,
|
||||
&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState,
|
||||
NVTX_INIT_STATE_STARTED,
|
||||
NVTX_INIT_STATE_FRESH);
|
||||
if (old == NVTX_INIT_STATE_FRESH)
|
||||
{
|
||||
int result;
|
||||
int forceAllToNoops;
|
||||
|
||||
/* Load & initialize injection library -- it will assign the function pointers */
|
||||
result = NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)();
|
||||
|
||||
/* Set all pointers not assigned by the injection to null */
|
||||
forceAllToNoops = result != NVTX_SUCCESS; /* Set all to null if injection init failed */
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(forceAllToNoops);
|
||||
|
||||
/* Signal that initialization has finished, so now the assigned function pointers will be used */
|
||||
NVTX_ATOMIC_WRITE_32(
|
||||
&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState,
|
||||
NVTX_INIT_STATE_COMPLETE);
|
||||
}
|
||||
else /* Spin-wait until initialization has finished */
|
||||
{
|
||||
NVTX_MEMBAR();
|
||||
while (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState != NVTX_INIT_STATE_COMPLETE)
|
||||
{
|
||||
NVTX_YIELD();
|
||||
NVTX_MEMBAR();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,81 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
#ifndef NVTX_IMPL_GUARD
|
||||
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
|
||||
#endif
|
||||
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name);
|
||||
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name);
|
||||
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name);
|
||||
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name);
|
||||
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxResourceHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved);
|
||||
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION nvtxSyncUser_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle);
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle);
|
||||
@@ -0,0 +1,573 @@
|
||||
/*
|
||||
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*/
|
||||
|
||||
/* Stop the build with an explanatory message unless NVTX_IMPL_GUARD is
 * defined at the point this file is included. */
#if !defined(NVTX_IMPL_GUARD)
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
#endif
|
||||
|
||||
/* Init-time wrapper for nvtxMarkEx: runs the versioned nvtxInitOnce routine,
 * then forwards eventAttrib unchanged to nvtxMarkEx. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxMarkEx(eventAttrib);
}
|
||||
|
||||
/* Init-time wrapper for nvtxMarkA: runs the versioned nvtxInitOnce routine,
 * then forwards the char message unchanged to nvtxMarkA. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxMarkA(message);
}
|
||||
|
||||
/* Init-time wrapper for nvtxMarkW: runs the versioned nvtxInitOnce routine,
 * then forwards the wchar_t message unchanged to nvtxMarkW. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxMarkW(message);
}
|
||||
|
||||
/* Init-time wrapper for nvtxRangeStartEx: runs the versioned nvtxInitOnce
 * routine, then returns whatever nvtxRangeStartEx(eventAttrib) returns. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib)
{
    nvtxRangeId_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangeStartEx(eventAttrib);
    return result;
}
|
||||
|
||||
/* Init-time wrapper for nvtxRangeStartA: runs the versioned nvtxInitOnce
 * routine, then returns whatever nvtxRangeStartA(message) returns. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message)
{
    nvtxRangeId_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangeStartA(message);
    return result;
}
|
||||
|
||||
/* Init-time wrapper for nvtxRangeStartW: runs the versioned nvtxInitOnce
 * routine, then returns whatever nvtxRangeStartW(message) returns. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message)
{
    nvtxRangeId_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangeStartW(message);
    return result;
}
|
||||
|
||||
/* Init-time wrapper for nvtxRangeEnd: runs the versioned nvtxInitOnce routine,
 * then forwards id unchanged to nvtxRangeEnd. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxRangeEnd(id);
}
|
||||
|
||||
/* Init-time wrapper for nvtxRangePushEx: runs the versioned nvtxInitOnce
 * routine, then returns whatever nvtxRangePushEx(eventAttrib) returns. */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangePushEx(eventAttrib);
    return result;
}
|
||||
|
||||
/* Init-time wrapper for nvtxRangePushA: runs the versioned nvtxInitOnce
 * routine, then returns whatever nvtxRangePushA(message) returns. */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangePushA(message);
    return result;
}
|
||||
|
||||
/* Init-time wrapper for nvtxRangePushW: runs the versioned nvtxInitOnce
 * routine, then returns whatever nvtxRangePushW(message) returns. */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangePushW(message);
    return result;
}
|
||||
|
||||
/* Init-time wrapper for nvtxRangePop: runs the versioned nvtxInitOnce routine,
 * then returns whatever nvtxRangePop() returns. Takes no arguments. */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxRangePop();
    return result;
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameCategoryA: runs the versioned nvtxInitOnce
 * routine, then forwards (category, name) unchanged to nvtxNameCategoryA. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxNameCategoryA(category, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameCategoryW: runs the versioned nvtxInitOnce
 * routine, then forwards (category, name) unchanged to nvtxNameCategoryW. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxNameCategoryW(category, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameOsThreadA: runs the versioned nvtxInitOnce
 * routine, then forwards (threadId, name) unchanged to nvtxNameOsThreadA. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxNameOsThreadA(threadId, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameOsThreadW: runs the versioned nvtxInitOnce
 * routine, then forwards (threadId, name) unchanged to nvtxNameOsThreadW. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxNameOsThreadW(threadId, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainMarkEx: runs the versioned nvtxInitOnce
 * routine, then forwards (domain, eventAttrib) unchanged to nvtxDomainMarkEx. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainMarkEx(domain, eventAttrib);
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainRangeStartEx: runs the versioned
 * nvtxInitOnce routine, then returns whatever
 * nvtxDomainRangeStartEx(domain, eventAttrib) returns. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
    nvtxRangeId_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainRangeStartEx(domain, eventAttrib);
    return result;
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainRangeEnd: runs the versioned nvtxInitOnce
 * routine, then forwards (domain, id) unchanged to nvtxDomainRangeEnd. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainRangeEnd(domain, id);
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainRangePushEx: runs the versioned nvtxInitOnce
 * routine, then returns whatever nvtxDomainRangePushEx(domain, eventAttrib)
 * returns. */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainRangePushEx(domain, eventAttrib);
    return result;
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainRangePop: runs the versioned nvtxInitOnce
 * routine, then returns whatever nvtxDomainRangePop(domain) returns. */
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain)
{
    int result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainRangePop(domain);
    return result;
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainResourceCreate: runs the versioned
 * nvtxInitOnce routine, then returns whatever
 * nvtxDomainResourceCreate(domain, attribs) returns. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxResourceHandle_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs)
{
    nvtxResourceHandle_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainResourceCreate(domain, attribs);
    return result;
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainResourceDestroy: runs the versioned
 * nvtxInitOnce routine, then forwards resource unchanged to
 * nvtxDomainResourceDestroy. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainResourceDestroy(resource);
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainNameCategoryA: runs the versioned
 * nvtxInitOnce routine, then forwards (domain, category, name) unchanged to
 * nvtxDomainNameCategoryA. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainNameCategoryA(domain, category, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainNameCategoryW: runs the versioned
 * nvtxInitOnce routine, then forwards (domain, category, name) unchanged to
 * nvtxDomainNameCategoryW. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainNameCategoryW(domain, category, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainRegisterStringA: runs the versioned
 * nvtxInitOnce routine, then returns whatever
 * nvtxDomainRegisterStringA(domain, string) returns. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string)
{
    nvtxStringHandle_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainRegisterStringA(domain, string);
    return result;
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainRegisterStringW: runs the versioned
 * nvtxInitOnce routine, then returns whatever
 * nvtxDomainRegisterStringW(domain, string) returns. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string)
{
    nvtxStringHandle_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainRegisterStringW(domain, string);
    return result;
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainCreateA: runs the versioned nvtxInitOnce
 * routine, then returns whatever nvtxDomainCreateA(message) returns. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message)
{
    nvtxDomainHandle_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainCreateA(message);
    return result;
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainCreateW: runs the versioned nvtxInitOnce
 * routine, then returns whatever nvtxDomainCreateW(message) returns. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message)
{
    nvtxDomainHandle_t result;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    result = nvtxDomainCreateW(message);
    return result;
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainDestroy: runs the versioned nvtxInitOnce
 * routine, then forwards domain unchanged to nvtxDomainDestroy. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxDomainDestroy(domain);
}
|
||||
|
||||
/* Init-time wrapper for nvtxInitialize: runs the versioned nvtxInitOnce
 * routine, then forwards reserved unchanged to nvtxInitialize. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved)
{
    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    nvtxInitialize(reserved);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameCuDeviceA: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameCuDeviceA_impl_fnptr entry of the versioned
 * nvtxGlobals with (device, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name)
{
    nvtxNameCuDeviceA_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(device, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameCuDeviceW: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameCuDeviceW_impl_fnptr entry of the versioned
 * nvtxGlobals with (device, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name)
{
    nvtxNameCuDeviceW_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(device, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameCuContextA: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameCuContextA_impl_fnptr entry of the versioned
 * nvtxGlobals with (context, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name)
{
    nvtxNameCuContextA_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(context, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameCuContextW: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameCuContextW_impl_fnptr entry of the versioned
 * nvtxGlobals with (context, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name)
{
    nvtxNameCuContextW_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(context, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameCuStreamA: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameCuStreamA_impl_fnptr entry of the versioned
 * nvtxGlobals with (stream, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name)
{
    nvtxNameCuStreamA_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(stream, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameCuStreamW: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameCuStreamW_impl_fnptr entry of the versioned
 * nvtxGlobals with (stream, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name)
{
    nvtxNameCuStreamW_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(stream, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameCuEventA: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameCuEventA_impl_fnptr entry of the versioned
 * nvtxGlobals with (event, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name)
{
    nvtxNameCuEventA_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(event, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameCuEventW: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameCuEventW_impl_fnptr entry of the versioned
 * nvtxGlobals with (event, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name)
{
    nvtxNameCuEventW_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(event, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameCudaDeviceA: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameCudaDeviceA_impl_fnptr entry of the
 * versioned nvtxGlobals with (device, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name)
{
    nvtxNameCudaDeviceA_impl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(device, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameCudaDeviceW: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameCudaDeviceW_impl_fnptr entry of the
 * versioned nvtxGlobals with (device, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name)
{
    nvtxNameCudaDeviceW_impl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(device, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameCudaStreamA: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameCudaStreamA_impl_fnptr entry of the
 * versioned nvtxGlobals with (stream, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name)
{
    nvtxNameCudaStreamA_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(stream, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameCudaStreamW: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameCudaStreamW_impl_fnptr entry of the
 * versioned nvtxGlobals with (stream, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name)
{
    nvtxNameCudaStreamW_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(stream, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameCudaEventA: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameCudaEventA_impl_fnptr entry of the
 * versioned nvtxGlobals with (event, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name)
{
    nvtxNameCudaEventA_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(event, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameCudaEventW: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameCudaEventW_impl_fnptr entry of the
 * versioned nvtxGlobals with (event, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name)
{
    nvtxNameCudaEventW_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(event, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameClDeviceA: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameClDeviceA_impl_fnptr entry of the versioned
 * nvtxGlobals with (device, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name)
{
    nvtxNameClDeviceA_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(device, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameClDeviceW: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameClDeviceW_impl_fnptr entry of the versioned
 * nvtxGlobals with (device, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name)
{
    nvtxNameClDeviceW_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(device, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameClContextA: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameClContextA_impl_fnptr entry of the versioned
 * nvtxGlobals with (context, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name)
{
    nvtxNameClContextA_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(context, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameClContextW: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameClContextW_impl_fnptr entry of the versioned
 * nvtxGlobals with (context, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name)
{
    nvtxNameClContextW_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(context, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameClCommandQueueA: runs the versioned
 * nvtxInitOnce routine, then calls the nvtxNameClCommandQueueA_impl_fnptr
 * entry of the versioned nvtxGlobals with (command_queue, name) if that entry
 * is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name)
{
    nvtxNameClCommandQueueA_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(command_queue, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameClCommandQueueW: runs the versioned
 * nvtxInitOnce routine, then calls the nvtxNameClCommandQueueW_impl_fnptr
 * entry of the versioned nvtxGlobals with (command_queue, name) if that entry
 * is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name)
{
    nvtxNameClCommandQueueW_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(command_queue, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameClMemObjectA: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameClMemObjectA_impl_fnptr entry of the
 * versioned nvtxGlobals with (memobj, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name)
{
    nvtxNameClMemObjectA_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(memobj, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameClMemObjectW: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameClMemObjectW_impl_fnptr entry of the
 * versioned nvtxGlobals with (memobj, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name)
{
    nvtxNameClMemObjectW_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(memobj, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameClSamplerA: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameClSamplerA_impl_fnptr entry of the versioned
 * nvtxGlobals with (sampler, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name)
{
    nvtxNameClSamplerA_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(sampler, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameClSamplerW: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameClSamplerW_impl_fnptr entry of the versioned
 * nvtxGlobals with (sampler, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name)
{
    nvtxNameClSamplerW_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(sampler, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameClProgramA: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameClProgramA_impl_fnptr entry of the versioned
 * nvtxGlobals with (program, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name)
{
    nvtxNameClProgramA_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(program, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameClProgramW: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameClProgramW_impl_fnptr entry of the versioned
 * nvtxGlobals with (program, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name)
{
    nvtxNameClProgramW_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(program, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameClEventA: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameClEventA_impl_fnptr entry of the versioned
 * nvtxGlobals with (evnt, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name)
{
    nvtxNameClEventA_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(evnt, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxNameClEventW: runs the versioned nvtxInitOnce
 * routine, then calls the nvtxNameClEventW_impl_fnptr entry of the versioned
 * nvtxGlobals with (evnt, name) if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name)
{
    nvtxNameClEventW_fakeimpl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(evnt, name);
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainSyncUserCreate: runs the versioned
 * nvtxInitOnce routine, then reads the nvtxDomainSyncUserCreate_impl_fnptr
 * entry of the versioned nvtxGlobals. Returns that entry's result for
 * (domain, attribs) when it is non-NULL, otherwise (nvtxSyncUser_t)0. */
NVTX_LINKONCE_DEFINE_FUNCTION nvtxSyncUser_t NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs)
{
    nvtxDomainSyncUserCreate_impl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr;
    return handler ? handler(domain, attribs) : (nvtxSyncUser_t)0;
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainSyncUserDestroy: runs the versioned
 * nvtxInitOnce routine, then calls the nvtxDomainSyncUserDestroy_impl_fnptr
 * entry of the versioned nvtxGlobals with handle if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle)
{
    nvtxDomainSyncUserDestroy_impl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(handle);
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainSyncUserAcquireStart: runs the versioned
 * nvtxInitOnce routine, then calls the
 * nvtxDomainSyncUserAcquireStart_impl_fnptr entry of the versioned nvtxGlobals
 * with handle if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle)
{
    nvtxDomainSyncUserAcquireStart_impl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(handle);
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainSyncUserAcquireFailed: runs the versioned
 * nvtxInitOnce routine, then calls the
 * nvtxDomainSyncUserAcquireFailed_impl_fnptr entry of the versioned
 * nvtxGlobals with handle if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle)
{
    nvtxDomainSyncUserAcquireFailed_impl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(handle);
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainSyncUserAcquireSuccess: runs the versioned
 * nvtxInitOnce routine, then calls the
 * nvtxDomainSyncUserAcquireSuccess_impl_fnptr entry of the versioned
 * nvtxGlobals with handle if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle)
{
    nvtxDomainSyncUserAcquireSuccess_impl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(handle);
}
|
||||
|
||||
/* Init-time wrapper for nvtxDomainSyncUserReleasing: runs the versioned
 * nvtxInitOnce routine, then calls the nvtxDomainSyncUserReleasing_impl_fnptr
 * entry of the versioned nvtxGlobals with handle if that entry is non-NULL. */
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle)
{
    nvtxDomainSyncUserReleasing_impl_fntype handler;

    NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)();
    /* Read the slot once so the NULL test and the call use the same value. */
    handler = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr;
    if (!handler)
    {
        return;
    }
    handler(handle);
}
|
||||
|
||||
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops);
|
||||
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops)
|
||||
{
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr = NULL;
|
||||
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr = NULL;
|
||||
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr = NULL;
|
||||
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr = NULL;
|
||||
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr = NULL;
|
||||
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr = NULL;
|
||||
if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init) || forceAllToNoops)
|
||||
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr = NULL;
|
||||
}
|
||||
Certains fichiers ne sont pas affichés car ce diff contient trop de modifications. Voir plus
Référencer dans un nouveau ticket
Bloquer un utilisateur