Files
rocm-systems/src/misc/utils.cc
T
Sylvain Jeaugey 68b542363f 2.23.4-1
Add scalable init API
 * Add new ncclCommInitRankScalable to allow for passing multiple
   unique IDs to the init function.
 * Spreads the load onto multiple bootstrap roots, allowing for
   constant bootstrap time.
 * Requires multiple ranks to create a unique ID, and the CPU-side
   ID exchange code to call allgather[v] instead of broadcast.

Accelerate init bootstrap operations
 * Reduce the number of calls to allgather.
 * Allow roots to reply early to ranks when information is already
   available.
 * Add an option to use ncclNet instead of sockets to perform
   bootstrap allgather operations.

Add PAT algorithms for Allgather and ReduceScatter
 * Parallel Aggregated Trees, variation of Bruck algorithm.
 * Logarithmic number of network steps for small sizes at scale.
 * Only supports one rank per node at the moment.

Add support for registered buffers for intra-node communication.
 * Allow registered user buffers to be accessed directly intra-node
 * Avoids extra copies in algorithms which permit it, saving
   memory bandwidth and helping with compute overlap.

Add profiler plugin API
 * New plugin API for profiling
 * Supports various levels of profiling, with a hierarchy.

Asynchronous graph allocation
 * Make calls to cudaMalloc and cudaMemcpy during graph allocation
   asynchronous.
 * Significantly speeds up graph capture.

Use fatal IB asynchronous events to stop network operation
 * Avoids many other error messages
 * Only fatal errors are affected; potentially transient errors
   (e.g. port down) do not cause an immediate stop.

Set P2P level to PXB on AMD CPUs when using more than 2 GPUs per node
 * P2P would cause a significant performance degradation when using
   many GPUs, and therefore many interleaved data flows.
 * Disable P2P through the CPU when we have 3+ GPUs per node; keep it
   enabled when we only have 2 GPUs.

Improve the init logs to report the real NCCL function.
 * Make the log report ncclCommInitRank or ncclCommSplit, rather than
   the generic ncclCommInitRankFunc.

Add a parameter to set the location of the user configuration file.
 * Add NCCL_CONF_FILE environment variable to set where the user's
   configuration file resides.

Increase default IB timeout
 * Increase IB timeout value from 18 to 20.
 * Should help avoid fatal errors on large RoCE systems.

Add new check for nvidia peermem
 * On linux kernels 6.6+, /sys/kernel/mm/memory_peers is no longer
   present; check for /sys/module/nvidia_peermem/version instead.

Fix old performance regression when mixing small and large operations.
 * Improves distribution of work on channels.

Fix crash when NUMA IDs are equal to -1.
 * Can happen when a NIC is a virtual NIC, or when linux doesn't
   know which NUMA node a device is attached to
 * Issue NVIDIA/nccl-tests#233

Fix tree graph search when NCCL_CROSS_NIC is set to 1.
 * Would force NCCL to use the balanced_tree pattern, thereby
   disabling LL128 on platforms with 1 GPU+1 NIC per PCI switch.
 * Would also try to use alternate rings even though it was not
   needed.

Compiler tweaks and fixes
 * PR #1177
 * PR #1228

Fix stack smash
 * PR #1325

Fixes for multi-node NVLink + IB operation

Coverity fixes and comments.
2024-09-16 23:41:17 -07:00

292 líneas
9.4 KiB
C++

/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "utils.h"
#include "core.h"
#include "nvmlwrap.h"
#include <stdlib.h>
// Get current Compute Capability
int ncclCudaCompCap() {
int cudaDev;
if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
int ccMajor, ccMinor;
if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0;
return ccMajor*10+ccMinor;
}
ncclResult_t int64ToBusId(int64_t id, char* busId) {
sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf));
return ncclSuccess;
}
ncclResult_t busIdToInt64(const char* busId, int64_t* id) {
char hexStr[17]; // Longest possible int64 hex string + null terminator.
int hexOffset = 0;
for (int i = 0; hexOffset < sizeof(hexStr) - 1; i++) {
char c = busId[i];
if (c == '.' || c == ':') continue;
if ((c >= '0' && c <= '9') ||
(c >= 'A' && c <= 'F') ||
(c >= 'a' && c <= 'f')) {
hexStr[hexOffset++] = busId[i];
} else break;
}
hexStr[hexOffset] = '\0';
*id = strtol(hexStr, NULL, 16);
return ncclSuccess;
}
// Convert a logical cudaDev index to the NVML device minor number
ncclResult_t getBusId(int cudaDev, int64_t *busId) {
// On most systems, the PCI bus ID comes back as in the 0000:00:00.0
// format. Still need to allocate proper space in case PCI domain goes
// higher.
char busIdStr[] = "00000000:00:00.0";
CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
NCCLCHECK(busIdToInt64(busIdStr, busId));
return ncclSuccess;
}
ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
if (gethostname(hostname, maxlen) != 0) {
strncpy(hostname, "unknown", maxlen);
return ncclSystemError;
}
int i = 0;
while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen-1)) i++;
hostname[i] = '\0';
return ncclSuccess;
}
static uint64_t hostHashValue = 0;
/* Generate a hash of the unique identifying string for this host
* that will be unique for both bare-metal and container instances
* Equivalent of a hash of;
*
* $(hostname)$(cat /proc/sys/kernel/random/boot_id)
*
* This string can be overridden by using the NCCL_HOSTID env var.
*/
#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
static void getHostHashOnce() {
char hostHash[1024];
const char *hostId;
// Fall back is the full hostname if something fails
(void) getHostName(hostHash, sizeof(hostHash), '\0');
int offset = strlen(hostHash);
if ((hostId = ncclGetEnv("NCCL_HOSTID")) != NULL) {
INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId);
strncpy(hostHash, hostId, sizeof(hostHash)-1);
hostHash[sizeof(hostHash)-1] = '\0';
} else {
FILE *file = fopen(HOSTID_FILE, "r");
if (file != NULL) {
char *p;
if (fscanf(file, "%ms", &p) == 1) {
strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
free(p);
}
fclose(file);
}
}
// Make sure the string is terminated
hostHash[sizeof(hostHash)-1]='\0';
TRACE(NCCL_INIT,"unique hostname '%s'", hostHash);
hostHashValue = getHash(hostHash, strlen(hostHash));
}
uint64_t getHostHash(void) {
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, getHostHashOnce);
return hostHashValue;
}
/* Generate a hash of the unique identifying string for this process
* that will be unique for both bare-metal and container instances
* Equivalent of a hash of;
*
* $$ $(readlink /proc/self/ns/pid)
*/
uint64_t getPidHash(void) {
char pname[1024];
// Start off with our pid ($$)
sprintf(pname, "%ld", (long) getpid());
int plen = strlen(pname);
int len = readlink("/proc/self/ns/pid", pname+plen, sizeof(pname)-1-plen);
if (len < 0) len = 0;
pname[plen+len]='\0';
TRACE(NCCL_INIT,"unique PID '%s'", pname);
return getHash(pname, strlen(pname));
}
int parseStringList(const char* string, struct netIf* ifList, int maxList) {
if (!string) return 0;
const char* ptr = string;
int ifNum = 0;
int ifC = 0;
char c;
do {
c = *ptr;
if (c == ':') {
if (ifC > 0) {
ifList[ifNum].prefix[ifC] = '\0';
ifList[ifNum].port = atoi(ptr+1);
ifNum++; ifC = 0;
}
while (c != ',' && c != '\0') c = *(++ptr);
} else if (c == ',' || c == '\0') {
if (ifC > 0) {
ifList[ifNum].prefix[ifC] = '\0';
ifList[ifNum].port = -1;
ifNum++; ifC = 0;
}
} else {
ifList[ifNum].prefix[ifC] = c;
ifC++;
}
ptr++;
} while (ifNum < maxList && c);
return ifNum;
}
static bool matchIf(const char* string, const char* ref, bool matchExact) {
// Make sure to include '\0' in the exact case
int matchLen = matchExact ? strlen(string) + 1 : strlen(ref);
return strncmp(string, ref, matchLen) == 0;
}
static bool matchPort(const int port1, const int port2) {
if (port1 == -1) return true;
if (port2 == -1) return true;
if (port1 == port2) return true;
return false;
}
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) {
// Make an exception for the case where no user list is defined
if (listSize == 0) return true;
for (int i=0; i<listSize; i++) {
if (matchIf(string, ifList[i].prefix, matchExact)
&& matchPort(port, ifList[i].port)) {
return true;
}
}
return false;
}
__thread struct ncclThreadSignal ncclThreadSignalLocalInstance = ncclThreadSignalStaticInitializer();
void* ncclMemoryStack::allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align) {
// `me->hunks` points to the top of the stack non-empty hunks. Hunks above
// this (reachable via `->above`) are empty.
struct Hunk* top = me->topFrame.hunk;
size_t mallocSize = 0;
// If we have lots of space left in hunk but that wasn't enough then we'll
// allocate the object unhunked.
if (me->topFrame.end - me->topFrame.bumper >= 8<<10)
goto unhunked;
// If we have another hunk (which must be empty) waiting above this one and
// the object fits then use that.
if (top && top->above) {
struct Hunk* top1 = top->above;
uintptr_t uobj = (reinterpret_cast<uintptr_t>(top1) + sizeof(struct Hunk) + align-1) & -uintptr_t(align);
if (uobj + size <= reinterpret_cast<uintptr_t>(top1) + top1->size) {
me->topFrame.hunk = top1;
me->topFrame.bumper = uobj + size;
me->topFrame.end = reinterpret_cast<uintptr_t>(top1) + top1->size;
return reinterpret_cast<void*>(uobj);
}
}
{ // If the next hunk we're going to allocate wouldn't be big enough but the
// Unhunk proxy fits in the current hunk then go allocate as unhunked.
size_t nextSize = (top ? top->size : 0) + (64<<10);
constexpr size_t maxAlign = 64;
if (nextSize < sizeof(struct Hunk) + maxAlign + size) {
uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk));
if (uproxy + sizeof(struct Unhunk) <= me->topFrame.end)
goto unhunked;
}
// At this point we must need another hunk, either to fit the object
// itself or its Unhunk proxy.
mallocSize = nextSize;
INFO(NCCL_ALLOC, "%s:%d memory stack hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
struct Hunk *top1 = (struct Hunk*)malloc(mallocSize);
if (top1 == nullptr) goto malloc_exhausted;
top1->size = nextSize;
top1->above = nullptr;
if (top) top->above = top1;
top = top1;
me->topFrame.hunk = top;
me->topFrame.end = reinterpret_cast<uintptr_t>(top) + nextSize;
me->topFrame.bumper = reinterpret_cast<uintptr_t>(top) + sizeof(struct Hunk);
}
{ // Try to fit object in the new top hunk.
uintptr_t uobj = (me->topFrame.bumper + align-1) & -uintptr_t(align);
if (uobj + size <= me->topFrame.end) {
me->topFrame.bumper = uobj + size;
return reinterpret_cast<void*>(uobj);
}
}
unhunked:
{ // We need to allocate the object out-of-band and put an Unhunk proxy in-band
// to keep track of it.
uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk));
Unhunk* proxy = reinterpret_cast<Unhunk*>(uproxy);
me->topFrame.bumper = uproxy + sizeof(Unhunk);
proxy->next = me->topFrame.unhunks;
me->topFrame.unhunks = proxy;
mallocSize = size;
proxy->obj = malloc(mallocSize);
INFO(NCCL_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
if (proxy->obj == nullptr) goto malloc_exhausted;
return proxy->obj;
}
malloc_exhausted:
WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize);
abort();
}
void ncclMemoryStackDestruct(struct ncclMemoryStack* me) {
// Free unhunks first because both the frames and unhunk proxies lie within the hunks.
struct ncclMemoryStack::Frame* f = &me->topFrame;
while (f != nullptr) {
struct ncclMemoryStack::Unhunk* u = f->unhunks;
while (u != nullptr) {
free(u->obj);
u = u->next;
}
f = f->below;
}
// Free hunks
struct ncclMemoryStack::Hunk* h = me->stub.above;
while (h != nullptr) {
struct ncclMemoryStack::Hunk *h1 = h->above;
free(h);
h = h1;
}
}