Merge pull request #299 from ROCmSoftwarePlatform/develop

Enable target id build

[ROCm/rccl commit: 377b43470b]
Этот коммит содержится в:
Colin Smith
2020-11-10 15:47:42 -07:00
коммит произвёл GitHub
родитель 94437eef28 a7ef699687
Коммит 1349b382cd
52 изменённых файлов: 2949 добавлений и 241 удалений
+2 -3
Просмотреть файл
@@ -5,11 +5,10 @@ def runCompileCommand(platform, project, jobName)
{
project.paths.construct_build_prefix()
String hipclangArgs = jobName.contains('hipclang') ? '--hip-clang' : ''
def command = """#!/usr/bin/env bash
set -x
cd ${project.paths.project_build_prefix}
LD_LIBRARY_PATH=/opt/rocm/hcc/lib ${project.paths.build_command} ${hipclangArgs}
LD_LIBRARY_PATH=/opt/rocm/hcc/lib ${project.paths.build_command}
"""
platform.runCommand(this,command)
@@ -22,7 +21,7 @@ def runTestCommand (platform, project)
def command = """#!/usr/bin/env bash
set -x
cd ${project.paths.project_build_prefix}/build/release/test
NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_output=xml --gtest_color=yes
${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_output=xml --gtest_color=yes
"""
platform.runCommand(this, command)
+3 -3
Просмотреть файл
@@ -55,7 +55,7 @@ ci: {
propertyList = auxiliary.appendPropertyList(propertyList)
def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['4gfx906','4gfx908']])]
def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([sles15sp1:['4gfx906'],centos8:['4gfx908'],centos7:['4gfx906'],ubuntu18:['4gfx906']])]
jobNameList = auxiliary.appendJobNameList(jobNameList)
@@ -80,7 +80,7 @@ ci: {
{
properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
stage(urlJobName) {
runCI([ubuntu16:['rccl906']], urlJobName)
runCI([ubuntu18:['4gfx906']], urlJobName)
}
}
}
}
+53
Просмотреть файл
@@ -0,0 +1,53 @@
# Change Log for RCCL
Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)
## [Unreleased]
### Added
- Experimental support for clique-based kernels (opt in with RCCL_ENABLE_CLIQUE=1)
- Clique-based kernels may offer better performance for smaller input sizes
- Clique-based kernels are currently only enabled for AllReduce under a certain byte limit (controlled via RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT)
### Optimizations
- Performance improvements for Rome-based systems
### Known issues
- Clique-based kernels are currently experimental and have not been fully tested on all topologies. By default, clique-based kernels are disabled if the detected topology is not supported (override with RCCL_FORCE_ENABLE_CLIQUE)
- Clique-based kernels may hang if there are differences between environment variables set across ranks.
- Clique-based kernels may fail if the input / output device pointers are not the base device pointers returned by hipMalloc.
## [RCCL-2.7.8 for ROCm 3.9.0]
### Added
- Adding support for alltoallv RCCL kernel
### Optimizations
- Modifications to topology based on XGMI links
### Known issues
- None
## [RCCL-2.7.6 for ROCm 3.8.0]
### Added
- Support for static library builds
### Known issues
- None
## [RCCL-2.7.6 for ROCm 3.7.0]
### Added
- Updated to RCCL API version of 2.7.6
- Added gather, scatter and all-to-all collectives
## [RCCL-2.7.0 for ROCm 3.6.0]
### Added
- Updated to RCCL API version of 2.6.4
## [RCCL-2.7.0 for ROCm 3.5.0]
### Added
- Compatibility with NCCL 2.6
- Network interface improvements with API v3
### Optimizations
- Fixing issues and built time improvements for hip-clang
- Network topology detection
- Improved CPU type detection
- Infiniband adaptive routing support
### Changed
- Switched to hip-clang as default compiler
### Deprecated
- Deprecated hcc build
+41 -5
Просмотреть файл
@@ -11,7 +11,7 @@ set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")
project(rccl CXX)
set(AMDGPU_TARGETS gfx803;gfx900;gfx906;gfx908 CACHE STRING "List of specific machine types for library to target")
set(AMDGPU_TARGETS gfx803;gfx900;gfx906:xnack-;gfx908:xnack- CACHE STRING "List of specific machine types for library to target")
option(BUILD_TESTS "Build test programs" OFF)
option(INSTALL_DEPENDENCIES "Force install dependencies" OFF)
@@ -126,6 +126,12 @@ set(CC_SOURCES
src/collectives/all_to_all_api.cc
src/collectives/all_to_allv_api.cc
src/channel.cc
src/clique/CliqueManager.cc # RCCL
src/clique/HandleCache.cc # RCCL
src/clique/HandleShm.cc # RCCL
src/clique/Hash.cc # RCCL
src/clique/MsgQueue.cc # RCCL
src/clique/ShmObject.cc # RCCL
src/misc/argcheck.cc
src/misc/nvmlwrap_stub.cc
src/misc/utils.cc
@@ -169,7 +175,7 @@ endforeach()
if("${HIP_COMPILER}" MATCHES "clang")
foreach(target ${AMDGPU_TARGETS})
target_compile_options(rccl PRIVATE --cuda-gpu-arch=${target} PRIVATE -fgpu-rdc PRIVATE -mno-xnack -Xarch_gfx906 -msram-ecc -Xarch_gfx908 -mno-sram-ecc)
target_compile_options(rccl PRIVATE --cuda-gpu-arch=${target} PRIVATE -fgpu-rdc)
endforeach()
target_link_libraries(rccl PRIVATE -fgpu-rdc)
target_include_directories(rccl PRIVATE /opt/rocm/hsa/include)
@@ -223,17 +229,47 @@ rocm_export_targets(NAMESPACE
hip)
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-rocclr (>= 3.5.0)")
set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON)
set(CPACK_RPM_PACKAGE_REQUIRES "hip-rocclr >= 3.5.0")
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt" "/opt/rocm")
find_file (DEBIAN debian_version debconf.conf PATHS /etc)
if(DEBIAN)
# Write copyright file
file(WRITE "${CMAKE_BINARY_DIR}/copyright"
"Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: rccl
Source: https://github.com/ROCmSoftwarePlatform/rccl
Files: *
Copyright: (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
License: See LICENSE.txt for license information\n")
install(FILES "${CMAKE_BINARY_DIR}/copyright" DESTINATION /usr/share/doc/rccl)
# Write changelog file
find_program( date_executable date )
execute_process(COMMAND ${date_executable} -R OUTPUT_VARIABLE TIMESTAMP)
file(WRITE "${CMAKE_BINARY_DIR}/changelog"
"rccl (${VERSION_STRING}-1) unstable; urgency=medium
* Initial release.
-- RCCL Maintainer <rccl-maintainer@amd.com> ${TIMESTAMP}\n")
find_program( gzip_executable gzip )
execute_process(COMMAND bash "-c" "${gzip_executable} -9 -c ${CMAKE_BINARY_DIR}/changelog"
WORKING_DIRECTORY ${CMAKE_BINARY_DIR} OUTPUT_FILE "${CMAKE_BINARY_DIR}/changelog.Debian.gz")
install(FILES "${CMAKE_BINARY_DIR}/changelog.Debian.gz" DESTINATION /usr/share/doc/rccl)
set(CPACK_DEBIAN_PACKAGE_DESCRIPTION "ROCm Communication Collectives Library
Optimized primitives for collective multi-GPU communication")
endif()
rocm_create_package(
NAME
rccl
DESCRIPTION
"Optimized primitives for collective multi-GPU communication"
"ROCm Communication Collectives Library"
MAINTAINER
"<rccl-maintainer@amd.com>"
"RCCL Maintainer <rccl-maintainer@amd.com>"
LDCONFIG)
rocm_install_symlink_subdir(rccl)
+1 -1
Просмотреть файл
@@ -56,7 +56,7 @@ master_doc = 'index'
# General information about the project.
project = u'RCCL'
copyright = u'2015-2018, NVIDIA CORPORATION; Modifications Copyright 2019 Advanced Mirco Devices'
copyright = u'2015-2018, NVIDIA CORPORATION; Modifications Copyright 2019-2020 Advanced Mirco Devices'
author = u'Advanced Mirco Devices'
# The version info for the project you're documenting, acts as replacement for
+33 -3
Просмотреть файл
@@ -12,6 +12,11 @@
#include "socket.h"
#include <unistd.h>
#include <sys/types.h>
// [RCCL]
#include "clique/CliqueManager.h"
#include "clique/CliqueShmNames.h"
#include "clique/Hash.h"
// [/RCCL]
struct bootstrapNetComm {
int fd;
@@ -163,7 +168,14 @@ static ncclResult_t setFilesLimit() {
return ncclSuccess;
}
static void *bootstrapRoot(void* listenComm) {
static void *bootstrapRoot(void* bootstrapRootStruct) { // [RCCL] Modified to include hash argument)
// [RCCL] Unpack bootstrapRootStruct
struct bootstrapRootStruct* rootStruct = (struct bootstrapRootStruct*) bootstrapRootStruct;
void* listenComm = rootStruct->listenComm;
unsigned long hash = rootStruct->hash;
int pid = getpid(); // sharing PID to other ranks for creating shared memory files for CliqueManager
// [/RCCL]
struct extInfo info;
ncclNetHandle_t *rankHandles = NULL;
ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
@@ -205,12 +217,19 @@ static void *bootstrapRoot(void* listenComm) {
} while (c < nranks);
TRACE(NCCL_INIT, "COLLECTED ALL %d HANDLES", nranks);
{ // [RCCL] Initialize message queues / shared memory files
NCCLCHECKGOTO(CliqueManager::BootstrapRootInit(pid, hash), res, out);
} // [/RCCL]
// Send the connect handle for the next rank in the AllGather ring
for (int r=0; r<nranks; ++r) {
int next = (r+1) % nranks;
void *tmpSendComm;
NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot+r, &tmpSendComm), res, out);
NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
{ // [RCCL] Send the root pid for shared file naming
NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, &pid, sizeof(int)), res, out);
} // [/RCCL]
NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
}
TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
@@ -229,7 +248,14 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
void* listenComm;
NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, netHandle, &listenComm));
pthread_t thread;
pthread_create(&thread, NULL, bootstrapRoot, listenComm);
// [RCCL] Use the ncclUniqueId to get a hash for bootstrap
struct bootstrapRootStruct* rootStruct = new bootstrapRootStruct;
rootStruct->hash = djb2Hash(id->internal);
rootStruct->listenComm = listenComm;
pthread_create(&thread, NULL, bootstrapRoot, (void *)rootStruct);
// [/RCCL]
return ncclSuccess;
}
@@ -267,9 +293,10 @@ struct extState {
int rank;
int nranks;
int dev;
int rootPid; // [RCCL] PID of root
};
ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState) {
ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState, int* rootPid) { // [RCCL] Adding rootPid
ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
bool idFromEnv = getenv("NCCL_COMM_ID") != NULL;
struct extState* state;
@@ -314,6 +341,9 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
ncclNetHandle_t extHandleNext;
NCCLCHECK(bootstrapNetAccept(extBstrapListenCommRoot, &tmpRecvComm));
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
{ // [RCCL] Receive PID from root
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, rootPid, sizeof(int)));
} // [/RCCL]
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));
+75
Просмотреть файл
@@ -0,0 +1,75 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ALLREDUCECLIQUEKERNEL_H
#define ALLREDUCECLIQUEKERNEL_H
#include "CliqueCommon.h"
#include "devcomm.h"
#include "reduce_kernel.h"
#include "common_kernel.h"
template <class FUNC, typename T, int NUM_RANKS>
__device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args)
{
// Clique-specific kernel arguments
cliqueDevicePtrs_t* cliquePtrs = args->clique.ptrs; // Collection of all input/output pointers across ranks in clique
size_t const N = args->clique.count; // Total number of elements to reduce
int const nBlocks = args->clique.nChannels; // Total number of blocks assigned to this kernel (may be different than gridDim.x)
int const blockId = args->clique.bid; // 0-indexed blockIdx for this threadblock (may be different than blockIdx.x)
int const rank = args->comm->rank; // Current rank
// Each threadblock works independently of others on a subsection of the input
// First split evently across ranks, while maintaining multiples of blocksize
size_t const perRankN = RoundUp((N + NUM_RANKS - 1) / NUM_RANKS, blockDim.x);
size_t const perBlockN = RoundUp((perRankN + nBlocks - 1) / nBlocks, blockDim.x);
size_t const currBlockStart = min((rank * nBlocks + blockId) * perBlockN, N);
size_t const currBlockStop = min(currBlockStart + perBlockN, N);
size_t const blockN = currBlockStop - currBlockStart;
if (blockN > 0)
{
// Prepare input / output subarrays
T const** inputs = (T const**)cliquePtrs->inputs;
T** outputs = (T **)cliquePtrs->outputs;
T const* srcs[NUM_RANKS];
T* dsts[NUM_RANKS];
#pragma unroll
for (int r = 0; r < NUM_RANKS; r++)
{
srcs[r] = inputs[r] + currBlockStart;
dsts[r] = outputs[r] + currBlockStart;
}
// Perform the reduction
#define ALL_REDUCE_CLIQUE_UNROLL 2
ReduceOrCopyMulti<ALL_REDUCE_CLIQUE_UNROLL, FUNC, T, NUM_RANKS, NUM_RANKS, NUM_RANKS, NUM_RANKS>(
threadIdx.x, blockDim.x, NUM_RANKS, srcs, NUM_RANKS, dsts, blockN);
}
// Even if there was nothing for this GPU to do, it must participate in a barrier
// because other GPUs may be modifying this GPUs output buffer still
if (blockId == 0) WaitForBarrier<NUM_RANKS>(cliquePtrs->barrier);
}
#endif
+93
Просмотреть файл
@@ -0,0 +1,93 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef CLIQUE_COMMON_H
#define CLIQUE_COMMON_H
#include "nccl.h"
#include <cstdint>
#define MIN_CLIQUE_SIZE 2
#define MAX_CLIQUE_SIZE 8
typedef struct
{
int* globalCount; // Shared across GPUs
int* globalSense; // Shared across GPUs
int* localSense; // Local to this GPU
} gpuBarrier_t;
typedef struct
{
// Input/output pointers from participating ranks
void const* inputs[MAX_CLIQUE_SIZE];
void* outputs[MAX_CLIQUE_SIZE];
// Barrier variable
gpuBarrier_t barrier;
} cliqueDevicePtrs_t;
// Helper macro to launch an appropriate kernel by converting rank to a template argument
#define LAUNCH_CLIQUE_KERNEL(kernelname, FUNC, T, args) \
{ \
switch (args->comm->nRanks){ \
case 2: kernelname<FUNC, T, 2>(args); break; \
case 3: kernelname<FUNC, T, 3>(args); break; \
case 4: kernelname<FUNC, T, 4>(args); break; \
case 5: kernelname<FUNC, T, 5>(args); break; \
case 6: kernelname<FUNC, T, 6>(args); break; \
case 7: kernelname<FUNC, T, 7>(args); break; \
case 8: kernelname<FUNC, T, 8>(args); break; \
} \
}
// Multi-GPU (on same node) barrier. One thread per grid per GPU updates barrier / waits
template <int NUM_RANKS>
__forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier)
{
if (threadIdx.x == 0)
{
// Sense inversion barrier
*barrier.localSense = 1 - *barrier.localSense;
int localSense = *barrier.localSense;
int val = __atomic_add_fetch(barrier.globalCount, 1, __ATOMIC_SEQ_CST);
if (val == NUM_RANKS)
{
// Last arrival resets barrier
__atomic_store_n(barrier.globalCount, 0, __ATOMIC_SEQ_CST);
__atomic_store_n(barrier.globalSense, localSense, __ATOMIC_SEQ_CST);
}
else
{
// Wait for all ranks to reach barrier
while (__atomic_load_n(barrier.globalSense, __ATOMIC_SEQ_CST) != localSense);
}
}
}
__forceinline__ __host__ __device__ size_t RoundUp(size_t X, size_t Y)
{
return (X+Y-1)/Y * Y;
}
#endif
+519
Просмотреть файл
@@ -0,0 +1,519 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "CliqueManager.h"
#include "CliqueShmNames.h"
#include "MsgQueue.h"
#include "nccl.h"
#include "core.h"
#include "Hash.h"
#include "AllReduceCliqueKernel.h"
#include <hip/hip_runtime.h>
#include <hsa/hsa_ext_amd.h>
#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <thread>
cliqueDevicePtrs_t CliqueManager::m_staticCliquePtrs[NCCL_MAX_OPS] = {};
int* CliqueManager::m_staticGpuBarrierMem = NULL;
// Define some environment variables that affect clique-based kernels
RCCL_PARAM(EnableClique, "ENABLE_CLIQUE", 0); // Opt-in environment variable for clique-based kernels
RCCL_PARAM(AllReduceCliqueByteLimit, "CLIQUE_ALLREDUCE_BYTE_LIMIT", 2097152); // Max number of bytes to use clique-based kernels for all reduce
RCCL_PARAM(AllReduceNumChannels, "CLIQUE_ALLREDUCE_NCHANNELS", 4); // Number of channels to use for all-reduce
CliqueManager::CliqueManager(int const rank,
int const numRanks,
cliqueMode_t const cliqueMode) :
m_rank(rank),
m_numRanks(numRanks),
m_cliqueMode(cliqueMode),
m_init(false),
m_pinnedCliquePtrs(NULL),
m_fineGrainBarrierMem(NULL)
{
}
CliqueManager::~CliqueManager()
{
if (m_init)
{
CleanUp();
}
}
void CliqueManager::CleanUp()
{
if (m_cliqueMode == CLIQUE_DISABLED) return;
// Free variables that are shared between SINGLE_PROCESS / SINGLE_NODE
if (m_pinnedCliquePtrs) hipHostFree(m_pinnedCliquePtrs);
if (m_gpuBarrierLocalSense) hipFree(m_gpuBarrierLocalSense);
if (m_cliqueMode == CLIQUE_SINGLE_NODE)
{
// Release caches
if (m_ipcHandleSendCache) delete m_ipcHandleSendCache;
if (m_ipcHandleSendCache) delete m_ipcHandleRecvCache;
// Close shared memory
m_shmHandles.Close();
m_sharedCpuMemory.Close();
m_sharedIpcHandle.Close();
if (m_fineGrainBarrierMem)
{
if (m_rank == 0)
hipFree(m_fineGrainBarrierMem);
else
hipIpcCloseMemHandle(m_fineGrainBarrierMem);
}
}
else if (m_cliqueMode == CLIQUE_SINGLE_PROCESS)
{
if (m_rank == 0 && m_staticGpuBarrierMem)
hipFree(m_staticGpuBarrierMem);
}
m_init = false;
}
ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
{
ncclResult_t res;
if (m_init) return ncclSuccess;
m_init = true;
if (m_cliqueMode == CLIQUE_DISABLED) return ncclSuccess;
// Check parameters
if (m_rank < 0 || m_rank >= m_numRanks)
{
WARN("Invalid rank specified. Expected 0 <= %d < %d for CliqueManager", m_rank, m_numRanks);
return ncclInvalidUsage;
}
if (commId == NULL)
{
WARN("CommId should not be empty");
return ncclInvalidUsage;
}
// For now, opt-into clique based kernels via RCCL_ENABLE_CLIQUE env var
if (!rcclParamEnableClique())
{
INFO(NCCL_INIT, "Disabling clique-based kernels (did not find env var RCCL_ENABLE_CLIQUE)");
m_cliqueMode = CLIQUE_DISABLED;
return ncclSuccess;
}
// Allocate pinned CPU memory for holding clique pointers, which kernels will have access to
if (hipHostMalloc(&m_pinnedCliquePtrs, sizeof(cliqueDevicePtrs_t) * NCCL_MAX_OPS) != hipSuccess)
{
WARN("Unable to allocated pinned host memory for clique pointers. Disabling clique-based kernels");
m_cliqueMode = CLIQUE_DISABLED;
m_init = true;
return ncclSuccess;
}
unsigned long hash = djb2Hash(commId->internal);
std::string shmSuffix = std::to_string(hash) + "_" + std::to_string(suffix);
// Allocate sense barrier variable on local GPU
NCCLCHECKGOTO(ncclCudaCalloc(&m_gpuBarrierLocalSense, NCCL_MAX_OPS * sizeof(int)), res, dropback);
if (m_cliqueMode == CLIQUE_SINGLE_NODE)
{
// Initialize shared memory file for IPC handles (based on commId hash)
m_shmHandles = NcclIpcHandleShm(m_rank, m_numRanks, hash, NUM_HANDLES_PER_RANK, NCCL_MAX_OPS, shmSuffix);
NCCLCHECKGOTO(m_shmHandles.Open(), res, dropback);
// Initialize IPC caches
m_ipcHandleSendCache = new NcclIpcHandleSendCache(m_numRanks * NUM_HANDLES_PER_RANK * NCCL_MAX_OPS);
m_ipcHandleRecvCache = new NcclIpcHandleRecvCache(m_numRanks * NUM_HANDLES_PER_RANK * NCCL_MAX_OPS,
100,
hipIpcMemHandleHash,
hipIpcMemHandleEqual);
// Initialize shared object for GPU barrier IPC handle
m_sharedIpcHandle = ShmObject<hipIpcMemHandle_t>(std::max(4096LU, sizeof(hipIpcMemHandle_t)),
CliqueShmNames["Barriers"] + shmSuffix,
m_rank,
m_numRanks,
hash);
NCCLCHECKGOTO(m_sharedIpcHandle.Open(), res, dropback);
if (m_rank == 0)
{
hipIpcMemHandle_t handle;
// Allocate fine-grained device memory on rank 0 and get IPC handle for it
// Re-usable barrier consists of (globalCount / globalSense) pair of integers
NCCLCHECKGOTO(ncclCudaCalloc(&m_fineGrainBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), true), res, dropback);
if (hipIpcGetMemHandle(&handle, m_fineGrainBarrierMem) != hipSuccess)
{
WARN("Unable to get IPC handle for barrier memory");
goto dropback;
}
// Write IPC handle to shared memory for other ranks to receive
*m_sharedIpcHandle.Get() = handle;
// Set up global count/sense for first rank
m_gpuBarrierGlobalCount = &m_fineGrainBarrierMem[0];
m_gpuBarrierGlobalSense = &m_fineGrainBarrierMem[NCCL_MAX_OPS];
}
// Initialize shared CPU memory to be used for barrier variables
m_sharedCpuMemory = ShmObject<int32_t>(2 * sizeof(int32_t),
CliqueShmNames["SharedCounters"] + shmSuffix,
m_rank,
m_numRanks,
hash);
NCCLCHECKGOTO(m_sharedCpuMemory.Open(), res, dropback);
// Split up the shared CPU memory for barrier counters / global sense
m_cpuBarrierGlobalCount = &m_sharedCpuMemory.Get()[0];
m_cpuBarrierGlobalSense = &m_sharedCpuMemory.Get()[1];
// Initialize CPU barriers
if (m_rank == 0)
{
*m_cpuBarrierGlobalCount = 0;
*m_cpuBarrierGlobalSense = 0;
}
m_cpuBarrierLocalSense = 0;
}
else if (m_cliqueMode == CLIQUE_SINGLE_PROCESS)
{
// First rank prepares fine-grained memory shared across ranks used for the two barrier variables
if (m_rank == 0)
{
NCCLCHECKGOTO(ncclCudaCalloc(&m_staticGpuBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), true), res, dropback);
// Prepare all barriers
for (int opIndex = 0; opIndex < NCCL_MAX_OPS; opIndex++)
{
m_staticCliquePtrs[opIndex].barrier.globalCount = &m_staticGpuBarrierMem[opIndex];
m_staticCliquePtrs[opIndex].barrier.globalSense = &m_staticGpuBarrierMem[opIndex + NCCL_MAX_OPS];;
}
}
}
m_init = true;
INFO(NCCL_INIT, "Clique-based kernels enabled (mode %d)", m_cliqueMode);
return ncclSuccess;
dropback:
// NOTE: This currently assumes that all ranks will fail the same way
// Additional support is required to handle cases when some processes succeed while others fail
WARN("Unable to initialize shared memory. Disabling clique-based kernels");
CleanUp();
m_cliqueMode = CLIQUE_DISABLED;
return ncclSuccess;
}
bool CliqueManager::IsSupported(ncclFunc_t const coll,
size_t const count,
ncclDataType_t const datatype,
ncclRedOp_t const op) const
{
if (m_cliqueMode == CLIQUE_DISABLED) return false;
// Filter based on total input size for each collective type
size_t totalBytes = count * ncclTypeSize(datatype);
if (coll == ncclCollAllReduce && (totalBytes <= rcclParamAllReduceCliqueByteLimit())) return true;
return false;
}
ncclResult_t CliqueManager::DeclarePointers(uint64_t opCount, void const* inputPtr, void* outputPtr)
{
// Do nothing if disabled
if (m_cliqueMode == CLIQUE_DISABLED) return ncclSuccess;
if (!m_init)
{
WARN("CliqueManager must be initialized before use");
return ncclInvalidUsage;
}
int const opIndex = opCount % NCCL_MAX_OPS;
// Add opIndex to queue of in-progress collectives
m_inProgress.push(opIndex);
if (m_cliqueMode == CLIQUE_SINGLE_NODE)
{
// Get fine-grained device memory if not already done
if (m_fineGrainBarrierMem == NULL)
{
hipIpcMemHandle_t handle = *m_sharedIpcHandle.Get();
CUDACHECK(hipIpcOpenMemHandle((void**)&m_fineGrainBarrierMem, handle, hipIpcMemLazyEnablePeerAccess));
// Prepare global count/sense barrier variables used the ipc-shared gpu device memory
m_gpuBarrierGlobalCount = &m_fineGrainBarrierMem[0];
m_gpuBarrierGlobalSense = &m_fineGrainBarrierMem[NCCL_MAX_OPS];
}
std::vector<std::pair<hipIpcMemHandle_t,size_t>> handles(NUM_HANDLES_PER_RANK);
// Get IPC handles for input/output pointers from cache
NCCLCHECK(CheckCacheForPtr(const_cast<void*>(inputPtr), m_ipcHandleSendCache, m_rank, &handles[0]));
NCCLCHECK(CheckCacheForPtr(outputPtr , m_ipcHandleSendCache, m_rank, &handles[1]));
// Prepare barrier pointers (done after the IpcOpenMemory)
m_pinnedCliquePtrs[opIndex].barrier.globalCount = &m_gpuBarrierGlobalCount[opIndex];
m_pinnedCliquePtrs[opIndex].barrier.globalSense = &m_gpuBarrierGlobalSense[opIndex];
m_pinnedCliquePtrs[opIndex].barrier.localSense = &m_gpuBarrierLocalSense[opIndex];
// Write IPC handles to shared memory for given rank / opCount
NCCLCHECK(m_shmHandles.WriteHandles(opIndex, handles));
}
else if (m_cliqueMode == CLIQUE_SINGLE_PROCESS)
{
// Store this rank's input/output pointers into static member
m_staticCliquePtrs[opIndex].inputs[m_rank] = inputPtr;
m_staticCliquePtrs[opIndex].outputs[m_rank] = outputPtr;
}
return ncclSuccess;
}
ncclResult_t CliqueManager::GetNumChannelsToUse(ncclFunc_t const coll,
size_t const count,
ncclDataType_t const datatype,
ncclRedOp_t const op,
int const totalNumChannels,
uint8_t* numChannelstoUse)
{
size_t const totalBytes = count * ncclTypeSize(datatype);
*numChannelstoUse = 1;
if (coll == ncclCollAllReduce) {
*numChannelstoUse = std::min((int)rcclParamAllReduceNumChannels(), totalNumChannels);
}
return ncclSuccess;
}
ncclResult_t CliqueManager::SetCliqueCollectiveArgs(CollectiveArgs* args)
{
// Do nothing if disabled
if (m_cliqueMode == CLIQUE_DISABLED) return ncclSuccess;
if (!m_init)
{
WARN("CliqueManager must be initialized before use");
return ncclInvalidUsage;
}
// Prepare clique argments (NOTE: clique pointers are not ready yet)
int opIndex = args->opCount % NCCL_MAX_OPS;
args->clique.ptrs = &m_pinnedCliquePtrs[opIndex];
// Determine number of channels to use for this collective
args->clique.nChannels = rcclParamAllReduceNumChannels();
return ncclSuccess;
}
ncclResult_t CliqueManager::WaitForPointers()
{
// Do nothing if disabled
if (m_cliqueMode == CLIQUE_DISABLED) return ncclSuccess;
if (!m_init)
{
WARN("CliqueManager must be initialized before use");
return ncclInvalidUsage;
}
// Do nothing if there are no outstanding clique-kernels
if (m_inProgress.empty()) return ncclSuccess;
// Copy clique device pointers to pinned device memory
if (m_cliqueMode == CLIQUE_SINGLE_NODE)
{
// Wait for all ranks to arrive
WaitForBarrier();
int numHandles = m_numRanks * NUM_HANDLES_PER_RANK;
std::vector<std::pair<hipIpcMemHandle_t,size_t>> handles(numHandles);
while (!m_inProgress.empty())
{
int const opIndex = m_inProgress.front();
m_inProgress.pop();
// Collect the ready handles from shared memory and convert them to device pointers
NCCLCHECK(m_shmHandles.ReadHandles(opIndex, handles));
for (int i = 0; i < m_numRanks; i++)
{
void *input;
NCCLCHECK(CheckCacheForHandle(handles[i * NUM_HANDLES_PER_RANK],
m_ipcHandleRecvCache, &input));
m_pinnedCliquePtrs[opIndex].inputs[i] = const_cast<const void *>(input);
NCCLCHECK(CheckCacheForHandle(handles[(i * NUM_HANDLES_PER_RANK) + 1],
m_ipcHandleRecvCache, &m_pinnedCliquePtrs[opIndex].outputs[i]));
}
}
}
else if (m_cliqueMode == CLIQUE_SINGLE_PROCESS)
{
while (!m_inProgress.empty())
{
int const opIndex = m_inProgress.front();
m_inProgress.pop();
// Copy from static memory to pinned host memory and set local sense
memcpy(&m_pinnedCliquePtrs[opIndex], &m_staticCliquePtrs[opIndex], sizeof(cliqueDevicePtrs_t));
m_pinnedCliquePtrs[opIndex].barrier.localSense = &m_gpuBarrierLocalSense[opIndex];
}
}
return ncclSuccess;
}
std::string HandleToString(hipIpcMemHandle_t handle)
{
char mapping[17] = "0123456789ABCDEF";
std::string result;
for (int i = 0; i < 4; i++)
{
unsigned char val = (unsigned char)handle.reserved[i];
result += mapping[val / 16];
result += mapping[val % 16];
}
return result;
}
ncclResult_t CliqueManager::CheckCacheForPtr(void* devPtr,
NcclIpcHandleSendCache* cache,
int rank,
std::pair<hipIpcMemHandle_t, size_t>* handlePair)
{
// Get the base address for this device allocation
hsa_status_t status;
hsa_amd_pointer_info_t info;
info.size = sizeof(hsa_amd_pointer_info_t);
status = hsa_amd_pointer_info(devPtr, &info, NULL, NULL, NULL);
if (status != HSA_STATUS_SUCCESS) {
WARN("Uanble to get pointer information for %p", devPtr);
return ncclInvalidArgument;
}
// Compute the offset between the device addres and the base address
uint64_t baseAddr = (uint64_t)info.agentBaseAddress;
uint64_t realAddr = (uint64_t)devPtr;
handlePair->second = realAddr - baseAddr;
// IPC handles are only supported for base address pointers
NcclIpcHandleSendCache::iterator it = cache->find(baseAddr);
if (it == cache->end())
{
CUDACHECK(hipIpcGetMemHandle(&handlePair->first, (void*)baseAddr));
cache->insert(baseAddr, handlePair->first);
}
else
{
handlePair->first = (it->second).first;
}
return ncclSuccess;
}
ncclResult_t CliqueManager::CheckCacheForHandle(std::pair<hipIpcMemHandle_t, size_t> const& handlePair,
NcclIpcHandleRecvCache* cache,
void** ptr)
{
NcclIpcHandleRecvCache::iterator it = cache->find(handlePair.first);
// Get base address pointer from cache if it exists
void* baseAddr;
if (it == cache->end())
{
CUDACHECK(hipIpcOpenMemHandle(&baseAddr, handlePair.first, hipIpcMemLazyEnablePeerAccess));
cache->insert(handlePair.first, baseAddr);
}
else
{
baseAddr = (it->second).first;
}
// Modify base address pointer with offset
uint64_t realAddr = (uint64_t)baseAddr + handlePair.second;
*ptr = (void*)realAddr;
return ncclSuccess;
}
void CliqueManager::WaitForBarrier()
{
// Sense inversion barrier
m_cpuBarrierLocalSense = 1 - m_cpuBarrierLocalSense;
if (__sync_add_and_fetch(m_cpuBarrierGlobalCount, 1) == m_numRanks)
{
// Reset the barrier
STORE(m_cpuBarrierGlobalCount, 0);
STORE(m_cpuBarrierGlobalSense, m_cpuBarrierLocalSense);
} else {
while (LOAD(m_cpuBarrierGlobalSense) != m_cpuBarrierLocalSense);
}
}
ncclResult_t CliqueManager::BootstrapRootInit(int pid, unsigned long hash)
{
for (auto it = CliqueShmNames.begin(); it != CliqueShmNames.end(); it++)
{
int msgid, fd;
std::string msgQueueName = "/tmp/" + it->second + std::to_string(hash) + "_" + std::to_string(pid);
SYSCHECKVAL(open(msgQueueName.c_str(), O_CREAT | O_RDWR, 0606), "open", fd);
NCCLCHECK(MsgQueueGetId(msgQueueName, hash, true, msgid));
SYSCHECK(close(fd), "close");
}
std::string shmDir = "/dev/shm/";
for (auto it = CliqueShmNames.begin(); it != CliqueShmNames.end(); it++)
{
struct stat fileStatus;
std::string shmFileName = it->second + std::to_string(hash) + "_" + std::to_string(pid);
std::string shmFullPath = shmDir + shmFileName;
// Check if shm file already exists; if so, unlink it
if (stat(shmFullPath.c_str(), &fileStatus) == 0)
{
NCCLCHECK(shmUnlink(shmFileName.c_str()));
}
}
return ncclSuccess;
}
+128
Просмотреть файл
@@ -0,0 +1,128 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef RCCL_CLIQUE_MANAGER_HPP_
#define RCCL_CLIQUE_MANAGER_HPP_
#include <semaphore.h>
#include <mutex>
#include <queue>
#include "nccl.h"
#include "devcomm.h"
#include "CliqueCommon.h"
#include "HandleCache.h"
#include "HandleShm.h"
#define NUM_HANDLES_PER_RANK 2
class CliqueManager
{
public:
typedef enum
{
CLIQUE_DISABLED = 0,
CLIQUE_SINGLE_PROCESS = 1,
CLIQUE_SINGLE_NODE = 2
} cliqueMode_t;
CliqueManager(int const rank, int const numRanks, cliqueMode_t const cliqueMode);
~CliqueManager();
void CleanUp();
ncclResult_t Init(ncclUniqueId const* commId, int suffix);
// Returns true if the collective is supported via a clique-based kernel
bool IsSupported(ncclFunc_t const coll,
size_t const count,
ncclDataType_t const datatype,
ncclRedOp_t const op) const;
// Provide the pointers to be exchanged across the clique for the given rank / opCount
ncclResult_t DeclarePointers(uint64_t opCount, void const* inputPtr, void* outputPtr);
// Determine the number of channels / CUs to use for this call
ncclResult_t GetNumChannelsToUse(ncclFunc_t const coll,
size_t const count,
ncclDataType_t const datatype,
ncclRedOp_t const op,
int const totalNumChannels,
uint8_t* numChannelstoUse);
// Set pointers for where clique-related arguments will be found
// This sets pointers to device-accessible memory where the arguments will eventually reside
ncclResult_t SetCliqueCollectiveArgs(CollectiveArgs* args);
// Blocking call that only returns after all out-standing clique pointers are ready
ncclResult_t WaitForPointers();
// Prepares shared memory files upon initialization
static ncclResult_t BootstrapRootInit(int pid, unsigned long hash);
protected:
ncclResult_t CheckCacheForPtr(void* devPtr,
NcclIpcHandleSendCache* cache,
int rank,
std::pair<hipIpcMemHandle_t, size_t>* handlePair);
ncclResult_t CheckCacheForHandle(std::pair<hipIpcMemHandle_t, size_t> const& handlePair,
NcclIpcHandleRecvCache* cache,
void** ptr);
// Race-condition helper functions
void WaitForBarrier();
int m_rank; // Associated rank
int m_numRanks; // Total number of ranks
cliqueMode_t m_cliqueMode; // Clique mode (off/single process/single node)
bool m_init; // Whether CliqueManager has been initialized
cliqueDevicePtrs_t* m_pinnedCliquePtrs; // Pinned-host-memory (device accessible) containing device pointers
int* m_gpuBarrierGlobalCount; // Part of GPU barrier (count variable shared across ranks)
int* m_gpuBarrierGlobalSense; // Part of GPU barrier (reset variable shared across ranks)
int* m_gpuBarrierLocalSense; // Part of GPU barrier (reset variable local to this rank)
std::queue<int> m_inProgress; // Queue of clique-based collectives waiting for pointers
// IPC-related (CLIQUE_SINGLE_NODE)
NcclIpcHandleShm m_shmHandles; // Used to exchange IPC handles between ranks
NcclIpcHandleSendCache* m_ipcHandleSendCache; // Caches pointers to IPC handles (to send to other processes)
NcclIpcHandleRecvCache* m_ipcHandleRecvCache; // Caches IPC handles to pointers (received from other processes)
ShmObject<int32_t> m_sharedCpuMemory; // Used to pass shared memory used for CPU barrier
ShmObject<hipIpcMemHandle_t> m_sharedIpcHandle; // Used to pass fine-grained device memory buffer IPC handle
int* m_fineGrainBarrierMem; // Fine-grained GPU memory barrier (allocated only on 1st rank, shared on others)
int* m_cpuBarrierGlobalCount; // Part of CPU barrier (count variable shared across ranks)
int* m_cpuBarrierGlobalSense; // Part of CPU barrier (reset variable shared across ranks)
int m_cpuBarrierLocalSense; // Part of CPU barrier (reset variable local to this rank)
// Single-process (CLIQUE_SINGLE_PROCESS)
static cliqueDevicePtrs_t m_staticCliquePtrs[NCCL_MAX_OPS]; // Use shared static memory to exchange pointer info
static int* m_staticGpuBarrierMem; // Static storage backing for fine-grained gpu barrier
};
// For use in bootstrapping code
struct bootstrapRootStruct {
void* listenComm;
unsigned long hash;
};
#endif
+37
Просмотреть файл
@@ -0,0 +1,37 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef NCCL_CLIQUE_SHM_NAMES_H_
#define NCCL_CLIQUE_SHM_NAMES_H_
#include <string>
#include <map>
static std::map<std::string, std::string> CliqueShmNames =
{
{"SharedCounters", "RcclCounters" },
{"Mutexes" , "RcclMutexes" },
{"IpcHandles" , "RcclIpcHandles"},
{"Barriers" , "RcclBarriers" }
};
#endif
+31
Просмотреть файл
@@ -0,0 +1,31 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "HandleCache.h"
#include "Hash.h"
// djb2 hash function for hashing char array in hipIpcMemHandle_t
unsigned long hipIpcMemHandleHash(const hipIpcMemHandle_t& handle)
{
return djb2Hash(handle.reserved);
}
+142
Просмотреть файл
@@ -0,0 +1,142 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef NCCL_HANDLE_CACHE_H_
#define NCCL_HANDLE_CACHE_H_
#include <list>
#include <unordered_map>
#include <functional>
#include "core.h"
//#include "llvm/ADT/DenseMap.h"
template <
class Key,
class Value,
class Hash,
class KeyEqual,
class Allocator
>
class NcclIpcHandleCache
{
public:
typedef std::pair<Value, typename std::list<Key>::iterator> NcclIpcHandleCacheValueType;
typedef std::unordered_map<Key, NcclIpcHandleCacheValueType, Hash, KeyEqual, Allocator> LRUCache;
using iterator = typename LRUCache::iterator;
NcclIpcHandleCache(size_t size,
size_t bucket_count = 100,
const Hash& hash = Hash(),
const KeyEqual& eql = KeyEqual(),
const Allocator& alloc = Allocator() ) : m_cache(bucket_count, hash, eql, alloc)
{
m_capacity = size;
}
~NcclIpcHandleCache()
{
m_lruHistory.clear();
m_cache.clear();
}
iterator begin()
{
return m_cache.begin();
}
iterator end()
{
return m_cache.end();
}
iterator find(const Key& key)
{
iterator it = m_cache.find(key);
if (it != m_cache.end())
{
updateHistory(it);
}
return it;
}
std::pair<iterator, bool> insert(const Key& key, const Value& value)
{
if (m_cache.size() == m_capacity)
{
// remove entry
pop();
}
typename LRUCache::iterator it = m_cache.find(key);
bool inserted;
if (it == m_cache.end())
{
typename std::list<Key>::iterator it = m_lruHistory.insert(m_lruHistory.end(), key);
m_cache.insert(std::make_pair(key, std::make_pair(value, it)));
inserted = true;
}
else
{
inserted = false;
}
return std::pair<iterator, bool>(it, inserted);
}
private:
void pop()
{
typename LRUCache::iterator it = m_cache.find(m_lruHistory.front());
m_cache.erase(it);
m_lruHistory.pop_front();
}
void updateHistory(const iterator& it)
{
if (m_lruHistory.size() > 0)
{
m_lruHistory.splice(m_lruHistory.end(), m_lruHistory, (it->second).second);
}
}
size_t m_capacity;
std::list<Key> m_lruHistory;
LRUCache m_cache;
};
// djb2 hash function for hashing char array in hipIpcMemHandle_t
unsigned long hipIpcMemHandleHash(const hipIpcMemHandle_t& handle);
// equality function required for unordered_map
auto hipIpcMemHandleEqual = [](const hipIpcMemHandle_t& l, const hipIpcMemHandle_t& r)
{
return memcmp(l.reserved, r.reserved, sizeof(l.reserved)) == 0;
};
//typedef llvm::DenseMap<uint64_t, hipIpcMemHandle_t> SendCache;
//typedef llvm::DenseMap<hipIpcMemHandle_t, void*, decltype(&HandleHash), decltype(HandleEqual)> RecvCache;
typedef NcclIpcHandleCache<uint64_t, hipIpcMemHandle_t, std::hash<uint64_t>, std::equal_to<uint64_t>, std::allocator< std::pair<const uint64_t, std::pair<hipIpcMemHandle_t, std::list<uint64_t>::iterator>>>> NcclIpcHandleSendCache;
typedef NcclIpcHandleCache<hipIpcMemHandle_t, void*, decltype(&hipIpcMemHandleHash), decltype(hipIpcMemHandleEqual), std::allocator< std::pair<const hipIpcMemHandle_t, std::pair<void*, std::list<hipIpcMemHandle_t>::iterator>>>> NcclIpcHandleRecvCache;
#endif
+67
Просмотреть файл
@@ -0,0 +1,67 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip/hip_runtime.h>
#include "HandleShm.h"
#include "CliqueShmNames.h"
#include "core.h"
#include "Hash.h"
#include "shm.h"
NcclIpcHandleShm::NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string suffix) :
ShmObject<std::pair<hipIpcMemHandle_t,size_t>>(numRanks * numHandlesPerRank * capacity * sizeof(std::pair<hipIpcMemHandle_t,size_t>),
CliqueShmNames["IpcHandles"] + suffix,
rank,
numRanks,
projid),
m_numHandlesPerRank(numHandlesPerRank),
m_numHandlesPerOpCount(numRanks * numHandlesPerRank)
{
}
NcclIpcHandleShm::NcclIpcHandleShm()
{
}
NcclIpcHandleShm::~NcclIpcHandleShm()
{
}
ncclResult_t NcclIpcHandleShm::Open()
{
return ShmObject::Open();
}
ncclResult_t NcclIpcHandleShm::WriteHandles(uint64_t opCount, std::vector<std::pair<hipIpcMemHandle_t,size_t>> const& sendHandles)
{
size_t idx = (opCount * m_numHandlesPerOpCount) + (m_rank * m_numHandlesPerRank);
memcpy(m_shmPtr + idx, sendHandles.data(), sizeof(std::pair<hipIpcMemHandle_t,size_t>) * m_numHandlesPerRank);
return ncclSuccess;
}
ncclResult_t NcclIpcHandleShm::ReadHandles(uint64_t opCount, std::vector<std::pair<hipIpcMemHandle_t,size_t>>& recvHandles)
{
size_t idx = opCount * m_numHandlesPerOpCount;
memcpy(recvHandles.data(), m_shmPtr + idx, m_numHandlesPerOpCount * sizeof(std::pair<hipIpcMemHandle_t,ssize_t>));
return ncclSuccess;
}
+53
Просмотреть файл
@@ -0,0 +1,53 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef NCCL_IPC_HANDLE_SHM_H_
#define NCCL_IPC_HANDLE_SHM_H_
#include <hip/hip_runtime.h>
#include <vector>
#include <string>
#include "nccl.h"
#include "ShmObject.h"
class NcclIpcHandleShm : public ShmObject<std::pair<hipIpcMemHandle_t,size_t>>
{
public:
NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string suffix);
NcclIpcHandleShm();
~NcclIpcHandleShm();
ncclResult_t Open();
ncclResult_t WriteHandles(uint64_t opCount, std::vector<std::pair<hipIpcMemHandle_t,size_t>> const& sendHandles);
ncclResult_t ReadHandles(uint64_t opCount, std::vector<std::pair<hipIpcMemHandle_t,size_t>>& recvHandles);
private:
int m_numHandlesPerRank;
int m_numHandlesPerOpCount;
};
#endif
+34
Просмотреть файл
@@ -0,0 +1,34 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "Hash.h"
unsigned long djb2Hash(const char* data)
{
unsigned long hash = 5381;
int c;
while ((c = *(data)++))
hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
return hash;
}
+28
Просмотреть файл
@@ -0,0 +1,28 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef NCCL_HASH_H_
#define NCCL_HASH_H_
unsigned long djb2Hash(const char* data);
#endif
+72
Просмотреть файл
@@ -0,0 +1,72 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "MsgQueue.h"
#include <sys/ipc.h>
#include <sys/msg.h>
#define MSG_QUEUE_PERM 0666
ncclResult_t MsgQueueGetId(std::string name, int projid, bool exclusive, int& msgid)
{
key_t key;
SYSCHECKVAL(ftok(name.c_str(), projid), "ftok", key);
int flag = (exclusive == true ? IPC_CREAT | IPC_EXCL : IPC_CREAT);
msgid = msgget(key, MSG_QUEUE_PERM | flag);
// Check if we're trying to create message queue and it already exists; if so, delete existing queue
if (msgid == -1 && exclusive == true && errno == EEXIST)
{
NCCLCHECK(MsgQueueClose(name, projid));
SYSCHECKVAL(msgget(key, MSG_QUEUE_PERM | flag), "msgget", msgid);
}
else if (msgid == -1)
{
WARN("Call to MsgQueueGetId failed : %s", strerror(errno));
return ncclSystemError;
}
return ncclSuccess;
}
ncclResult_t MsgQueueSend(int msgid, const void* msgp, size_t msgsz, int msgflg)
{
SYSCHECK(msgsnd(msgid, msgp, msgsz, msgflg), "msgsnd");
return ncclSuccess;
}
ncclResult_t MsgQueueRecv(int msgid, void* msgp, size_t msgsz, long msgtyp, bool wait)
{
int msgflg = (wait == false ? IPC_NOWAIT : 0);
SYSCHECK(msgrcv(msgid, msgp, msgsz, msgtyp, msgflg), "msgrcv");
return ncclSuccess;
}
ncclResult_t MsgQueueClose(std::string name, int projid)
{
key_t key;
int msgid;
key = ftok(name.c_str(), projid);
SYSCHECKVAL(msgget(key, IPC_CREAT), "msgget", msgid);
SYSCHECK(msgctl(msgid, IPC_RMID, NULL), "msgctl");
return ncclSuccess;
}
+42
Просмотреть файл
@@ -0,0 +1,42 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef RCCL_MSG_QUEUE_HPP_
#define RCCL_MSG_QUEUE_HPP_
#include <string>
#include "nccl.h"
#include "core.h"
struct MsgBuffer
{
long msg_type;
char msg_text[1];
};
ncclResult_t MsgQueueGetId(std::string name, int projid, bool exclusive, int& msgid);
ncclResult_t MsgQueueSend(int msgid, const void* msgp, size_t msgsz, int msgflg);
ncclResult_t MsgQueueRecv(int msgid, void* msgp, size_t msgsz, long msgtyp, bool wait);
ncclResult_t MsgQueueClose(std::string name, int projid);
#endif
+43
Просмотреть файл
@@ -0,0 +1,43 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef SHAREDMEMHELPER_H
#define SHAREDMEMHELPER_H
class SharedMemHelper
{
public:
SharedMemHelper(int rank, int numRanks, int numEntries);
ncclStatus_t Init(std::string const& baseFilename);
ncclStatus_t
protected:
bool m_initialized;
int m_rank;
int m_numRanks;
};
#endif
+45
Просмотреть файл
@@ -0,0 +1,45 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "ShmObject.h"
#include <string>
// Template specializations for sem_t objects which require additional initialization
template<>
ncclResult_t ShmObject<sem_t>::Close()
{
size_t numMutexes = m_shmSize / sizeof(sem_t);
for (size_t i = 0; i < numMutexes; i++)
{
sem_destroy(static_cast<sem_t*>(&m_shmPtr[i]));
}
int retVal = shm_unlink(m_shmName.c_str());
if (retVal == -1 && errno != ENOENT)
{
WARN("Call to shm_unlink in ShmObject failed : %s", strerror(errno));
return ncclSystemError;
}
return ncclSuccess;
}
+210
Просмотреть файл
@@ -0,0 +1,210 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef NCCL_SHM_OBJECT_H_
#define NCCL_SHM_OBJECT_H_
#include <string>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <type_traits>
#include <semaphore.h>
#include "MsgQueue.h"
#include "nccl.h"
#include "core.h"
#include "shm.h"
// ShmObject abstracts away the nitty-gritty when multiple processes need to handle opening a shared
// memory object at the same time.
static ncclResult_t shmSetupExclusive(const char* shmname, const int shmsize, int* fd, void** ptr, int create) {
*fd = shm_open(shmname, O_CREAT | O_RDWR | O_EXCL, S_IRUSR | S_IWUSR);
if (*fd == -1) return ncclSystemError;
if (create) SYSCHECK(shm_allocate(*fd, shmsize), "posix_fallocate");
SYSCHECK(shm_map(*fd, shmsize, ptr), "mmap");
close(*fd);
*fd = -1;
if (create) memset(*ptr, 0, shmsize);
return ncclSuccess;
}
template <typename T>
class ShmObject
{
public:
ShmObject(size_t size, std::string fileName, int rank, int numRanks, int projid) :
m_shmSize(size),
m_shmName(fileName),
m_rank(rank),
m_numRanks(numRanks),
m_projid(projid),
m_alloc(false),
m_shmPtr(nullptr) {}
ShmObject() {}
~ShmObject() {}
ncclResult_t Open();
ncclResult_t Close()
{
if (m_alloc)
{
if (m_rank == 0)
{
std::string tmpFileName = "/tmp/" + m_shmName;
remove(tmpFileName.c_str());
}
int retVal = shm_unlink(m_shmName.c_str());
if (retVal == -1 && errno != ENOENT)
{
WARN("Call to shm_unlink in ShmObject failed : %s", strerror(errno));
return ncclSystemError;
}
}
return ncclSuccess;
}
T*& Get()
{
return m_shmPtr;
}
protected:
ncclResult_t BroadcastMessage(int msgid, bool pass)
{
MsgBuffer msg;
msg.msg_text[0] = (pass == 0 ? 'F': 'P');
for (int rank = 0; rank < m_numRanks; rank++)
{
if (rank == m_rank) continue;
msg.msg_type = rank;
NCCLCHECK(MsgQueueSend(msgid, &msg, sizeof(msg), 0));
}
return ncclSuccess;
}
// tag for dispatch
template<class U>
struct OpenTag{};
ncclResult_t InitIfSemaphore(OpenTag<int> tag);
ncclResult_t InitIfSemaphore(OpenTag<uint32_t> tag);
ncclResult_t InitIfSemaphore(OpenTag<hipIpcMemHandle_t> tag);
ncclResult_t InitIfSemaphore(OpenTag<sem_t> tag);
ncclResult_t InitIfSemaphore(OpenTag<std::pair<hipIpcMemHandle_t,size_t>> tag);
size_t m_shmSize;
std::string m_shmName;
int m_rank;
int m_numRanks;
int m_projid;
bool m_alloc;
T* m_shmPtr;
};
template <typename T>
ncclResult_t ShmObject<T>::Open()
{
if (m_alloc == false)
{
int shmFd;
int protection = PROT_READ | PROT_WRITE;
int visibility = MAP_SHARED;
int msgid;
std::string tmpFileName = "/tmp/" + m_shmName;
NCCLCHECK(MsgQueueGetId(tmpFileName, m_projid, false, msgid));
if (m_rank == 0)
{
ncclResult_t resultSetup = shmSetupExclusive(m_shmName.c_str(), m_shmSize, &shmFd, (void**)&m_shmPtr, 1);
ncclResult_t resultSemInit = InitIfSemaphore(OpenTag<T>{});
if ((resultSetup != ncclSuccess && errno != EEXIST) || (resultSemInit != ncclSuccess))
{
NCCLCHECK(BroadcastMessage(msgid, false));
WARN("Call to ShmObject::Open in root rank failed : %s", strerror(errno));
return ncclSystemError;
}
NCCLCHECK(BroadcastMessage(msgid, true));
}
else
{
MsgBuffer msg;
NCCLCHECK(MsgQueueRecv(msgid, &msg, sizeof(msg), m_rank, true));
if (msg.msg_text[0] == 'P')
{
NCCLCHECK(shmSetup(m_shmName.c_str(), m_shmSize, &shmFd, (void**)&m_shmPtr, 0));
}
else
{
WARN("Call to shm_open from non-root rank in ShmObject failed : %s", strerror(errno));
return ncclSystemError;
}
}
m_alloc = true;
}
else
{
WARN("Cannot allocate ShmObject twice.\n");
return ncclInvalidUsage;
}
return ncclSuccess;
}
template<typename T>
ncclResult_t ShmObject<T>::InitIfSemaphore(OpenTag<int> tag)
{
return ncclSuccess;
}
template<typename T>
ncclResult_t ShmObject<T>::InitIfSemaphore(OpenTag<unsigned int> tag)
{
return ncclSuccess;
}
template<typename T>
ncclResult_t ShmObject<T>::InitIfSemaphore(OpenTag<hipIpcMemHandle_t> tag)
{
return ncclSuccess;
}
template<typename T>
ncclResult_t ShmObject<T>::InitIfSemaphore(OpenTag<std::pair<hipIpcMemHandle_t,size_t>> tag)
{
return ncclSuccess;
}
template<typename T>
ncclResult_t ShmObject<T>::InitIfSemaphore(OpenTag<sem_t> tag)
{
size_t numMutexes = m_shmSize / sizeof(sem_t);
for (size_t i = 0; i < numMutexes; i++)
{
SYSCHECK(sem_init(static_cast<sem_t*>(&m_shmPtr[i]), 1, 1), "sem_init");
}
return ncclSuccess;
}
#endif
+6 -69
Просмотреть файл
@@ -8,6 +8,7 @@
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
#include "clique/AllReduceCliqueKernel.h" // [RCCL] AllReduce Clique-based kernel support
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
@@ -310,6 +311,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->coll.count;
if (loopSize > size) {
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
@@ -417,76 +419,10 @@ __device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->coll.nThreads;
const int bid = args->coll.bid;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*nranks*chunkSize;
const ssize_t size = args->coll.count;
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
int nelem;
int chunk;
// step 0: push data to next GPU
chunk = ring->devUserRanks[nranks-1];
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
chunk = ring->devUserRanks[nranks-j];
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
chunk = ring->devUserRanks[0];
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
chunk = ring->devUserRanks[nranks-j];
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvCopySend(thisOutput+offset, nelem);
}
// Make final copy from buffer to dest.
chunk = ring->devUserRanks[1];
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
LLprims.recv(thisOutput+offset, nelem);
}
// [RCCL] RingLL128 is re-purposed as clique-based kernel
LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, FUNC, T, args);
// [/RCCL]
}
template<int UNUSED, class FUNC, typename T>
@@ -507,6 +443,7 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
int nthreadsSplit = NCCL_LL128_SPLIT(nthreads);
const ssize_t size = args->coll.count;
if (loopSize > size) {
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
+34 -2
Просмотреть файл
@@ -89,13 +89,45 @@ static inline __device__ void exitIfAbortBarrier(int abort) {
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy)
// [RCCL] Adding clique-based kernels for AllReduce, in-place of unused RingLL28 kernels
#define NCCL_FUNC5B(coll, op, dtype) \
NCCL_COLL_NAME(coll##LL, op, dtype), \
NCCL_COLL_NAME(coll##LL128, op, dtype), \
NCCL_COLL_NAME(coll, op, dtype)
#define NCCL_FUNC4B(coll, op, dtype) \
NCCL_FUNC5(coll##Tree, op, dtype), \
NCCL_FUNC5B(coll##Ring, op, dtype), \
NCCL_FUNC5(coll##CollNet, op, dtype)
#define NCCL_FUNCS3C(coll, op) \
NCCL_FUNC4B(coll, op, i8), \
NCCL_FUNC4B(coll, op, u8), \
NCCL_FUNC4B(coll, op, i32), \
NCCL_FUNC4B(coll, op, u32), \
NCCL_FUNC4B(coll, op, i64), \
NCCL_FUNC4B(coll, op, u64), \
NCCL_FUNC4B(coll, op, f16), \
NCCL_FUNC4B(coll, op, f32), \
NCCL_FUNC4B(coll, op, f64), \
NCCL_FUNC4B(coll, op, b16)
#define NCCL_FUNCS2C(coll) \
NCCL_FUNCS3C(coll, sum ), \
NCCL_FUNCS3C(coll, prod), \
NCCL_FUNCS3C(coll, max ), \
NCCL_FUNCS3C(coll, min )
// [/RCCL]
// Must be consistent with ncclFunc_t
#define NCCL_FUNCS() { \
NCCL_FUNCS2B(ncclBroadcast), \
NCCL_FUNCS2A(ncclReduce), \
NCCL_FUNCS2B(ncclAllGather), \
NCCL_FUNCS2A(ncclReduceScatter), \
NCCL_FUNCS2A(ncclAllReduce), \
NCCL_FUNCS2C(ncclAllReduce), \
NCCL_COLL_NAME(ncclGather, copy, i8), \
NCCL_COLL_NAME(ncclScatter, copy, i8), \
NCCL_COLL_NAME(ncclAllToAll, copy, i8), \
@@ -114,7 +146,7 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
NCCL_FUNCS2A(ncclReduce),
NCCL_FUNCS2B(ncclAllGather),
NCCL_FUNCS2A(ncclReduceScatter),
NCCL_FUNCS2A(ncclAllReduce),
NCCL_FUNCS2C(ncclAllReduce),
NCCL_COLL_NAME(ncclGather, copy, i8),
NCCL_COLL_NAME(ncclScatter, copy, i8),
NCCL_COLL_NAME(ncclAllToAll, copy, i8),
+5
Просмотреть файл
@@ -350,9 +350,14 @@ __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(int32_t); }
__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
#endif
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
// Multiply UNROLL by 2 if single source/single destination
#define AUTOUNROLL (UNROLL*((MINSRCS==1 && MINDSTS==1) ? 2 : 1))
#else
// Try to limit consecutive load/stores to 8.
// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
#endif
template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
+47 -2
Просмотреть файл
@@ -9,6 +9,8 @@
#include "argcheck.h"
#include "coll_net.h"
#include "../graph/topo.h"
#include <hip/hip_runtime.h>
#include <hip/hip_ext.h>
// Only generate inline kernels for LL
#define NCCL_FUNC5(coll, op, dtype) \
@@ -116,6 +118,10 @@ ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
STORE(&channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active, 2);
}
{ // [RCCL] Wait for any clique-based collectives
NCCLCHECK(comm->cliqueManager->WaitForPointers());
} // [/RCCL]
// Find the first operation, choose the kernel accordingly and pass it
// as the first argument.
struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart;
@@ -210,7 +216,8 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
(comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
}
hipEvent_t startEvent;
hipEvent_t stopEvent;
if (comm->launchMode == ncclComm::PARALLEL) {
hipLaunchKernelGGL(((void (*)(struct ncclDevComm*))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclDevComm ***)(params->args)));
} else {
@@ -257,6 +264,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
info->algorithm = -1;
info->protocol = -1;
int nAlgos = NCCL_NUM_ALGORITHMS;
// Check collNet support
int collNetTypeSupport = 0;
if (info->comm->collNetSupport)
@@ -373,6 +381,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
#endif
return ncclSuccess;
}
// Set nstepsPerLoop and nchunksPerLoop
NCCLCHECK(getAlgoInfo(info));
NCCLCHECK(getPatternInfo(info));
@@ -391,6 +400,33 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
{ // [RCCL] Check for clique-based kernel support
if (info->comm->cliqueManager->IsSupported(info->coll,
info->count,
info->datatype,
info->op))
{
// Declare the input / output pointers being used (to exchange via IPC with other ranks)
NCCLCHECK(info->comm->cliqueManager->DeclarePointers(info->comm->opCount,
info->sendbuff,
info->recvbuff));
info->algorithm = NCCL_ALGO_RING;
info->protocol = NCCL_PROTO_CLIQUE;
// Determine the number of channels to use for clique-kernel
NCCLCHECK(info->comm->cliqueManager->GetNumChannelsToUse(info->coll,
info->count,
info->datatype,
info->op,
info->comm->nChannels,
&coll->args.clique.nChannels));
coll->args.clique.count = info->count;
coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
return ncclSuccess;
}
} // [RCCL]
int stepSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS;
int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->sliceSteps : 1;
@@ -478,6 +514,7 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
int nChannels = info->coll == ncclCollSendRecv ? 1 : coll.args.coll.nChannels;
int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1;
for (int bid=0; bid<nChannels*nSubChannels; bid++) {
@@ -519,8 +556,15 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
memcpy(c->args.a2av.extra+info->comm->nRanks*2, info->recvcounts, sizeof(size_t*)*(info->comm->nRanks));
memcpy(c->args.a2av.extra+info->comm->nRanks*3, info->rdispls, sizeof(size_t*)*(info->comm->nRanks));
c->args.a2av.bid = bid % coll.args.coll.nChannels;
} else if (info->coll != ncclCollSendRecv)
} else if (info->coll != ncclCollSendRecv) {
c->args.coll.bid = bid % coll.args.coll.nChannels;
}
// [RCCL] Setup pointers to where all the input/output pointers will be
if (info->protocol == NCCL_PROTO_CLIQUE) {
NCCLCHECK(info->comm->cliqueManager->SetCliqueCollectiveArgs(&c->args));
}
// [/RCCL]
STORE(&c->active, 1);
opIndex = (opIndex+1)%NCCL_MAX_OPS;
@@ -599,6 +643,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
} else {
NCCLCHECKGOTO(ncclSaveKernel(info), ret, end);
}
end:
if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev));
ncclAsyncErrCheck(ret);
+7 -6
Просмотреть файл
@@ -414,14 +414,15 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
// Check if we are close enough that it makes sense to enable GDR
int netGdrLevel = PATH_PXB;
#ifdef TOPO_EXPL
int arch, vendor, model;
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME)
netGdrLevel = PATH_PHB;
#endif
NCCLCHECK(ncclGetLevel(&ncclTopoUserGdrLevel, NULL, "NCCL_NET_GDR_LEVEL"));
if (ncclTopoUserGdrLevel != -2) netGdrLevel = ncclTopoUserGdrLevel;
else {
int arch, vendor, model;
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
if((system->nodes[GPU].nodes[g].id & 0xf0000) == (system->nodes[NET].nodes[n].net.busId & 0xf0000))
netGdrLevel = PATH_PHB;
}
int distance = gpu->paths[NET][n].type;
if (distance > netGdrLevel) {
INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
+50
Просмотреть файл
@@ -29,6 +29,7 @@ struct rcclRomeModel {
int nNics;
int nLinks;
int64_t gpuIds[MAX_ROME_GPUS];
int64_t nicIds[MAX_ROME_NICS];
int64_t gpuNuma[MAX_ROME_GPUS];
int64_t nicNuma[MAX_ROME_NICS];
int connMatrix[MAX_ROME_GPUS*MAX_ROME_GPUS];
@@ -39,6 +40,7 @@ struct rcclRomeModel {
static struct rcclRomeModel rome_model_22 = {
.nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 2,
.gpuIds = { 0x3000, 0x43000, 0x26000, 0xc3000, 0x83000, 0x23000, 0xc6000, 0xa3000, },
.nicIds = { 0xe1000, },
.gpuNuma = { 1, 0, 1, 2, 3, 1, 2, 3, },
.nicNuma = { 2, },
.connMatrix = { 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, },
@@ -49,6 +51,7 @@ static struct rcclRomeModel rome_model_22 = {
static struct rcclRomeModel rome_model_25 = {
.nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
.nicIds = { 0x61000, 0xa1000, },
.gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
.nicNuma = { 0, 3, },
.connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
@@ -59,6 +62,7 @@ static struct rcclRomeModel rome_model_25 = {
static struct rcclRomeModel rome_model_27 = {
.nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
.nicIds = { 0x61000, 0xa1000, },
.gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
.nicNuma = { 0, 3, },
.connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
@@ -69,6 +73,7 @@ static struct rcclRomeModel rome_model_27 = {
static struct rcclRomeModel rome_model_29 = {
.nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
.nicIds = { 0xe1000, },
.gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
.nicNuma = { 2, },
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
@@ -79,6 +84,7 @@ static struct rcclRomeModel rome_model_29 = {
static struct rcclRomeModel rome_model_31 = {
.nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
.nicIds = { 0x61000, 0xa1000, },
.gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
.nicNuma = { 0, 6, },
.connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
@@ -89,6 +95,7 @@ static struct rcclRomeModel rome_model_31 = {
static struct rcclRomeModel rome_model_33 = {
.nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
.nicIds = { 0x61000, 0xa1000, },
.gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
.nicNuma = { 0, 6, },
.connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
@@ -99,6 +106,7 @@ static struct rcclRomeModel rome_model_33 = {
static struct rcclRomeModel rome_model_30 = {
.nGpus = 8, .nCpus = 8, .nNics = 0, .nLinks = 2,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
.nicIds = { },
.gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
.nicNuma = { },
.connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
@@ -109,6 +117,7 @@ static struct rcclRomeModel rome_model_30 = {
static struct rcclRomeModel rome_model_32 = {
.nGpus = 8, .nCpus = 8, .nNics = 0, .nLinks = 2,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
.nicIds = { },
.gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
.nicNuma = { },
.connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
@@ -119,6 +128,7 @@ static struct rcclRomeModel rome_model_32 = {
static struct rcclRomeModel rome_model_24 = {
.nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
.nicIds = { },
.gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
.nicNuma = { },
.connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
@@ -129,6 +139,7 @@ static struct rcclRomeModel rome_model_24 = {
static struct rcclRomeModel rome_model_26 = {
.nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
.nicIds = { },
.gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
.nicNuma = { },
.connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
@@ -139,6 +150,7 @@ static struct rcclRomeModel rome_model_26 = {
static struct rcclRomeModel rome_model_23 = {
.nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
.nicIds = { },
.gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
.nicNuma = { },
.connMatrix = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, },
@@ -149,6 +161,7 @@ static struct rcclRomeModel rome_model_23 = {
static struct rcclRomeModel rome_model_38 = {
.nGpus = 8, .nCpus = 7, .nNics = 0, .nLinks = 2,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
.nicIds = { },
.gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, },
.nicNuma = { },
.connMatrix = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, },
@@ -159,6 +172,7 @@ static struct rcclRomeModel rome_model_38 = {
static struct rcclRomeModel rome_model_28 = {
.nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
.nicIds = { },
.gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
.nicNuma = { },
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
@@ -166,6 +180,39 @@ static struct rcclRomeModel rome_model_28 = {
.ringBase = "0 3 2 1 4 5 6 7|7 6 5 4 1 2 3 0|0 2 5 7 4 6 3 1|1 3 6 4 7 5 2 0",
};
static struct rcclRomeModel rome_model_40 = {
.nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
.nicIds = { 0xe1000, },
.gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
.nicNuma = { 2, },
.connMatrix = { 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, },
.pattern = "10302120",
.ringBase = "6 7 1 4 0 5 3 2|7 6 4 1 0 2 3 5",
};
static struct rcclRomeModel rome_model_42 = {
.nGpus = 8, .nCpus = 7, .nNics = 1, .nLinks = 3,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
.nicIds = { 0xe1000, },
.gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, },
.nicNuma = { 4, },
.connMatrix = { 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, },
.pattern = "00102010012010",
.ringBase = "7 4 6 1 3 0 2 5|6 4 7 1 3 2 5 0",
};
static struct rcclRomeModel rome_model_44 = {
.nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
.gpuIds = { 0x63000, 0x43000, 0x27000, 0x3000, 0xe3000, 0xc3000, 0xa3000, 0x83000, },
.nicIds = { 0xc4000, },
.gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
.nicNuma = { 2, },
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
.pattern = "20202120",
.ringBase = "5 4 7 6 2 1 3 0|5 6 7 4 1 0 2 3",
};
static struct rcclRomeModel romeTopoModels[] = {
rome_model_22,
rome_model_25,
@@ -180,4 +227,7 @@ static struct rcclRomeModel romeTopoModels[] = {
rome_model_23,
rome_model_38,
rome_model_28,
rome_model_40,
rome_model_42,
rome_model_44,
};
+9
Просмотреть файл
@@ -905,6 +905,7 @@ static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRo
}
if (j >= romeTopo->nNics) {
net_map[j] = i;
romeTopo->nicIds[romeTopo->nNics] = system->nodes[NET].nodes[i].net.busId;
(romeTopo->nNics)++;
if (romeTopo->nNics >= MAX_ROME_NICS) break;
}
@@ -941,6 +942,9 @@ static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRo
fprintf(file, " .gpuIds = { ");
for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "0x%lx, ", romeTopo->gpuIds[i]);
fprintf(file, "},\n");
fprintf(file, " .nicIds = { ");
for (int i = 0; i < romeTopo->nNics; i ++) fprintf(file, "0x%lx, ", romeTopo->nicIds[i]);
fprintf(file, "},\n");
fprintf(file, " .gpuNuma = { ");
for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "%ld, ", romeTopo->gpuNuma[i]);
fprintf(file, "},\n");
@@ -1038,13 +1042,18 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopo
}
char line[1024];
#ifdef ENABLE_TRACE
sprintf(line, "Found matching Rome model index %d in %.2fms (%d iter) with GPU mapping: ", i, t, time);
#else
sprintf(line, "Found matching Rome model index %d with GPU mapping: ", i);
#endif
int offset = strlen(line);
for (int k = 0; k < ngpus; k++) {
sprintf(line+offset, "%d ", g[k]);
offset = strlen(line);
}
INFO(NCCL_GRAPH, "%s", line);
// create 4P2H based on reference and remapped ids
NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, romeTopo.nNics, net_map));
return ncclSuccess;
+6 -5
Просмотреть файл
@@ -266,7 +266,7 @@ ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
return ncclSuccess;
}
ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic, int64_t busId) {
int dev;
NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));
@@ -286,6 +286,7 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s
if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0;
if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS;
if (ncclCollNet && xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0;
net->net.busId = busId;
ncclDebugNoWarn = 0;
NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width));
@@ -293,14 +294,14 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s
return ncclSuccess;
}
ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic, int64_t busId) {
for (int s=0; s<xmlNic->nSubs; s++) {
struct ncclXmlNode* xmlNet = xmlNic->subs[s];
if (strcmp(xmlNet->name, "net") != 0) continue;
int index;
NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index));
if (index == -1) continue;
NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic));
NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic, busId));
}
return ncclSuccess;
}
@@ -354,7 +355,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, busId));
node = nicNode; // Connect it to parent later on
}
NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode));
NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode, busId));
} else if (type == PCI) {
NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
for (int s=0; s<xmlPci->nSubs; s++) {
@@ -421,7 +422,7 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_WIDTH));
NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_WIDTH));
}
NCCLCHECK(ncclTopoAddNic(node, system, nic));
NCCLCHECK(ncclTopoAddNic(node, system, nic, 0));
}
}
return ncclSuccess;
+1
Просмотреть файл
@@ -100,6 +100,7 @@ struct ncclTopoNode {
int gdrSupport;
int collSupport;
int maxChannels;
int64_t busId;
}net;
struct {
int arch;
+1 -1
Просмотреть файл
@@ -12,7 +12,7 @@
ncclResult_t bootstrapNetInit();
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState, int* rootPid); // [RCCL] Adding rootPid
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
+9 -2
Просмотреть файл
@@ -10,6 +10,9 @@
#include "transport.h"
#include "p2p.h"
// [RCCL]
#include "clique/CliqueManager.h"
// [/RCCL]
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
@@ -143,8 +146,12 @@ struct ncclComm {
//list of async p2p operation queued in a group semantics
struct ncclP2Plist p2plist;
// RCCL AllToAll/Scatter/Gather API
bool alltoallDisable;
// [RCCL]
bool alltoallDisable; // RCCL AllToAll/Scatter/Gather API
CliqueManager* cliqueManager; // CliqueManager handles pointer collection / distribution for clique-based kernels
int rootPid; // Process ID of root
// [/RCCL]
};
#endif
+15
Просмотреть файл
@@ -12,6 +12,9 @@
#include "rccl_bfloat16.h"
#include "align.h"
#include <stdint.h>
// [RCCL] Support for clique-based kernels
#include "clique/CliqueCommon.h"
// [/RCCL]
// Convert volatile access to atomic
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
@@ -22,6 +25,7 @@
#define STORE(DST, SRC) *(DST) = (SRC)
#endif
#define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollGather, ncclCollScatter, ncclCollAllToAll, ncclCollAllToAllv, ncclCollSendRecv} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+4];
@@ -35,6 +39,7 @@ extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define NCCL_PROTO_LL 0
#define NCCL_PROTO_LL128 1
#define NCCL_PROTO_CLIQUE 1 // [RCCL] Clique takes up same protocol as unused LL128
#define NCCL_PROTO_SIMPLE 2
extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];
@@ -190,8 +195,18 @@ struct CollectiveArgs {
size_t count;
size_t* extra;
} a2av;
// [RCCL] Clique-based arguments
struct {
uint16_t nThreads;
uint8_t bid;
uint8_t nChannels;
size_t count;
cliqueDevicePtrs_t* ptrs;
} clique;
// [/RCCL]
};
};
struct ncclColl {
union {
struct {
+82 -22
Просмотреть файл
@@ -28,6 +28,10 @@
#include <unistd.h>
#include "graph/topo.h"
// [RCCL]
#include "clique/CliqueManager.h"
// [/RCCL]
#define STR2(v) #v
#define STR(v) STR2(v)
@@ -299,7 +303,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
return ncclSuccess;
}
RCCL_PARAM(AllToAllDisable, "ALLTOALL_KERNEL_DISABLE", 0);
RCCL_PARAM(AllToAllDisable, "ALLTOALL_KERNEL_DISABLE", 1);
RCCL_PARAM(ForceEnableClique, "FORCE_ENABLE_CLIQUE", 0);
static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
if (ndev < 1) {
@@ -678,7 +683,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int nranks = comm->nRanks;
uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
// [RCCL] Collect the PID of the root
int rootPid;
NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap, &rootPid));
// [/RCCL]
// AllGather1 - begin
struct {
@@ -996,36 +1004,84 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
connect->nsend[c] = 0;
}
}
// We should have allocated all buffers, collective fifos, ... we can
// restore the affinity.
affinity_restore:
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
if (ret != ncclSuccess) return ret;
// Compute intra ranks (using AllGather1 data)
int intraRank0 = -1, intraRank = -1, intraRanks = 0;
for (int i = 0; i < nranks; i++) {
if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
(allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
if (intraRanks == 0) intraRank0 = i;
if (i == rank) intraRank = intraRanks;
intraRanks++;
do {
int intraRank0 = -1, intraRank = -1, intraRanks = 0;
for (int i = 0; i < nranks; i++) {
if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
(allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
if (intraRanks == 0) intraRank0 = i;
if (i == rank) intraRank = intraRanks;
intraRanks++;
}
}
}
TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
return ncclInternalError;
}
NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));
if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
return ncclInternalError;
}
NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));
{ // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager if so
CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
if (comm->localRanks == comm->nRanks)
{
// Check that all the GPUs have peer access to one another
bool hasPeerAccess = true;
for (int i = 0; i < nranks && hasPeerAccess; i++)
{
int cudaDev1 = allGather1Data[i].peerInfo.cudaDev;
for (int j = 0; j < nranks; j++)
{
if (i == j) continue;
int cudaDev2 = allGather1Data[j].peerInfo.cudaDev;
int p2p;
if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess || !p2p)
{
hasPeerAccess = false;
break;
}
}
}
if (hasPeerAccess)
{
if (intraRanks == nranks)
cliqueMode = CliqueManager::CLIQUE_SINGLE_PROCESS;
else
cliqueMode = CliqueManager::CLIQUE_SINGLE_NODE;
}
// For now, only enable clique-based kernels on CR8_G topologies, unless explicitly asked
if (!rcclParamForceEnableClique())
{
// Disable clique-kernel support if not on CR8 topology
if (!(comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_CR8G))
{
INFO(NCCL_INIT, "Disabling clique-based kernels due to topology (force enable with RCCL_FORCE_ENABLE_CLIQUE)");
cliqueMode = CliqueManager::CLIQUE_DISABLED;
}
}
}
comm->cliqueManager = new CliqueManager(rank, nranks, cliqueMode);
NCCLCHECK(comm->cliqueManager->Init(commId, rootPid));
} // [/RCCL]
} while(0);
// Done with AllGather1 data
free(allGather1Data);
if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));
// We should have allocated all buffers, collective fifos, ... we can
// restore the affinity.
affinity_restore:
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
if (ret != ncclSuccess) return ret;
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
return ncclSuccess;
}
@@ -1144,6 +1200,10 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
return ncclInvalidArgument;
}
// [RCCL] Delete CliqueManager if it exists
if (comm->cliqueManager) delete comm->cliqueManager;
// [/RCCL]
return commDestroy(comm);
}
+2 -2
Просмотреть файл
@@ -73,10 +73,10 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
}
} else {
// Check CUDA device pointers
if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) {
if ((info->coll != ncclCollBroadcast && info->coll != ncclCollScatter) || info->comm->rank == info->root) {
NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
}
if (info->coll != ncclCollReduce || info->comm->rank == info->root) {
if ((info->coll != ncclCollReduce && info->coll != ncclCollGather) || info->comm->rank == info->root) {
NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
}
}
+10 -9
Просмотреть файл
@@ -418,23 +418,24 @@ namespace CorrectnessTests
switch (dataset.dataType)
{
case ncclInt8:
printf("Expected %d. Output %d on device %d[%d]\n", outputI1[j], expectedI1[j], i, j); break;
printf("Expected %d. Output %d on device %d[%d]\n", expectedI1[j], outputI1[j], i, j);
break;
case ncclUint8:
printf("Expected %u. Output %u on device %d[%d]\n", outputU1[j], expectedU1[j], i, j); break;
printf("Expected %u. Output %u on device %d[%d]\n", expectedU1[j], outputU1[j], i, j); break;
case ncclInt32:
printf("Expected %d. Output %d on device %d[%d]\n", outputI4[j], expectedI4[j], i, j); break;
printf("Expected %d. Output %d on device %d[%d]\n", expectedI4[j], outputI4[j], i, j); break;
case ncclUint32:
printf("Expected %u. Output %u on device %d[%d]\n", outputU4[j], expectedU4[j], i, j); break;
printf("Expected %u. Output %u on device %d[%d]\n", expectedU4[j], outputU4[j], i, j); break;
case ncclInt64:
printf("Expected %ld. Output %ld on device %d[%d]\n", outputI8[j], expectedI8[j], i, j); break;
printf("Expected %ld. Output %ld on device %d[%d]\n", expectedI8[j], outputI8[j], i, j); break;
case ncclUint64:
printf("Expected %lu. Output %lu on device %d[%d]\n", outputU8[j], expectedU8[j], i, j); break;
printf("Expected %lu. Output %lu on device %d[%d]\n", expectedU8[j], outputU8[j], i, j); break;
case ncclFloat32:
printf("Expected %f. Output %f on device %d[%d]\n", outputF4[j], expectedF4[j], i, j); break;
printf("Expected %f. Output %f on device %d[%d]\n", expectedF4[j], outputF4[j], i, j); break;
case ncclFloat64:
printf("Expected %lf. Output %lf on device %d[%d]\n", outputF8[j], expectedF8[j], i, j); break;
printf("Expected %lf. Output %lf on device %d[%d]\n", expectedF8[j], outputF8[j], i, j); break;
case ncclBfloat16:
printf("Expected %f. Output %f on device %d[%d]\n", (float)outputB2[j], (float)expectedB2[j], i, j); break;
printf("Expected %f. Output %f on device %d[%d]\n", (float)expectedB2[j], (float)outputB2[j], i, j); break;
default:
fprintf(stderr, "[ERROR] Unsupported datatype\n");
exit(0);
+1 -1
Просмотреть файл
@@ -58,6 +58,6 @@ namespace CorrectnessTests
testing::Values(2,3,4,5,6,7,8),
// In-place or not
testing::Values(false, true),
testing::Values("")),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
CorrectnessTest::PrintToStringParamName());
} // namespace
+1 -1
Просмотреть файл
@@ -96,6 +96,6 @@ namespace CorrectnessTests
testing::Values(2,3,4,5,6,7,8),
// In-place or not
testing::Values(false, true),
testing::Values("")),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
CorrectnessTest::PrintToStringParamName());
} // namespace
+1 -1
Просмотреть файл
@@ -116,6 +116,6 @@ namespace CorrectnessTests
testing::Values(2,3,4,5,6,7,8),
// In-place or not
testing::Values(false, true),
testing::Values("")),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
CorrectnessTest::PrintToStringParamName());
} // namespace
+257
Просмотреть файл
@@ -0,0 +1,257 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <sys/socket.h>
#include <ifaddrs.h>
#include <netdb.h>
#include <unistd.h>
#include <cstdio>
#include <string>
#include <chrono>
#include <hip/hip_runtime.h>
#include <rccl.h>
#include "HelloRccl.hpp"
void Usage(char *argv0);
void ExecuteTest(int numIntraRank, int intraRankStartId, int numTotalRanks, ncclComm_t* comm);
int main(int argc, char **argv)
{
if (getenv("NCCL_COMM_ID") && argc == 3) // Run in multi-process mode
{
int nranks = atoi(argv[1]);
int rank = atoi(argv[2]);
if (rank == 0) printf("Running in multi-process mode\n");
// Create communicator for this rank
ncclUniqueId commId;
NCCL_CALL(ncclGetUniqueId(&commId));
// Initialize communicator
ncclComm_t comm;
HIP_CALL(hipSetDevice(rank));
NCCL_CALL(ncclCommInitRank(&comm, nranks, commId, rank));
// Run the test
ExecuteTest(1, rank, nranks, &comm);
}
else if (argc == 2) // Run in single-process mode
{
printf("Running in single-process mode\n");
int nranks = atoi(argv[1]);
// Initialize communicators for each rank
ncclComm_t comm[nranks];
NCCL_CALL(ncclCommInitAll(comm, nranks, NULL));
// Run the test
ExecuteTest(nranks, 0, nranks, comm);
}
else
{
Usage(argv[0]);
return 1;
}
return 0;
}
void ExecuteTest(int numIntraRank, int intraRankStartId, int numTotalRanks, ncclComm_t* comm)
{
// Test configuration settings
int minPow = 10; // Starting power of 2 input size
int maxPow = 28; // Ending power of 2 input size
int numWarmups = 3; // Number of untimed warmup iterations
int numIterations = 10; // Number of timed iterations
// Allocate GPU resources for this process
hipStream_t stream[numIntraRank];
hipEvent_t startEvent[numIntraRank];
hipEvent_t stopEvent[numIntraRank];
for (int i = 0; i < numIntraRank; i++)
{
HIP_CALL(hipSetDevice(intraRankStartId + i));
HIP_CALL(hipStreamCreate(&stream[i]));
HIP_CALL(hipEventCreate(&startEvent[i]));
HIP_CALL(hipEventCreate(&stopEvent[i]));
}
if (intraRankStartId == 0)
{
printf("AllReduce Performance (sum of floats):\n");
printf("%10s %10s %10s\n", "Bytes", "CpuTime(ms)", "GpuTime(ms)");
}
// Loop over power-of-two input sizes
for (int power = minPow; power <= maxPow; power++)
{
int N = 1 << power;
// Allocate GPU memory
float *iputGpu[numIntraRank], *oputGpu[numIntraRank];
for (int r = 0; r < numIntraRank; r++)
{
HIP_CALL(hipSetDevice(intraRankStartId + r));
HIP_CALL(hipMalloc((void **)&iputGpu[r], N * sizeof(float)));
HIP_CALL(hipMalloc((void **)&oputGpu[r], N * sizeof(float)));
}
// Allocate CPU memory for input/output
float *iputCpu = (float *)malloc(N * sizeof(float));
float *oputCpu = (float *)malloc(N * sizeof(float));
// Fill CPU memory with a simple pattern
for (int i = 0; i < N; i++)
{
iputCpu[i] = 1.0f;
oputCpu[i] = 0.0f;
}
// Copy the input from CPU memory to GPU memory
for (int r = 0; r < numIntraRank; r++)
{
HIP_CALL(hipSetDevice(intraRankStartId + r));
HIP_CALL(hipMemcpy(iputGpu[r], iputCpu, N * sizeof(float), hipMemcpyHostToDevice));
}
// Perform some untimed initial warmup iterations
for (int iteration = 0; iteration < numWarmups; iteration++)
{
NCCL_CALL(ncclGroupStart());
for (int r = 0; r < numIntraRank; r++)
{
HIP_CALL(hipSetDevice(intraRankStartId + r));
NCCL_CALL(ncclAllReduce(iputGpu[r], oputGpu[r], N, ncclFloat, ncclSum, comm[r], stream[r]));
}
NCCL_CALL(ncclGroupEnd());
}
for (int r = 0; r < numIntraRank; r++)
HIP_CALL(hipStreamSynchronize(stream[r]));
// Perform timed iterations
auto cpuStart = std::chrono::high_resolution_clock::now();
for (int r = 0; r < numIntraRank; r++)
HIP_CALL(hipEventRecord(startEvent[r], stream[r]));
for (int iteration = 0; iteration < numIterations; iteration++)
{
NCCL_CALL(ncclGroupStart());
for (int r = 0; r < numIntraRank; r++)
{
HIP_CALL(hipSetDevice(intraRankStartId + r));
NCCL_CALL(ncclAllReduce(iputGpu[r], oputGpu[r], N, ncclFloat, ncclSum, comm[r], stream[r]));
}
NCCL_CALL(ncclGroupEnd());
}
for (int r = 0; r < numIntraRank; r++)
HIP_CALL(hipEventRecord(stopEvent[r], stream[r]));
for (int r = 0; r < numIntraRank; r++)
HIP_CALL(hipStreamSynchronize(stream[r]));
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(cpuDelta).count();
float totalGpuTime;
HIP_CALL(hipEventElapsedTime(&totalGpuTime, startEvent[0], stopEvent[0]));
if (intraRankStartId == 0) printf("%10lu %10.3f %10.3f\n", N * sizeof(float), (totalCpuTime / numIterations), (totalGpuTime / numIterations));
// Validate results
for (int r = 0; r < numIntraRank; r++)
{
HIP_CALL(hipMemcpy(oputCpu, oputGpu[r], N * sizeof(float), hipMemcpyDeviceToHost));
bool isOK = true;
int expected = numTotalRanks;
for (int i = 0; i < N; i++)
{
isOK &= (oputCpu[i] == expected);
}
if (!isOK)
{
printf("[ERROR] Rank %d Incorrect results for N = %d\n", intraRankStartId + r, N);
NCCL_CALL(ncclCommDestroy(comm[r]));
exit(1);
}
}
// Release GPU resources
for (int r = 0; r < numIntraRank; r++)
{
HIP_CALL(hipFree(oputGpu[r]));
HIP_CALL(hipFree(iputGpu[r]));
}
free(iputCpu);
free(oputCpu);
}
for (int r = 0; r < numIntraRank; r++)
{
HIP_CALL(hipStreamDestroy(stream[r]));
HIP_CALL(hipEventDestroy(startEvent[r]));
HIP_CALL(hipEventDestroy(stopEvent[r]));
NCCL_CALL(ncclCommDestroy(comm[r]));
}
}
void Usage(char *argv0)
{
printf("Single Process Usage: %s numRanks\n", argv0);
printf("\n");
printf("Multi Process Usage: %s numRanks rank\n", argv0);
printf(" - NCCL_COMM_ID must be set in order to use this\n\n");
printf(" - To use this process as the root process you may use any of the following:\n");
char hostname[256];
gethostname(hostname, 256);
printf(" export NCCL_COMM_ID=%s:12345\n", hostname);
// Loop over linked list of interfaces
struct ifaddrs *ifaddr;
getifaddrs(&ifaddr);
for (struct ifaddrs* ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next)
{
// Skip anything not based on IPv4 / IPv6
int family = ifa->ifa_addr->sa_family;
if (family != AF_INET && family != AF_INET6) continue;
// Skip iPv6 loopback interface
if (family == AF_INET6)
{
struct sockaddr_in6* sa = (struct sockaddr_in6*)(ifa->ifa_addr);
if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
}
socklen_t saLen = (family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6));
char host[NI_MAXHOST];
char service[NI_MAXSERV];
getnameinfo(ifa->ifa_addr, saLen, host, NI_MAXHOST, service, NI_MAXSERV,
NI_NUMERICHOST|NI_NUMERICSERV);
std::string result = std::string(host) + ":12345";
printf(" export NCCL_COMM_ID=%s\n", result.c_str());
}
freeifaddrs(ifaddr);
}
+49
Просмотреть файл
@@ -0,0 +1,49 @@
/*
Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HELLORCCL_HPP
#define HELLORCCL_HPP
#include <iostream>
#define HIP_CALL(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) \
{ \
std::cerr << "Encountered HIP error (" << hipGetErrorString(error) << ") at line " \
<< __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
#define NCCL_CALL(cmd) \
do { \
ncclResult_t error = (cmd); \
if (error != ncclSuccess) \
{ \
std::cerr << "Encountered NCCL error (" << ncclGetErrorString(error) << ") at line " \
<< __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
#endif
+21
Просмотреть файл
@@ -0,0 +1,21 @@
# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
# Set to where RCCL is installed
RCCL_INSTALL=../../build/release
HIP_PATH?= $(wildcard /opt/rocm/hip)
ifeq (,$(HIP_PATH))
HIP_PATH=../../..
endif
HIPCC=$(HIP_PATH)/bin/hipcc
EXE=HelloRccl
CXXFLAGS = -std=c++11 -O3 -I../../src/include -I$(RCCL_INSTALL) -L$(RCCL_INSTALL) -lrccl
all: $(EXE)
$(EXE): $(EXE).cpp $(shell find -regex ".*\.\hpp")
$(HIPCC) $(CXXFLAGS) $< -o $@
clean:
rm -f *.o $(EXE)
Исполняемый файл
+22
Просмотреть файл
@@ -0,0 +1,22 @@
#!/bin/bash
RCCL_INSTALL=../../build/release
EXE=$PWD/HelloRccl
LDPATH=$LD_LIBRARY_PATH:$RCCL_INSTALL
echo "Single process - With clique-based kernels:"
RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT=1073741824 RCCL_FORCE_ENABLE_CLIQUE=1 NCCL_DEBUG=INFO RCCL_ENABLE_CLIQUE=1 LD_LIBRARY_PATH=$LDPATH $EXE 4
echo "Single process - Without clique-based kernels:"
NCCL_DEBUG=INFO LD_LIBRARY_PATH=$LDPATH $EXE 4
echo "With clique-based kernels:"
RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT=1073741824 RCCL_FORCE_ENABLE_CLIQUE=1 NCCL_DEBUG=INFO RCCL_ENABLE_CLIQUE=1 NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 0 &
RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT=1073741824 RCCL_FORCE_ENABLE_CLIQUE=1 NCCL_DEBUG=INFO RCCL_ENABLE_CLIQUE=1 NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 1 &
RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT=1073741824 RCCL_FORCE_ENABLE_CLIQUE=1 NCCL_DEBUG=INFO RCCL_ENABLE_CLIQUE=1 NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 2 &
RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT=1073741824 RCCL_FORCE_ENABLE_CLIQUE=1 NCCL_DEBUG=INFO RCCL_ENABLE_CLIQUE=1 NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 3
echo "Without clique-based kernels:"
NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 0 &
NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 1 &
NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 2 &
NCCL_COMM_ID=$HOSTNAME:12345 LD_LIBRARY_PATH=$LDPATH $EXE 4 3
+1 -1
Просмотреть файл
@@ -6,7 +6,7 @@ endif
HIPCC=$(HIP_PATH)/bin/hipcc
EXE=TransferBench
CXXFLAGS = -O3 -fopenmp -I../../src/include -I.
CXXFLAGS = -O3 -I../../src/include -I.
all: $(EXE)
+279 -98
Просмотреть файл
@@ -26,7 +26,7 @@ THE SOFTWARE.
#include "TransferBench.hpp"
// Simple configuration parameters
size_t const DEFAULT_BYTES_PER_LINK = (1<<28);
size_t const DEFAULT_BYTES_PER_LINK = (1<<26);
int const DEFAULT_NUM_WARMUPS = 3;
int const DEFAULT_NUM_ITERATIONS = 10;
@@ -40,6 +40,27 @@ int main(int argc, char **argv)
exit(0);
}
// If a negative value is listed for N, generate a comprehensive config file for this node
if (argc > 2 && atoi(argv[2]) < 0)
{
GenerateConfigFile(argv[1], -1*atoi(argv[2]));
exit(0);
}
// Collect environment variables / display current run configuration
bool useHipCall = getenv("USE_HIP_CALL"); // Use hipMemcpy/hipMemset instead of custom shader kernels
bool useMemset = getenv("USE_MEMSET"); // Perform a memset instead of a copy (ignores source memory)
bool useFineGrainMem = getenv("USE_FINEGRAIN_MEM"); // Allocate fine-grained GPU memory instead of coarse-grained GPU memory
bool useSingleSync = getenv("USE_SINGLE_SYNC"); // Perform synchronization only once after all iterations instead of per iteration
bool useInteractive = getenv("USE_INTERACTIVE"); // Pause for user-input before starting transfer loop
bool useSleep = getenv("USE_SLEEP"); // Adds a 100ms sleep after each synchronization
bool reuseStreams = getenv("REUSE_STREAMS"); // Re-use streams instead of creating / destroying per test
bool showAddr = getenv("SHOW_ADDR"); // Print out memory addresses for each Link
bool outputToCsv = getenv("OUTPUT_TO_CSV"); // Output in CSV format
int byteOffset = getenv("BYTE_OFFSET") ? atoi(getenv("BYTE_OFFSET")) : 0; // Byte-offset for memory allocations
int numWarmups = getenv("NUM_WARMUPS") ? atoi(getenv("NUM_WARMUPS")) : DEFAULT_NUM_WARMUPS;
int numIterations = getenv("NUM_ITERATIONS") ? atoi(getenv("NUM_ITERATIONS")) : DEFAULT_NUM_ITERATIONS;
// Determine number of bytes to run per link
// If a non-zero number of bytes is specified, use it
// Otherwise generate array of bytes values to execute over
@@ -55,12 +76,10 @@ int main(int argc, char **argv)
if (numBytesPerLink != 0)
{
size_t N = numBytesPerLink / sizeof(float);
printf("Operating on %zu bytes per link (%zu floats)\n", numBytesPerLink, N);
valuesOfN.push_back(N);
}
else
{
printf("Operating on range of sizes\n");
for (int N = 256; N <= (1<<27); N *= 2)
{
int decimationFactor = 1; // This can be modified to increase number of samples between powers of two
@@ -74,19 +93,6 @@ int main(int argc, char **argv)
}
}
// Collect environment variables / display current run configuration
bool useHipCall = getenv("USE_HIP_CALL"); // Use hipMemcpy/hipMemset instead of custom shader kernels
bool useMemset = getenv("USE_MEMSET"); // Perform a memset instead of a copy (ignores source memory)
bool useFineGrainMem = getenv("USE_FINEGRAIN_MEM"); // Allocate fine-grained GPU memory instead of coarse-grained GPU memory
bool useSingleSync = getenv("USE_SINGLE_SYNC"); // Perform synchronization only once after all iterations instead of per iteration
bool useInteractive = getenv("USE_INTERACTIVE"); // Pause for user-input before starting transfer loop
bool useSleep = getenv("USE_SLEEP"); // Adds a 100ms sleep after each synchronization
bool reuseStreams = getenv("REUSE_STREAMS"); // Re-use streams instead of creating / destroying per test
bool showAddr = getenv("SHOW_ADDR"); // Print out memory addresses for each Link
int byteOffset = getenv("BYTE_OFFSET") ? atoi(getenv("BYTE_OFFSET")) : 0; // Byte-offset for memory allocations
int numWarmups = getenv("NUM_WARMUPS") ? atoi(getenv("NUM_WARMUPS")) : DEFAULT_NUM_WARMUPS;
int numIterations = getenv("NUM_ITERATIONS") ? atoi(getenv("NUM_ITERATIONS")) : DEFAULT_NUM_ITERATIONS;
if (byteOffset % 4)
{
printf("[ERROR] byteOffset must be a multiple of 4\n");
@@ -95,49 +101,55 @@ int main(int argc, char **argv)
int initOffset = byteOffset / sizeof(float);
char *env;
printf("Run configuration\n");
printf("=====================================================\n");
printf("%-20s %8s: Using %s\n",
"USE_HIP_CALL", useHipCall ? "(set)" : "(unset)",
useHipCall ? "HIP functions" : "custom kernels");
printf("%-20s %8s: Performing %s\n",
"USE_MEMSET", useMemset ? "(set)" : "(unset)",
useMemset ? "memset" : "memcopy");
if (useHipCall && !useMemset)
if (!outputToCsv)
{
env = getenv("HSA_ENABLE_SDMA");
printf("Run configuration\n");
printf("=====================================================\n");
printf("%-20s %8s: Using %s\n",
"USE_HIP_CALL", useHipCall ? "(set)" : "(unset)",
useHipCall ? "HIP functions" : "custom kernels");
printf("%-20s %8s: Performing %s\n",
"USE_MEMSET", useMemset ? "(set)" : "(unset)",
useMemset ? "memset" : "memcopy");
if (useHipCall && !useMemset)
{
env = getenv("HSA_ENABLE_SDMA");
printf("%-20s %8s: %s\n",
"HSA_ENABLE_SDMA", env ? env : "(unset)",
(env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines");
}
printf("%-20s %8s: GPU destination memory type: %s-grained\n",
"USE_FINEGRAIN_MEM", useFineGrainMem ? "(set)" : "(unset)",
useFineGrainMem ? "fine" : "coarse");
printf("%-20s %8s: %s\n",
"HSA_ENABLE_SDMA", env ? env : "(unset)",
(env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines");
"USE_SINGLE_SYNC", useSingleSync ? "(set)" : "(unset)",
useSingleSync ? "Synchronizing only once, after all iterations" : "Synchronizing per iteration");
printf("%-20s %8s: Running in %s mode\n",
"USE_INTERACTIVE", useInteractive ? "(set)" : "(unset)",
useInteractive ? "interactive" : "non-interactive");
printf("%-20s %8s: %s\n",
"USE_SLEEP", useSleep ? "(set)" : "(unset)",
useSleep ? "Add sleep after each sync" : "No sleep per sync");
printf("%-20s %8s: %s\n",
"REUSE_STREAMS", reuseStreams ? "(set)" : "(unset)",
reuseStreams ? "Re-using streams per topology" : "Creating/destroying streams per topology");
printf("%-20s %8s: %s\n",
"SHOW_ADDR", showAddr ? "(set)" : "(unset)",
showAddr ? "Displaying src/dst mem addresses" : "Not displaying src/dst mem addresses");
env = getenv("OUTPUT_TO_CSV");
printf("%-20s %8s: Output to csv\n",
"OUTPUT_TO_CSV", env ? env : "(unset)");
env = getenv("BYTE_OFFSET");
printf("%-20s %8s: Using byte offset of %d\n",
"BYTE_OFFSET", env ? env : "(unset)", byteOffset);
env = getenv("NUM_WARMUPS");
printf("%-20s %8s: Running %d warmup iteration(s) per topology\n",
"NUM_WARMUPS", env ? env : "(unset)", numWarmups);
env = getenv("NUM_ITERATIONS");
printf("%-20s %8s: Running %d timed iteration(s) per topology\n",
"NUM_ITERATIONS", env ? env : "(unset)", numIterations);
printf("\n");
}
printf("%-20s %8s: GPU destination memory type: %s-grained\n",
"USE_FINEGRAIN_MEM", useFineGrainMem ? "(set)" : "(unset)",
useFineGrainMem ? "fine" : "coarse");
printf("%-20s %8s: %s\n",
"USE_SINGLE_SYNC", useSingleSync ? "(set)" : "(unset)",
useSingleSync ? "Synchronizing only once, after all iterations" : "Synchronizing per iteration");
printf("%-20s %8s: Running in %s mode\n",
"USE_INTERACTIVE", useInteractive ? "(set)" : "(unset)",
useInteractive ? "interactive" : "non-interactive");
printf("%-20s %8s: %s\n",
"USE_SLEEP", useSleep ? "(set)" : "(unset)",
useSleep ? "Add sleep after each sync" : "No sleep per sync");
printf("%-20s %8s: %s\n",
"REUSE_STREAMS", reuseStreams ? "(set)" : "(unset)",
reuseStreams ? "Re-using streams per topology" : "Creating/destroying streams per topology");
printf("%-20s %8s: %s\n",
"SHOW_ADDR", showAddr ? "(set)" : "(unset)",
showAddr ? "Displaying src/dst mem addresses" : "Not displaying src/dst mem addresses");
env = getenv("BYTE_OFFSET");
printf("%-20s %8s: Using byte offset of %d\n",
"BYTE_OFFSET", env ? env : "(unset)", byteOffset);
env = getenv("NUM_WARMUPS");
printf("%-20s %8s: Running %d warmup iteration(s) per topology\n",
"NUM_WARMUPS", env ? env : "(unset)", numWarmups);
env = getenv("NUM_ITERATIONS");
printf("%-20s %8s: Running %d timed iteration(s) per topology\n",
"NUM_ITERATIONS", env ? env : "(unset)", numIterations);
printf("\n");
// Collect the number of available CPUs/GPUs on this machine
int numGpuDevices;
@@ -160,8 +172,14 @@ int main(int argc, char **argv)
std::map<std::pair<int, int>, int> linkMap;
std::vector<std::vector<hipStream_t>> streamCache(numGpuDevices);
// Print CSV header
if (outputToCsv)
{
printf("Test,NumBytes,ExeGpu,SrcMem,DstMem,BW(GB/s),Time(ms),LinkDesc,SrcAddr,DstAddr,numWarmups,numIters,useHipCall,useMemSet,useFineGrain,useSingleSync,resuseStreams\n");
}
// Loop over each line in the configuration file
int lineNum = 0;
int testNum = 0;
char line[2048];
while(fgets(line, 2048, fp))
{
@@ -171,12 +189,12 @@ int main(int argc, char **argv)
int const numLinks = links.size();
if (numLinks == 0) continue;
lineNum++;
testNum++;
// Loop over all the different number of bytes to use per Link
for (auto N : valuesOfN)
{
printf("Test %d: [%lu bytes]\n", lineNum, N * sizeof(float));
if (!outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float));
float* linkSrcMem[numLinks]; // Source memory per Link
float* linkDstMem[numLinks]; // Destination memory per Link
hipStream_t streams[numLinks]; // hipStream to use per Link
@@ -191,7 +209,6 @@ int main(int argc, char **argv)
for (int i = 0; i < numGpuDevices; i++)
linkCount[i] = 0;
char name[MAX_NAME_LEN+1] = {}; // Used to describe the set of Links
for (int i = 0; i < numLinks; i++)
{
MemType srcMemType = links[i].srcMemType;
@@ -206,12 +223,10 @@ int main(int argc, char **argv)
(dstIndex < 0 || dstIndex >= numGpuDevices) ||
(exeIndex < 0 || exeIndex >= numGpuDevices))
{
printf("[ERROR] Invalid link %d:(%c%d->%c%d). Total devices: %d\n",
exeIndex, MemTypeStr[srcMemType], srcIndex, MemTypeStr[dstMemType], dstIndex, numGpuDevices);
printf("[ERROR] Invalid link %d:(%c%d->%c%d) GPU index must be between 0 and %d inclusively\n",
exeIndex, MemTypeStr[srcMemType], srcIndex, MemTypeStr[dstMemType], dstIndex, numGpuDevices-1);
exit(1);
}
snprintf(name + strlen(name), MAX_NAME_LEN, "%d:(%c%d->%c%d:%d)",
exeIndex, MemTypeStr[srcMemType], srcIndex, MemTypeStr[dstMemType], dstIndex, blocksToUse);
// Enable peer-to-peer access if this is the first time seeing this pair
if (srcMemType == MEM_GPU && dstMemType == MEM_GPU)
@@ -304,8 +319,7 @@ int main(int argc, char **argv)
// Start CPU timing for this iteration
auto cpuStart = std::chrono::high_resolution_clock::now();
// Run all links in parallel (one thread per link)
#pragma omp parallel for num_threads(numLinks)
// Enqueue all links
for (int i = 0; i < numLinks; i++)
{
HIP_CALL(hipSetDevice(links[i].exeIndex));
@@ -331,17 +345,13 @@ int main(int argc, char **argv)
}
else
{
// Record start event
//if (recordStart) HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
hipExtLaunchKernelGGL(useMemset ? MemsetKernel : CopyKernel,
dim3(links[i].numBlocksToUse, 1, 1),
dim3(BLOCKSIZE, 1, 1),
0, streams[i],
recordStart ? startEvents[i] : dummyEvents[i],
recordStop ? stopEvents[i] : dummyEvents[i],
recordStart ? startEvents[i] : NULL,
recordStop ? stopEvents[i] : NULL,
0, gpuBlockParams[i]);
// Record stop event
//if (recordStop) HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
}
}
@@ -393,19 +403,57 @@ int main(int argc, char **argv)
CheckOrFill(MODE_CHECK, N, useMemset, useHipCall, linkDstMem[i] + initOffset);
// Report timings
totalCpuTime = totalCpuTime / (1.0 * numIterations) * 1000;
double totalBandwidthGbs = 0.0;
for (int i = 0; i < numLinks; i++)
{
double linkDurationMsec = totalGpuTime[i] / (1.0 * numIterations);
double linkBandwidthGbs = (N * sizeof(float) / 1.0E9) / linkDurationMsec * 1000.0f;
printf(" Link %02d: %c%02d -> [GPU %02d:%02d] -> %c%02d | %9.3f GB/s | %8.3f ms |",
i + 1,
MemTypeStr[links[i].srcMemType], links[i].srcIndex,
links[i].exeIndex, links[i].numBlocksToUse,
MemTypeStr[links[i].dstMemType], links[i].dstIndex,
linkBandwidthGbs, linkDurationMsec);
if (showAddr) printf(" %16p | %16p |", linkSrcMem[i] + initOffset, linkDstMem[i] + initOffset);
printf("\n");
totalBandwidthGbs += linkBandwidthGbs;
if (!outputToCsv)
{
printf(" Link %02d: %c%02d -> [GPU %02d:%02d] -> %c%02d | %9.3f GB/s | %8.3f ms | %9s |",
i + 1,
MemTypeStr[links[i].srcMemType], links[i].srcIndex,
links[i].exeIndex, links[i].numBlocksToUse,
MemTypeStr[links[i].dstMemType], links[i].dstIndex,
linkBandwidthGbs, linkDurationMsec,
GetLinkDesc(links[i]).c_str());
if (showAddr) printf(" %16p | %16p |", linkSrcMem[i] + initOffset, linkDstMem[i] + initOffset);
printf("\n");
}
else
{
printf("%d,%lu,%02d,%c%02d,%c%02d,%9.3f,%8.3f,%s,%p,%p,%d,%d,%s,%s,%s,%s,%s\n",
testNum, N * sizeof(float), links[i].exeIndex,
MemTypeStr[links[i].srcMemType], links[i].srcIndex,
MemTypeStr[links[i].dstMemType], links[i].dstIndex,
linkBandwidthGbs, linkDurationMsec,
GetLinkDesc(links[i]).c_str(),
linkSrcMem[i] + initOffset, linkDstMem[i] + initOffset,
numWarmups, numIterations,
useHipCall ? "true" : "false",
useMemset ? "true" : "false",
useFineGrainMem ? "true" : "false",
useSingleSync ? "true" : "false",
reuseStreams ? "true" : "false");
}
}
// Display aggregate statistics
if (!outputToCsv)
{
printf(" Aggregate Bandwidth | %9.3f GB/s | %8.3f ms |\n", totalBandwidthGbs, totalCpuTime);
}
else
{
printf("%d,%lu,ALL,ALL,ALL,%9.3f,%8.3f,ALL,ALL,ALL,%d,%d,%s,%s,%s,%s,%s\n",
testNum, N * sizeof(float), totalBandwidthGbs, totalCpuTime, numWarmups, numIterations,
useHipCall ? "true" : "false",
useMemset ? "true" : "false",
useFineGrainMem ? "true" : "false",
useSingleSync ? "true" : "false",
reuseStreams ? "true" : "false");
}
// Release GPU memory
@@ -431,23 +479,6 @@ int main(int argc, char **argv)
HIP_CALL(hipStreamDestroy(stream));
}
// Print link information
printf("Link topology:\n");
uint32_t linkType;
uint32_t hopCount;
for (auto mapPair : linkMap)
{
int src = mapPair.first.first;
int dst = mapPair.first.second;
HIP_CALL(hipExtGetLinkTypeAndHopCount(src, dst, &linkType, &hopCount));
printf("%d -> %d: %s [%d hop(s)]\n", src, dst,
linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? "HYPERTRANSPORT" :
linkType == HSA_AMD_LINK_INFO_TYPE_QPI ? "QPI" :
linkType == HSA_AMD_LINK_INFO_TYPE_PCIE ? "PCIE" :
linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND ? "INFINIBAND" :
linkType == HSA_AMD_LINK_INFO_TYPE_XGMI ? "XGMI" : "UNKNOWN",
hopCount);
}
return 0;
}
@@ -459,6 +490,7 @@ void DisplayUsage(char const* cmdName)
printf(" N : (Optional) Number of bytes to transfer per link.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 128 bytes\n", DEFAULT_BYTES_PER_LINK);
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
printf(" If a negative number is specified, a configFile gets generated with this number as default number of CUs per link\n");
printf("\n");
printf("Configfile Format:\n");
printf("==================\n");
@@ -508,11 +540,115 @@ void DisplayUsage(char const* cmdName)
printf(" USE_SLEEP - Adds a 100ms sleep after each synchronization\n");
printf(" REUSE_STREAMS - Re-use streams instead of creating / destroying per test\n");
printf(" SHOW_ADDR - Print out memory addresses for each Link\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
}
void GenerateConfigFile(char const* cfgFile, int numBlocks)
{
// Detect number of available GPUs and skip if less than 2
int numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
printf("Generated configFile %s for %d device(s) / %d CUs per link\n", cfgFile, numGpuDevices, numBlocks);
if (numGpuDevices < 2)
{
printf("Skipping. (Less than 2 GPUs detected)\n");
exit(0);
}
// Open config file for writing
FILE* fp = fopen(cfgFile, "w");
if (!fp)
{
printf("Unable to open [%s] for writing\n", cfgFile);
exit(1);
}
// CU testing
fprintf(fp, "# CU scaling tests\n");
for (int i = 1; i < 16; i++)
fprintf(fp, "1 %d (0 G0 G1)\n", i);
fprintf(fp, "\n");
// Pinned memory testing
fprintf(fp, "# Pinned CPU memory read tests\n");
for (int i = 0; i < numGpuDevices; i++)
fprintf(fp, "1 %d (%d C%d G%d)\n", numBlocks, i, i, i);
fprintf(fp, "\n");
fprintf(fp, "# Pinned CPU memory write tests\n");
for (int i = 0; i < numGpuDevices; i++)
fprintf(fp, "1 %d (%d G%d C%d)\n", numBlocks, i, i, i);
fprintf(fp, "\n");
// Single link testing GPU testing
fprintf(fp, "# Unidirectional link GPU tests\n");
for (int i = 0; i < numGpuDevices; i++)
for (int j = 0; j < numGpuDevices; j++)
{
if (i == j) continue;
fprintf(fp, "1 %d (%d G%d G%d)\n", numBlocks, i, i, j);
}
fprintf(fp, "\n");
// Bi-directional link testing
fprintf(fp, "# Bi-directional link tests\n");
for (int i = 0; i < numGpuDevices; i++)
for (int j = 0; j < numGpuDevices; j++)
{
if (i == j) continue;
fprintf(fp, "2 %d (%d G%d G%d) (%d G%d G%d)\n", numBlocks, i, i, j, j, j, i);
}
fprintf(fp, "\n");
// Simple uni-directional ring
fprintf(fp, "# Simple unidirectional ring\n");
fprintf(fp, "%d %d", numGpuDevices, numBlocks);
for (int i = 0; i < numGpuDevices; i++)
{
fprintf(fp, " (%d G%d G%d)", i, i, (i+1)%numGpuDevices);
}
fprintf(fp, "\n\n");
// Simple bi-directional ring
fprintf(fp, "# Simple bi-directional ring\n");
fprintf(fp, "%d %d", numGpuDevices * 2, numBlocks);
for (int i = 0; i < numGpuDevices; i++)
fprintf(fp, " (%d G%d G%d)", i, i, (i+1)%numGpuDevices);
for (int i = 0; i < numGpuDevices; i++)
fprintf(fp, " (%d G%d G%d)", i, i, (i+numGpuDevices-1)%numGpuDevices);
fprintf(fp, "\n\n");
// Broadcast from GPU 0
fprintf(fp, "# GPU 0 Broadcast\n");
fprintf(fp, "%d %d", numGpuDevices-1, numBlocks);
for (int i = 1; i < numGpuDevices; i++)
fprintf(fp, " (%d G%d G%d)", 0, 0, i);
fprintf(fp, "\n\n");
// Gather to GPU 0
fprintf(fp, "# GPU 0 Gather\n");
fprintf(fp, "%d %d", numGpuDevices-1, numBlocks);
for (int i = 1; i < numGpuDevices; i++)
fprintf(fp, " (%d G%d G%d)", 0, i, 0);
fprintf(fp, "\n\n");
// Full stress test
fprintf(fp, "# Full stress test\n");
fprintf(fp, "%d %d", numGpuDevices * (numGpuDevices-1), numBlocks);
for (int i = 0; i < numGpuDevices; i++)
for (int j = 0; j < numGpuDevices; j++)
{
if (i == j) continue;
fprintf(fp, " (%d G%d G%d)", i, i, j);
}
fprintf(fp, "\n\n");
fclose(fp);
}
void DisplayTopology()
{
printf("\nDetected topology:\n");
@@ -700,3 +836,48 @@ void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, float* ptr
free(refBuffer);
}
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount)
{
char result[10];
switch (linkType)
{
case HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT: sprintf(result, " HT-%d", hopCount); break;
case HSA_AMD_LINK_INFO_TYPE_QPI : sprintf(result, " QPI-%d", hopCount); break;
case HSA_AMD_LINK_INFO_TYPE_PCIE : sprintf(result, "PCIE-%d", hopCount); break;
case HSA_AMD_LINK_INFO_TYPE_INFINBAND : sprintf(result, "INFB-%d", hopCount); break;
case HSA_AMD_LINK_INFO_TYPE_XGMI : sprintf(result, "XGMI-%d", hopCount); break;
default: sprintf(result, "??????");
}
return result;
}
std::string GetLinkDesc(Link const& link)
{
std::string result = "";
// Currently only describe links between src/dst on GPU
if (link.srcMemType == MEM_GPU && link.dstMemType == MEM_GPU)
{
if (link.exeIndex != link.srcIndex)
{
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(link.srcIndex, link.exeIndex, &linkType, &hopCount));
result += GetLinkTypeDesc(linkType, hopCount);
}
if (link.exeIndex != link.dstIndex)
{
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(link.exeIndex, link.dstIndex, &linkType, &hopCount));
if (result != "") result += "+";
result += GetLinkTypeDesc(linkType, hopCount);
}
}
else
{
result = "???";
}
return result;
}
+7 -3
Просмотреть файл
@@ -81,12 +81,16 @@ struct BlockParam
float* dst;
};
void DisplayUsage(char const* cmdName); // Display usage instructions
void DisplayTopology(); // Display GPU topology
void ParseLinks(char* line, std::vector<Link>& links); // Parse Link information
void DisplayUsage(char const* cmdName); // Display usage instructions
void GenerateConfigFile(char const* cfgFile, int numBlocks); // Generate a sample config file
void DisplayTopology(); // Display GPU topology
void ParseLinks(char* line, std::vector<Link>& links); // Parse Link information
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, bool useFineGrainMem, float** memPtr);
void DeallocateMemory(MemType memType, int devIndex, float* memPtr);
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, float* ptr);
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
std::string GetLinkDesc(Link const& link);
#define MAX_NAME_LEN 64
#define BLOCKSIZE 256
+1 -1
Просмотреть файл
@@ -21,7 +21,7 @@
DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
for i in {0..38}
for i in {0..44}
do
$DIR/../topo_expl/topo_expl -m $i > "topo_m$i.log"
$DIR/../TopoVisual/topo_visual.sh -i "topo_m$i.log"
+87
Просмотреть файл
@@ -0,0 +1,87 @@
<system version="2">
<cpu numaid="0" affinity="00000000,00000000,00000000,ffffffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="0" sm="90" gcn="908" arch="38911" rank="0" gdr="1">
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="1" affinity="00000000,00000000,ffffffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:21:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="1" sm="90" gcn="908" arch="38911" rank="1" gdr="1">
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:24:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="2" sm="90" gcn="908" arch="38911" rank="2" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="3" sm="90" gcn="908" arch="38911" rank="3" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,ffffffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="4" sm="90" gcn="908" arch="38911" rank="4" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c6:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="5" sm="90" gcn="908" arch="38911" rank="5" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:e1:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" guid="0x70cd600003da341c" maxconn="262144" gdr="1"/>
</nic>
</pci>
</cpu>
<cpu numaid="3" affinity="ffffffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:a1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:a3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="6" sm="90" gcn="908" arch="38911" rank="6" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="7" sm="90" gcn="908" arch="38911" rank="7" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
</system>
+87
Просмотреть файл
@@ -0,0 +1,87 @@
<system version="2">
<cpu numaid="0" affinity="00000000,00000000,00ffffff,00000000,00000000,00ffffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:61:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:63:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="0" sm="90" gcn="908" arch="38911" rank="0" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:27:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="1" sm="90" gcn="908" arch="38911" rank="1" gdr="1">
<xgmi target="0000:63:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:27:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="1" affinity="00000000,0000ffff,ff000000,00000000,0000ffff,ff000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:25:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:27:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="2" sm="90" gcn="908" arch="38911" rank="2" gdr="1">
<xgmi target="0000:63:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="3" sm="90" gcn="908" arch="38911" rank="3" gdr="1">
<xgmi target="0000:63:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:27:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="000000ff,ffff0000,00000000,000000ff,ffff0000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:e1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:e3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="4" sm="90" gcn="908" arch="38911" rank="4" gdr="1">
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="5" sm="90" gcn="908" arch="38911" rank="5" gdr="1">
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c4:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" guid="0x22fd9f00039b0398" maxconn="262144" gdr="1"/>
</nic>
</pci>
</cpu>
<cpu numaid="3" affinity="ffffff00,00000000,00000000,ffffff00,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:a1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:a3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="6" sm="90" gcn="908" arch="38911" rank="6" gdr="1">
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="7" sm="90" gcn="908" arch="38911" rank="7" gdr="1">
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
</system>
+93
Просмотреть файл
@@ -0,0 +1,93 @@
<system version="2">
<cpu numaid="1" affinity="00000000,00000000,00000000,ffff0000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="0" sm="90" gcn="908" arch="38911" rank="0" gdr="1">
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:21:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="1" sm="90" gcn="908" arch="38911" rank="1" gdr="1">
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:24:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="2" sm="90" gcn="908" arch="38911" rank="2" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="3" sm="90" gcn="908" arch="38911" rank="3" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="5" affinity="00000000,ffff0000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="4" sm="90" gcn="908" arch="38911" rank="4" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c6:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="5" sm="90" gcn="908" arch="38911" rank="5" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="6" affinity="0000ffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:a1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:a3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="6" sm="90" gcn="908" arch="38911" rank="6" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="7" sm="90" gcn="908" arch="38911" rank="7" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:e1:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" guid="0x70cd600003da341c" maxconn="262144" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
+6
Просмотреть файл
@@ -108,6 +108,12 @@ NodeModelDesc model_descs[] = {
{4, "topo_8p_rome_n2_2.xml", "4 nodes 8 VEGA20 Rome NPS=2 Alt. Model 2 NET/IF"},
{4, "topo_8p_ts1_n4_2.xml", "4 nodes 8 VEGA20 TS1 NPS=4 3 NET/IF"},
{1, "topo_8p_rome_n4.xml", "single node 8 VEGA20 Rome NPS=4"},
{1, "topo_4p3l_n2.xml", "single node 8 gfx908 Rome"},
{4, "topo_4p3l_n2.xml", "4 nodes 8 gfx908 Rome"},
{1, "topo_4p3l_n4.xml", "single node 8 gfx908 Rome NPS=4"},
{4, "topo_4p3l_n4.xml", "4 nodes 8 gfx908 Rome NPS=4"},
{1, "topo_4p3l_n2_1.xml", "single node 8 gfx908 Rome"},
{4, "topo_4p3l_n2_1.xml", "4 nodes 8 gfx908 Rome"},
};
int main(int argc,char* argv[])