[RAS] Add support for RAS client (#1748)
Enable RAS client binary `rcclras`
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
75d22b47cb
Коммит
8d3a5542fb
@@ -1231,6 +1231,9 @@ License: See LICENSE.txt for license information\n")
|
||||
Optimized primitives for collective multi-GPU communication")
|
||||
endif()
|
||||
|
||||
## Building RCCL RAS
|
||||
include(cmake/rcclRAS.cmake)
|
||||
|
||||
if(BUILD_TESTS)
|
||||
rocm_package_setup_component(clients)
|
||||
rocm_package_setup_client_component(tests PACKAGE_NAME unittests)
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
|
||||
# Minimum CMake version this module is written against.
cmake_minimum_required(VERSION 3.16)

# Progress note for the configure log. STATUS is used so the line is reported
# as ordinary configure output instead of an unclassified notice on stderr.
message(STATUS "Building rccl RAS client executable")

# Directories that subsequent find_*() commands must skip: the lib/ and
# include/ directories directly under ROCM_PATH.
set(CMAKE_IGNORE_PATH "${ROCM_PATH}/lib" "${ROCM_PATH}/include")
|
||||
|
||||
# RAS client executable, built from the client source located under the
# build tree's hipify/ directory.
add_executable(rcclras
  "${PROJECT_BINARY_DIR}/hipify/src/ras/client.cc"
)

# Header search paths used only when compiling rcclras itself; the order
# matches the order in which the directories were originally added.
target_include_directories(rcclras
  PRIVATE
    ${PROJECT_BINARY_DIR}/include
    ${HIPIFY_DIR}/src
    ${HIPIFY_DIR}/src/include
)
|
||||
|
||||
# Link the RAS client against RCCL. The mechanism depends on whether the
# build produces shared libraries (BUILD_SHARED_LIBS).
if(BUILD_SHARED_LIBS)
  # Shared build: link the in-tree rccl target and record the install-tree
  # lib/ directory as the run-time search path of the installed binary.
  target_link_libraries(rcclras PRIVATE rccl)
  set_property(TARGET rcclras PROPERTY INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
else()
  # Static build: rccl is named through raw -l/-L linker flags, so CMake does
  # not derive a build-order edge from the link line. The dependency must be
  # attached to rcclras itself; attaching it to rccl-UnitTests left rcclras
  # unordered and breaks configuration when that test target is not defined.
  add_dependencies(rcclras rccl)
  target_link_libraries(rcclras PRIVATE dl rt -lrccl -L${CMAKE_BINARY_DIR} -lamdhip64 -L${ROCM_PATH}/lib)
endif()
|
||||
|
||||
# Run-time library search path embedded in the build-tree binary: the
# top-level build directory.
set_property(TARGET rcclras PROPERTY BUILD_RPATH "${CMAKE_BINARY_DIR}")

# Register the executable for installation through the ROCm CMake helper.
rocm_install(TARGETS rcclras)
|
||||
@@ -84,7 +84,7 @@ static void parseArgs(int argc, char** argv) {
|
||||
printUsage(argv[0]);
|
||||
exit(0);
|
||||
case 'r':
|
||||
fprintf(stderr, "NCCL RAS client version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "."
|
||||
fprintf(stderr, "RCCL RAS client version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "."
|
||||
STR(NCCL_PATCH) NCCL_SUFFIX "\n");
|
||||
exit(0);
|
||||
default:
|
||||
|
||||
@@ -487,18 +487,20 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) {
|
||||
int firstIdx, nPeers;
|
||||
struct rasValCount valCounts[NCCL_MAX_LOCAL_RANKS];
|
||||
int nValCounts;
|
||||
static int cudaDriver = -1, cudaRuntime = -1;
|
||||
static int hipRuntime = -1, amdgpuDriver = -1;
|
||||
|
||||
TRACE(NCCL_RAS, "RAS: rasClientRunInit: starting");
|
||||
|
||||
rasOutReset();
|
||||
rasOutAppend("NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX
|
||||
" compiled with CUDA " STR(CUDA_MAJOR) "." STR(CUDA_MINOR) "\n");
|
||||
if (cudaRuntime == -1)
|
||||
cudaRuntimeGetVersion(&cudaRuntime);
|
||||
if (cudaDriver == -1)
|
||||
cudaDriverGetVersion(&cudaDriver);
|
||||
rasOutAppend("CUDA runtime version %d, driver version %d\n\n", cudaRuntime, cudaDriver);
|
||||
rasOutAppend("RCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX
|
||||
" compiled with ROCm " STR(ROCM_BUILD_INFO) "\n");
|
||||
if (hipRuntime == -1)
|
||||
hipRuntimeGetVersion(&hipRuntime);
|
||||
if (amdgpuDriver == -1)
|
||||
hipDriverGetVersion(&amdgpuDriver);
|
||||
// TODO: Find a better way to query the amdgpu driver version; hipDriverGetVersion() currently reports the same value as hipRuntimeGetVersion().
|
||||
// Otherwise this modification is unnecessary, because cudaRuntimeGetVersion() and cudaDriverGetVersion() are hipified anyway.
|
||||
rasOutAppend("HIP runtime version %d, amdgpu driver version %d\n\n", hipRuntime, amdgpuDriver);
|
||||
msgLen = rasOutLength();
|
||||
NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail);
|
||||
rasOutExtract(msg);
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
#include "ras.h"
|
||||
#include "socket.h"
|
||||
#include "utils.h"
|
||||
#include "hip_rocm_version_info.h"
|
||||
|
||||
// Type of a RAS network or client message.
|
||||
typedef enum {
|
||||
|
||||
Ссылка в новой задаче
Block a user