[RAS] Add support for RAS client (#1748)

Enable RAS client binary `rcclras`
Этот коммит содержится в:
Nilesh M Negi
2025-06-29 18:53:16 -05:00
коммит произвёл GitHub
родитель 75d22b47cb
Коммит 8d3a5542fb
5 изменённых файлов: 40 добавлений и 9 удалений
+3
Просмотреть файл
@@ -1231,6 +1231,9 @@ License: See LICENSE.txt for license information\n")
Optimized primitives for collective multi-GPU communication")
endif()
## Building RCCL RAS
include(cmake/rcclRAS.cmake)
if(BUILD_TESTS)
rocm_package_setup_component(clients)
rocm_package_setup_client_component(tests PACKAGE_NAME unittests)
+25
Просмотреть файл
@@ -0,0 +1,25 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
cmake_minimum_required(VERSION 3.16)

# Builds and installs `rcclras`, the RCCL RAS client executable.
#
# Reads ROCM_PATH, HIPIFY_DIR and BUILD_SHARED_LIBS from the including scope and
# refers to an `rccl` target that must exist elsewhere in the project.
message(STATUS "Building rccl RAS client executable")

# include() does not open a new variable scope, so this assignment remains in
# effect for the CMakeLists.txt that includes this file.
# NOTE(review): presumably this keeps find_* calls from picking up an RCCL that
# is already installed under ROCM_PATH -- confirm.
set(CMAKE_IGNORE_PATH "${ROCM_PATH}/lib" "${ROCM_PATH}/include")

# The client is compiled from the hipified copy of the RAS client source.
add_executable(rcclras "${PROJECT_BINARY_DIR}/hipify/src/ras/client.cc")

target_include_directories(rcclras
  PRIVATE
    "${PROJECT_BINARY_DIR}/include"
    "${HIPIFY_DIR}/src"
    "${HIPIFY_DIR}/src/include"
)

if(BUILD_SHARED_LIBS)
  # Linking the `rccl` target gives both the link line and the build-order edge.
  target_link_libraries(rcclras PRIVATE rccl)
  # The installed binary searches <prefix>/lib for its shared libraries.
  set_property(TARGET rcclras PROPERTY INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
else()
  # The static link below uses raw -l/-L flags, which carry no target-level
  # dependency. Attach the ordering dependency to `rcclras` itself (not to the
  # unit-test target) so that rccl is built before this executable is linked.
  add_dependencies(rcclras rccl)
  target_link_libraries(rcclras PRIVATE dl rt -lrccl -L${CMAKE_BINARY_DIR} -lamdhip64 -L${ROCM_PATH}/lib)
endif()

# Add the build directory to the run-time search path of the build-tree binary.
set_target_properties(rcclras PROPERTIES BUILD_RPATH "${CMAKE_BINARY_DIR}")

rocm_install(TARGETS rcclras)
+1 -1
Просмотреть файл
@@ -84,7 +84,7 @@ static void parseArgs(int argc, char** argv) {
printUsage(argv[0]);
exit(0);
case 'r':
fprintf(stderr, "NCCL RAS client version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "."
fprintf(stderr, "RCCL RAS client version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "."
STR(NCCL_PATCH) NCCL_SUFFIX "\n");
exit(0);
default:
+10 -8
Просмотреть файл
@@ -487,18 +487,20 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) {
int firstIdx, nPeers;
struct rasValCount valCounts[NCCL_MAX_LOCAL_RANKS];
int nValCounts;
static int cudaDriver = -1, cudaRuntime = -1;
static int hipRuntime = -1, amdgpuDriver = -1;
TRACE(NCCL_RAS, "RAS: rasClientRunInit: starting");
rasOutReset();
rasOutAppend("NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX
" compiled with CUDA " STR(CUDA_MAJOR) "." STR(CUDA_MINOR) "\n");
if (cudaRuntime == -1)
cudaRuntimeGetVersion(&cudaRuntime);
if (cudaDriver == -1)
cudaDriverGetVersion(&cudaDriver);
rasOutAppend("CUDA runtime version %d, driver version %d\n\n", cudaRuntime, cudaDriver);
rasOutAppend("RCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX
" compiled with ROCm " STR(ROCM_BUILD_INFO) "\n");
if (hipRuntime == -1)
hipRuntimeGetVersion(&hipRuntime);
if (amdgpuDriver == -1)
hipDriverGetVersion(&amdgpuDriver);
// TODO: Find a better way to query the amdgpu driver version; hipDriverGetVersion() currently reports the same value as hipRuntimeGetVersion().
// Otherwise this change is unnecessary, since cudaRuntimeGetVersion() and cudaDriverGetVersion() are hipified anyway.
rasOutAppend("HIP runtime version %d, amdgpu driver version %d\n\n", hipRuntime, amdgpuDriver);
msgLen = rasOutLength();
NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail);
rasOutExtract(msg);
+1
Просмотреть файл
@@ -22,6 +22,7 @@
#include "ras.h"
#include "socket.h"
#include "utils.h"
#include "hip_rocm_version_info.h"
// Type of a RAS network or client message.
typedef enum {