From 8d3a5542fb0ce26b90b3006b31bd64bb6a12de9a Mon Sep 17 00:00:00 2001 From: Nilesh M Negi Date: Sun, 29 Jun 2025 18:53:16 -0500 Subject: [PATCH] [RAS] Add support for RAS client (#1748) Enable RAS client binary `rcclras` --- CMakeLists.txt | 3 +++ cmake/rcclRAS.cmake | 25 +++++++++++++++++++++++++ src/ras/client.cc | 2 +- src/ras/client_support.cc | 18 ++++++++++-------- src/ras/ras_internal.h | 1 + 5 files changed, 40 insertions(+), 9 deletions(-) create mode 100644 cmake/rcclRAS.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b8aea2245..9ae628058e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1231,6 +1231,9 @@ License: See LICENSE.txt for license information\n") Optimized primitives for collective multi-GPU communication") endif() +## Building RCCL RAS +include(cmake/rcclRAS.cmake) + if(BUILD_TESTS) rocm_package_setup_component(clients) rocm_package_setup_client_component(tests PACKAGE_NAME unittests) diff --git a/cmake/rcclRAS.cmake b/cmake/rcclRAS.cmake new file mode 100644 index 0000000000..66155d9e29 --- /dev/null +++ b/cmake/rcclRAS.cmake @@ -0,0 +1,25 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+ +cmake_minimum_required(VERSION 3.16) + +message(STATUS "Building rccl RAS client executable") + +set(CMAKE_IGNORE_PATH "${ROCM_PATH}/lib" "${ROCM_PATH}/include") + +add_executable(rcclras "${PROJECT_BINARY_DIR}/hipify/src/ras/client.cc") + +target_include_directories(rcclras PRIVATE ${PROJECT_BINARY_DIR}/include) +target_include_directories(rcclras PRIVATE ${HIPIFY_DIR}/src) +target_include_directories(rcclras PRIVATE ${HIPIFY_DIR}/src/include) + +if(BUILD_SHARED_LIBS) + target_link_libraries(rcclras PRIVATE rccl) + set_property(TARGET rcclras PROPERTY INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") +else() + add_dependencies(rcclras rccl) # raw -lrccl below carries no target dependency, so order the build explicitly + target_link_libraries(rcclras PRIVATE dl rt -lrccl -L${CMAKE_BINARY_DIR} -lamdhip64 -L${ROCM_PATH}/lib) +endif() + +set_target_properties(rcclras PROPERTIES BUILD_RPATH "${CMAKE_BINARY_DIR}") + +rocm_install(TARGETS rcclras) diff --git a/src/ras/client.cc b/src/ras/client.cc index 8061cef4e6..56937b139b 100644 --- a/src/ras/client.cc +++ b/src/ras/client.cc @@ -84,7 +84,7 @@ static void parseArgs(int argc, char** argv) { printUsage(argv[0]); exit(0); case 'r': - fprintf(stderr, "NCCL RAS client version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." + fprintf(stderr, "RCCL RAS client version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "\n"); exit(0); default: diff --git a/src/ras/client_support.cc b/src/ras/client_support.cc index 3eafe1b791..d3a46bc371 100644 --- a/src/ras/client_support.cc +++ b/src/ras/client_support.cc @@ -487,18 +487,20 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { int firstIdx, nPeers; struct rasValCount valCounts[NCCL_MAX_LOCAL_RANKS]; int nValCounts; - static int cudaDriver = -1, cudaRuntime = -1; + static int hipRuntime = -1, amdgpuDriver = -1; TRACE(NCCL_RAS, "RAS: rasClientRunInit: starting"); rasOutReset(); - rasOutAppend("NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX - " compiled with CUDA " STR(CUDA_MAJOR) "." 
STR(CUDA_MINOR) "\n"); - if (cudaRuntime == -1) - cudaRuntimeGetVersion(&cudaRuntime); - if (cudaDriver == -1) - cudaDriverGetVersion(&cudaDriver); - rasOutAppend("CUDA runtime version %d, driver version %d\n\n", cudaRuntime, cudaDriver); + rasOutAppend("RCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX + " compiled with ROCm " STR(ROCM_BUILD_INFO) "\n"); + if (hipRuntime == -1) + hipRuntimeGetVersion(&hipRuntime); + if (amdgpuDriver == -1) + hipDriverGetVersion(&amdgpuDriver); + // TODO: Find a better way to query the amdgpu driver version; hipDriverGetVersion() currently reports the same value as hipRuntimeGetVersion(). + // Otherwise this rename is unnecessary, because cudaRuntimeGetVersion() and cudaDriverGetVersion() are hipified anyway. + rasOutAppend("HIP runtime version %d, amdgpu driver version %d\n\n", hipRuntime, amdgpuDriver); msgLen = rasOutLength(); NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); rasOutExtract(msg); diff --git a/src/ras/ras_internal.h b/src/ras/ras_internal.h index 17326c342a..b35305207a 100644 --- a/src/ras/ras_internal.h +++ b/src/ras/ras_internal.h @@ -22,6 +22,7 @@ #include "ras.h" #include "socket.h" #include "utils.h" +#include "hip_rocm_version_info.h" // Type of a RAS network or client message. typedef enum {