diff --git a/CMakeLists.txt b/CMakeLists.txt
index 57bc073fec..b0aafd195f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -373,6 +373,7 @@ set(SRC_FILES
   src/include/git_version.h
   src/include/graph.h
   src/include/group.h
+  src/include/hip_rocm_version_info.h
   src/include/ibvcore.h
   src/include/ibvsymbols.h
   src/include/ibvwrap.h
diff --git a/src/include/hip_rocm_version_info.h b/src/include/hip_rocm_version_info.h
new file mode 100644
index 0000000000..661d0d5222
--- /dev/null
+++ b/src/include/hip_rocm_version_info.h
@@ -0,0 +1,67 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef RCCL_HIP_ROCM_VERSION_INFO_H_
+#define RCCL_HIP_ROCM_VERSION_INFO_H_
+
+// Defines two string-literal macros, selected by the value of ROCM_VERSION:
+//   HIP_BUILD_INFO  : "<major>.<minor>.<patch>[-<githash>]", or "Unknown"
+//   ROCM_BUILD_INFO : left to rocm_version.h when that header defines it, else
+//                     "<major>.<minor>.<patch>", or "Unknown"
+// NOTE(review): ROCM_VERSION and HIP_VERSION_* are not defined in this header; they are
+// presumably supplied by the build system / HIP headers - confirm. An undefined
+// ROCM_VERSION evaluates to 0 in the #if tests below, which selects the "Unknown" branches.
+
+// Two-level expansion so macro arguments are expanded before being stringified
+#define STR2(v) #v
+#define STR(v) STR2(v)
+
+// HIP version info retrieval
+#if ROCM_VERSION >= 50000
+  #define HIP_BUILD_INFO STR(HIP_VERSION_MAJOR) "." STR(HIP_VERSION_MINOR) "." STR(HIP_VERSION_PATCH) "-" HIP_VERSION_GITHASH
+// HIP Githash info not available in older ROCm versions < 5.0
+#elif ROCM_VERSION >= 40000
+  #define HIP_BUILD_INFO STR(HIP_VERSION_MAJOR) "." STR(HIP_VERSION_MINOR) "." STR(HIP_VERSION_PATCH)
+#else
+  #define HIP_BUILD_INFO "Unknown"
+#endif
+
+// ROCm version info retrieval
+#if ROCM_VERSION >= 60000
+  // rocm_version.h moved to rocm/include/rocm-core from ROCm 6.0
+  #include <rocm-core/rocm_version.h>
+#else
+  // rocm-core/rocm_version.h not present in some ROCm versions < 6.0.
+  // So, including it from rocm/include/rocm_version.h
+  #if ROCM_VERSION >= 50000
+    #include <rocm_version.h>
+    //ROCM_BUILD_INFO not defined in ROCm Versions < 5.50
+    #ifndef ROCM_BUILD_INFO
+      #define ROCM_BUILD_INFO STR(ROCM_VERSION_MAJOR) "." STR(ROCM_VERSION_MINOR) "." STR(ROCM_VERSION_PATCH)
+    #endif
+  //ROCm version info not available for ROCm versions < 5.0
+  #else
+    #define ROCM_BUILD_INFO "Unknown"
+  #endif
+#endif
+
+#endif // RCCL_HIP_ROCM_VERSION_INFO_H_
diff --git a/src/init.cc b/src/init.cc
index 92feb355bd..aed173c764 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -42,6 +42,7 @@
 // [RCCL]
 #include "git_version.h"
 #include "rccl_vars.h"
+#include "hip_rocm_version_info.h"
 //#include "clique/CliqueManager.h"
 //#include
 // [/RCCL]
@@ -49,8 +50,13 @@
 #include "msccl/msccl_lifecycle.h"
 #include "msccl/msccl_status.h"
 
-#define STR2(v) #v
-#define STR(v) STR2(v)
+#ifndef STR2
+  #define STR2(v) #v
+#endif
+
+#ifndef STR
+  #define STR(v) STR2(v)
+#endif
 
 #if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
   #define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
@@ -683,17 +689,46 @@ fail:
 
 // Pre-process the string so that running "strings" on the lib can quickly reveal the version.
 #if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
-#define VERSION_STRING "RCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+hip" STR(HIP_VERSION_MAJOR) "." STR(HIP_VERSION_MINOR)
+#define VERSION_STRING "RCCL version : " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX
+#define VERSION_STRING_EXTENDED "HIP version : " HIP_BUILD_INFO "\nROCm version : " ROCM_BUILD_INFO
 #else
-#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
+#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX
+#define VERSION_STRING_EXTENDED "CUDA version " STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
 #endif
+// Prints the version banner (library version + git hash, HIP/ROCm or CUDA version, hostname and
+// library path) to stdout when ncclDebugLevel >= NCCL_LOG_VERSION, and repeats it through INFO()
+// when ncclDebugFile is not stdout. Does nothing once `shown` has been set to 1.
 static void showVersion() {
   static int shown = 0;
   if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) {
+    // HOST_NAME_MAX excludes the terminating NUL, so reserve one extra byte for it
+    char hostInfo[HOST_NAME_MAX + 1] = {}, libPathInfo[2048] = {};
+    size_t hostInfoSize = sizeof(hostInfo), libPathInfoSize = sizeof(libPathInfo);
+
+    // Retrieve Hostname info
+    if (gethostname(hostInfo, hostInfoSize) != 0) {
+      // Returns Unknown in hostInfo if function call unsuccessful
+      strncpy(hostInfo, "Unknown", hostInfoSize-1);
+    }
+    hostInfo[hostInfoSize-1] = '\0'; // gethostname() does not guarantee termination on truncation
+
+    // Retrieve librccl path
+    Dl_info pathInfo;
+    if (dladdr((void*)ncclCommInitRank, &pathInfo) && pathInfo.dli_fname != NULL) {
+      strncpy(libPathInfo, pathInfo.dli_fname, libPathInfoSize-1);
+    } else {
+      // Sets libPath to Unknown if the above function call is not successful
+      strncpy(libPathInfo, "Unknown", libPathInfoSize-1);
+    }
+
+    printf("%s-%s\n%s\n", VERSION_STRING, rcclGitHash, VERSION_STRING_EXTENDED);
+    printf("%-12s : %s\n%-12s : %s\n", "Hostname", hostInfo, "Librccl path", libPathInfo);
     fflush(stdout);
-    if (ncclDebugFile != stdout)
-      INFO(NCCL_ALL,"%s %s", VERSION_STRING, rcclGitHash); // Also log NCCL version in one of the files
+    if (ncclDebugFile != stdout) {
+      // NOTE(review): no trailing "\n" - INFO() presumably terminates the line itself; confirm
+      INFO(NCCL_ALL, "%s-%s\n%s", VERSION_STRING, rcclGitHash, VERSION_STRING_EXTENDED); // Also log NCCL version in one of the files
+      INFO(NCCL_ALL, "%-12s : %s\n%-12s : %s", "Hostname", hostInfo, "Librccl path", libPathInfo);
+    }
     shown = 1;
   }
 }