Improved version reporting in NCCL_DEBUG=VERSION (#1232)
* Improved version reporting in NCCL_DEBUG=VERSION.
Signed-off-by: rahulvaidya20 <ravaidya@amd.com>
* Version reporting changes
Signed-off-by: rahulvaidya20 <ravaidya@amd.com>
* Versioning changes: Initialized char arrays to null and fixed typo.
---------
Signed-off-by: rahulvaidya20 <ravaidya@amd.com>
[ROCm/rccl commit: c755b9cf93]
This commit is contained in:
@@ -373,6 +373,7 @@ set(SRC_FILES
|
||||
src/include/git_version.h
|
||||
src/include/graph.h
|
||||
src/include/group.h
|
||||
src/include/hip_rocm_version_info.h
|
||||
src/include/ibvcore.h
|
||||
src/include/ibvsymbols.h
|
||||
src/include/ibvwrap.h
|
||||
|
||||
@@ -0,0 +1,58 @@
|
||||
/*
|
||||
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef RCCL_HIP_ROCM_VERSION_INFO_H_
|
||||
#define RCCL_HIP_ROCM_VERSION_INFO_H_
|
||||
|
||||
#define STR2(v) #v
|
||||
#define STR(v) STR2(v)
|
||||
|
||||
// HIP version info retrieval
|
||||
#if ROCM_VERSION >= 50000
|
||||
#define HIP_BUILD_INFO STR(HIP_VERSION_MAJOR) "." STR(HIP_VERSION_MINOR) "." STR(HIP_VERSION_PATCH) "-" HIP_VERSION_GITHASH
|
||||
// HIP Githash info not available in older ROCm versions < 5.0
|
||||
#elif ROCM_VERSION >= 40000
|
||||
#define HIP_BUILD_INFO STR(HIP_VERSION_MAJOR) "." STR(HIP_VERSION_MINOR) "." STR(HIP_VERSION_PATCH)
|
||||
#else
|
||||
#define HIP_BUILD_INFO "Unknown"
|
||||
#endif
|
||||
|
||||
// ROCm version info retrieval
|
||||
#if ROCM_VERSION >= 60000
|
||||
// rocm_version.h moved to rocm/include/rocm-core from ROCm 6.0
|
||||
#include <rocm-core/rocm_version.h>
|
||||
#else
|
||||
// rocm-core/rocm_version.h not present in some ROCm versions < 6.0.
|
||||
// So include it from rocm/include/rocm_version.h instead.
|
||||
#if ROCM_VERSION >= 50000
|
||||
#include <rocm_version.h>
|
||||
// ROCM_BUILD_INFO is not defined in ROCm versions < 5.50
|
||||
#ifndef ROCM_BUILD_INFO
|
||||
#define ROCM_BUILD_INFO STR(ROCM_VERSION_MAJOR) "." STR(ROCM_VERSION_MINOR) "." STR(ROCM_VERSION_PATCH)
|
||||
#endif
|
||||
// ROCm version info is not available for ROCm versions < 5.0
|
||||
#else
|
||||
#define ROCM_BUILD_INFO "Unknown"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -42,6 +42,7 @@
|
||||
// [RCCL]
|
||||
#include "git_version.h"
|
||||
#include "rccl_vars.h"
|
||||
#include "hip_rocm_version_info.h"
|
||||
//#include "clique/CliqueManager.h"
|
||||
//#include <hsa/hsa_ext_amd.h>
|
||||
// [/RCCL]
|
||||
@@ -49,8 +50,13 @@
|
||||
#include "msccl/msccl_lifecycle.h"
|
||||
#include "msccl/msccl_status.h"
|
||||
|
||||
#define STR2(v) #v
|
||||
#define STR(v) STR2(v)
|
||||
#ifndef STR2
|
||||
#define STR2(v) #v
|
||||
#endif
|
||||
|
||||
#ifndef STR
|
||||
#define STR(v) STR2(v)
|
||||
#endif
|
||||
|
||||
#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
|
||||
@@ -683,17 +689,40 @@ fail:
|
||||
|
||||
// Pre-process the string so that running "strings" on the lib can quickly reveal the version.
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#define VERSION_STRING "RCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+hip" STR(HIP_VERSION_MAJOR) "." STR(HIP_VERSION_MINOR)
|
||||
#define VERSION_STRING "RCCL version : " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX
|
||||
#define VERSION_STRING_EXTENDED "HIP version : " HIP_BUILD_INFO "\nROCm version : " ROCM_BUILD_INFO
|
||||
#else
|
||||
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
|
||||
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX
|
||||
#define VERSION_STRING_EXTENDED "CUDA version " STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
|
||||
#endif
|
||||
static void showVersion() {
|
||||
static int shown = 0;
|
||||
if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) {
|
||||
printf("%s %s\n", VERSION_STRING, rcclGitHash);
|
||||
char hostInfo[HOST_NAME_MAX] = {}, libPathInfo[2048] = {};
|
||||
size_t hostInfoSize = sizeof(hostInfo), libPathInfoSize = sizeof(libPathInfo);
|
||||
|
||||
// Retrieve Hostname info
|
||||
if (gethostname(hostInfo, hostInfoSize-1) != 0) {
|
||||
// Store "Unknown" in hostInfo if gethostname fails
|
||||
strncpy(hostInfo, "Unknown", hostInfoSize-1);
|
||||
}
|
||||
|
||||
// Retrieve librccl path
|
||||
Dl_info pathInfo;
|
||||
if (dladdr((void*)ncclCommInitRank, &pathInfo)) {
|
||||
strncpy(libPathInfo, pathInfo.dli_fname, libPathInfoSize-1);
|
||||
} else {
|
||||
// Set libPathInfo to "Unknown" if dladdr fails
|
||||
strncpy(libPathInfo, "Unknown", libPathInfoSize-1);
|
||||
}
|
||||
|
||||
printf("%s-%s\n%s\n", VERSION_STRING, rcclGitHash, VERSION_STRING_EXTENDED);
|
||||
printf("%-12s : %s\n%-12s : %s\n", "Hostname", hostInfo, "Librccl path", libPathInfo);
|
||||
fflush(stdout);
|
||||
if (ncclDebugFile != stdout)
|
||||
INFO(NCCL_ALL,"%s %s", VERSION_STRING, rcclGitHash); // Also log NCCL version in one of the files
|
||||
if (ncclDebugFile != stdout) {
|
||||
INFO(NCCL_ALL, "%s-%s\n%s\n", VERSION_STRING, rcclGitHash, VERSION_STRING_EXTENDED); // Also log NCCL version in one of the files
|
||||
INFO(NCCL_ALL, "%-12s : %s\n%-12s : %s\n", "Hostname", hostInfo, "Librccl path", libPathInfo);
|
||||
}
|
||||
shown = 1;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in a new issue
Block a user