From 8bb3340fcbc65196ceda1ce8285492bb84edee7f Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Tue, 9 May 2023 07:59:56 -0700 Subject: [PATCH] Skip checking of some settings in Cray OS (#739) --- src/init.cc | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/src/init.cc b/src/init.cc index 325ede8e04..0a1facbf9c 100644 --- a/src/init.cc +++ b/src/init.cc @@ -106,17 +106,27 @@ static ncclResult_t ncclInit() { NCCLCHECK(ncclTopoGetStrFromSys("/proc/sys/kernel", "numa_balancing", strValue)); if (strcmp(strValue, "1") == 0) WARN("NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by \"sudo sysctl kernel.numa_balancing=0\""); - NCCLCHECK(ncclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue)); - if (strncmp("Hyper-V UEFI Release", strValue, 20) != 0) { - NCCLCHECK(ncclTopoGetStrFromSys("/proc", "cmdline", strValue)); - if (strstr(strValue, "amd_iommu=on") == NULL) - WARN("Missing \"amd_iommu=on\" from kernel command line which can lead to system instablity or hang!"); - if (strstr(strValue, "iommu=pt") == NULL) - WARN("Missing \"iommu=pt\" from kernel command line which can lead to system instablity or hang!"); + NCCLCHECK(ncclTopoGetStrFromSys("/proc", "version", strValue)); + char *verStr, *state; + verStr = strtok_r(strValue, " ", &state); + for (int i = 0; i < 2; i ++) { + verStr = strtok_r(NULL, " ", &state); + if (verStr == NULL) break; + } + INFO(NCCL_INIT, "Kernel version: %s", verStr); + if (strstr(verStr, "cray") == NULL) { + NCCLCHECK(ncclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue)); + if (strncmp("Hyper-V UEFI Release", strValue, 20) != 0) { + NCCLCHECK(ncclTopoGetStrFromSys("/proc", "cmdline", strValue)); + if (strstr(strValue, "amd_iommu=on") == NULL) + WARN("Missing \"amd_iommu=on\" from kernel command line which can lead to system instablity or hang!"); + if (strstr(strValue, "iommu=pt") == NULL) + WARN("Missing \"iommu=pt\" from kernel command line which can lead to system instablity or hang!"); + } + char *env = getenv("HSA_FORCE_FINE_GRAIN_PCIE"); + if (env == NULL || strcmp(env, "1") != 0) + WARN("Missing \"HSA_FORCE_FINE_GRAIN_PCIE=1\" from environment which can lead to low RCCL performance, system instablity or hang!"); } - char *env = getenv("HSA_FORCE_FINE_GRAIN_PCIE"); - if (env == NULL || strcmp(env, "1") != 0) - WARN("Missing \"HSA_FORCE_FINE_GRAIN_PCIE=1\" from environment which can lead to low RCCL performance, system instablity or hang!"); #ifndef NVTX_NO_IMPL initNvtxRegisteredEnums(); #endif