Skip checking of some settings in Cray OS (#739)
Tento commit je obsažen v:
+20
-10
@@ -106,17 +106,27 @@ static ncclResult_t ncclInit() {
|
||||
NCCLCHECK(ncclTopoGetStrFromSys("/proc/sys/kernel", "numa_balancing", strValue));
|
||||
if (strcmp(strValue, "1") == 0)
|
||||
WARN("NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by \"sudo sysctl kernel.numa_balancing=0\"");
|
||||
NCCLCHECK(ncclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue));
|
||||
if (strncmp("Hyper-V UEFI Release", strValue, 20) != 0) {
|
||||
NCCLCHECK(ncclTopoGetStrFromSys("/proc", "cmdline", strValue));
|
||||
if (strstr(strValue, "amd_iommu=on") == NULL)
|
||||
WARN("Missing \"amd_iommu=on\" from kernel command line which can lead to system instablity or hang!");
|
||||
if (strstr(strValue, "iommu=pt") == NULL)
|
||||
WARN("Missing \"iommu=pt\" from kernel command line which can lead to system instablity or hang!");
|
||||
NCCLCHECK(ncclTopoGetStrFromSys("/proc", "version", strValue));
|
||||
char *verStr, *state;
|
||||
verStr = strtok_r(strValue, " ", &state);
|
||||
for (int i = 0; i < 2; i ++) {
|
||||
verStr = strtok_r(NULL, " ", &state);
|
||||
if (verStr == NULL) break;
|
||||
}
|
||||
INFO(NCCL_INIT, "Kernel version: %s", verStr);
|
||||
if (strstr(verStr, "cray") == NULL) {
|
||||
NCCLCHECK(ncclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue));
|
||||
if (strncmp("Hyper-V UEFI Release", strValue, 20) != 0) {
|
||||
NCCLCHECK(ncclTopoGetStrFromSys("/proc", "cmdline", strValue));
|
||||
if (strstr(strValue, "amd_iommu=on") == NULL)
|
||||
WARN("Missing \"amd_iommu=on\" from kernel command line which can lead to system instablity or hang!");
|
||||
if (strstr(strValue, "iommu=pt") == NULL)
|
||||
WARN("Missing \"iommu=pt\" from kernel command line which can lead to system instablity or hang!");
|
||||
}
|
||||
char *env = getenv("HSA_FORCE_FINE_GRAIN_PCIE");
|
||||
if (env == NULL || strcmp(env, "1") != 0)
|
||||
WARN("Missing \"HSA_FORCE_FINE_GRAIN_PCIE=1\" from environment which can lead to low RCCL performance, system instablity or hang!");
|
||||
}
|
||||
char *env = getenv("HSA_FORCE_FINE_GRAIN_PCIE");
|
||||
if (env == NULL || strcmp(env, "1") != 0)
|
||||
WARN("Missing \"HSA_FORCE_FINE_GRAIN_PCIE=1\" from environment which can lead to low RCCL performance, system instablity or hang!");
|
||||
#ifndef NVTX_NO_IMPL
|
||||
initNvtxRegisteredEnums();
|
||||
#endif
|
||||
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele