Warn user on incorrect system settings (#696)

* Warn user on incorrect system settings

* Fix typo

* Add possible impact

* Ignore iommu settings in VM

[ROCm/rccl commit: 79a2031951]
This commit is contained in:
Wenkai Du
2023-03-06 08:17:06 -08:00
کامیت شده توسط GitHub
والد 0da1d6a6cd
کامیت 62f5e6a82f
+16
مشاهده پرونده
@@ -32,6 +32,7 @@
#include <sys/stat.h>
#include <unistd.h>
#include "graph/topo.h"
#include "graph/xml.h"
// [RCCL]
#include "git_version.h"
@@ -108,6 +109,21 @@ static ncclResult_t ncclInit() {
NCCLCHECK(bootstrapNetInit());
NCCLCHECK(ncclNetPluginInit());
char strValue[MAX_STR_LEN];
NCCLCHECK(ncclTopoGetStrFromSys("/proc/sys/kernel", "numa_balancing", strValue));
if (strcmp(strValue, "1") == 0)
WARN("NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by \"sudo sysctl kernel.numa_balancing=0\"");
NCCLCHECK(ncclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue));
if (strncmp("Hyper-V UEFI Release", strValue, 20) != 0) {
NCCLCHECK(ncclTopoGetStrFromSys("/proc", "cmdline", strValue));
if (strstr(strValue, "amd_iommu=on") == NULL)
WARN("Missing \"amd_iommu=on\" from kernel command line which can lead to system instablity or hang!");
if (strstr(strValue, "iommu=pt") == NULL)
WARN("Missing \"iommu=pt\" from kernel command line which can lead to system instablity or hang!");
}
char *env = getenv("HSA_FORCE_FINE_GRAIN_PCIE");
if (env == NULL || strcmp(env, "1") != 0)
WARN("Missing \"HSA_FORCE_FINE_GRAIN_PCIE=1\" from environment which can lead to low RCCL performance, system instablity or hang!");
#ifndef NVTX_NO_IMPL
initNvtxRegisteredEnums();
#endif