From 62f5e6a82f042cfe7786c275d0975c4bd5da33cd Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Mon, 6 Mar 2023 08:17:06 -0800 Subject: [PATCH] Warn user on incorrect system settings (#696) * Warn user on incorrect system settings * Fix typo * Add possible impact * Ignore iommu settings in VM [ROCm/rccl commit: 79a2031951f7d1afb2eadf2bf8dff0abb8825124] --- projects/rccl/src/init.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 59964ed005..c0b856e12f 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -32,6 +32,7 @@ #include #include #include "graph/topo.h" +#include "graph/xml.h" // [RCCL] #include "git_version.h" @@ -108,6 +109,21 @@ static ncclResult_t ncclInit() { NCCLCHECK(bootstrapNetInit()); NCCLCHECK(ncclNetPluginInit()); + char strValue[MAX_STR_LEN]; + NCCLCHECK(ncclTopoGetStrFromSys("/proc/sys/kernel", "numa_balancing", strValue)); + if (strcmp(strValue, "1") == 0) + WARN("NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by \"sudo sysctl kernel.numa_balancing=0\""); + NCCLCHECK(ncclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue)); + if (strncmp("Hyper-V UEFI Release", strValue, 20) != 0) { + NCCLCHECK(ncclTopoGetStrFromSys("/proc", "cmdline", strValue)); + if (strstr(strValue, "amd_iommu=on") == NULL) + WARN("Missing \"amd_iommu=on\" from kernel command line which can lead to system instablity or hang!"); + if (strstr(strValue, "iommu=pt") == NULL) + WARN("Missing \"iommu=pt\" from kernel command line which can lead to system instablity or hang!"); + } + char *env = getenv("HSA_FORCE_FINE_GRAIN_PCIE"); + if (env == NULL || strcmp(env, "1") != 0) + WARN("Missing \"HSA_FORCE_FINE_GRAIN_PCIE=1\" from environment which can lead to low RCCL performance, system instablity or hang!"); #ifndef NVTX_NO_IMPL initNvtxRegisteredEnums(); #endif