From 7130de30583bc33982473948e131aef103eafce7 Mon Sep 17 00:00:00 2001
From: gabrpham_amdeng
Date: Wed, 4 Jun 2025 14:04:04 -0500
Subject: [PATCH] [SWDEV-536184] Modified KFD fallback condition for getting
 VRAM to include sysfs read failures

rsmi_dev_memory_total_get() and rsmi_dev_memory_usage_get() previously
fell back to the KFD-reported value for RSMI_MEM_TYPE_VRAM only when the
value read through get_dev_value_int() came back as 0. Take the same
fallback path when that read itself does not return RSMI_STATUS_SUCCESS.

The read status is tested before the output value so the value is only
inspected after a successful read.

Signed-off-by: gabrpham_amdeng
---
 rocm_smi/src/rocm_smi.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc
index c1985e0d35..8b992537a3 100644
--- a/rocm_smi/src/rocm_smi.cc
+++ b/rocm_smi/src/rocm_smi.cc
@@ -3995,8 +3995,8 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
   DEVICE_MUTEX
   ret = get_dev_value_int(mem_type_file, dv_ind, total);
 
-  // Fallback to KFD reported memory if VRAM total is 0
-  if (mem_type == RSMI_MEM_TYPE_VRAM && *total == 0) {
+  // Fallback to KFD reported memory if the sysfs read fails or VRAM total is 0
+  if (mem_type == RSMI_MEM_TYPE_VRAM && (ret != RSMI_STATUS_SUCCESS || *total == 0)) {
     GET_DEV_AND_KFDNODE_FROM_INDX
     if (kfd_node->get_total_memory(total) == 0 && *total > 0) {
       ss << __PRETTY_FUNCTION__
@@ -4072,8 +4072,8 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
   DEVICE_MUTEX
   ret = get_dev_value_int(mem_type_file, dv_ind, used);
 
-  // Fallback to KFD reported memory if no VRAM
-  if (mem_type == RSMI_MEM_TYPE_VRAM && *used == 0) {
+  // Fallback to KFD reported memory if the sysfs read fails or no VRAM
+  if (mem_type == RSMI_MEM_TYPE_VRAM && (ret != RSMI_STATUS_SUCCESS || *used == 0)) {
     GET_DEV_AND_KFDNODE_FROM_INDX
     uint64_t total = 0;
     ret = get_dev_value_int(amd::smi::kDevMemTotVRAM, dv_ind, &total);