diff --git a/docs/reference/amdsmi-py-api.md b/docs/reference/amdsmi-py-api.md index fff571ace7..16cf3f78cc 100644 --- a/docs/reference/amdsmi-py-api.md +++ b/docs/reference/amdsmi-py-api.md @@ -1107,9 +1107,9 @@ Field | Description ---|--- `name` | Name of process. If user does not have permission this will be "N/A" `pid` | Process ID -`mem` | Process memory usage +`mem` | Process memory usage in Bytes `engine_usage` |
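<table><thead><tr><th> Subfield </th><th> Description</th></tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>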
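-`memory_usage` | <table><thead><tr><th> Subfield </th><th> Description</th></tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage</td></tr></tbody></table>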
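+`memory_usage` | <table><thead><tr><th> Subfield </th><th> Description</th></tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage in Bytes</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage in Bytes</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage in Bytes</td></tr></tbody></table>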
`cu_occupancy` | Number of Compute Units utilized Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function: diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index d5cf983b93..e8353c4ba3 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -1080,18 +1080,18 @@ typedef uint32_t amdsmi_process_handle_t; typedef struct { char name[AMDSMI_MAX_STRING_LENGTH]; amdsmi_process_handle_t pid; - uint64_t mem; //!< In bytes + uint64_t mem; //!< In Bytes struct engine_usage_ { uint64_t gfx; //!< In nano-secs uint64_t enc; //!< In nano-secs uint32_t reserved[12]; } engine_usage; //!< time the process spends using these engines in ns struct memory_usage_ { - uint64_t gtt_mem; //!< In MB - uint64_t cpu_mem; //!< In MB - uint64_t vram_mem; //!< In MB + uint64_t gtt_mem; //!< In Bytes + uint64_t cpu_mem; //!< In Bytes + uint64_t vram_mem; //!< In Bytes uint32_t reserved[10]; - } memory_usage; //!< in bytes + } memory_usage; //!< In Bytes char container_name[AMDSMI_MAX_STRING_LENGTH]; uint32_t cu_occupancy; //!< Num CUs utilized uint32_t reserved[11]; diff --git a/include/amd_smi/impl/amd_smi_gpu_device.h b/include/amd_smi/impl/amd_smi_gpu_device.h index 1bef7182b1..b42a9b468a 100644 --- a/include/amd_smi/impl/amd_smi_gpu_device.h +++ b/include/amd_smi/impl/amd_smi_gpu_device.h @@ -62,6 +62,7 @@ class AMDSmiGPUDevice: public AMDSmiProcessor { uint32_t get_gpu_fd() const; uint32_t get_card_id(); // -e feature + we can get card_id for our internal functions uint32_t get_drm_render_minor(); // -e feature + we can get card_id for our internal functions + uint64_t get_kfd_gpu_id(); // Used to decode vram usage for KFD processes std::string& get_gpu_path(); amdsmi_bdf_t get_bdf(); bool check_if_drm_is_supported() { return drm_.check_if_drm_is_supported(); } @@ -84,6 +85,7 @@ class AMDSmiGPUDevice: public AMDSmiProcessor { AMDSmiDrm& drm_; uint32_t card_index_; uint32_t drm_render_minor_; + uint64_t kfd_gpu_id_; // Used to decode vram usage 
for KFD processes GPUComputeProcessList_t compute_process_list_; int32_t get_compute_process_list_impl(GPUComputeProcessList_t& compute_process_list, ComputeProcessListType_t list_type); diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 565688cfd6..f8ff10291a 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -1290,7 +1290,7 @@ struct_amdsmi_proc_info_t._fields_ = [ ('memory_usage', struct_memory_usage_), ('container_name', ctypes.c_char * 256), ('cu_occupancy', ctypes.c_uint32), - ('PADDING_1', ctypes.c_ubyte * 4), + ('reserved', ctypes.c_uint32 * 11), ] amdsmi_proc_info_t = struct_amdsmi_proc_info_t @@ -3163,19 +3163,18 @@ __all__ = \ 'AMDSMI_VOLT_LAST', 'AMDSMI_VOLT_LOWEST', 'AMDSMI_VOLT_MAX', 'AMDSMI_VOLT_MAX_CRIT', 'AMDSMI_VOLT_MIN', 'AMDSMI_VOLT_MIN_CRIT', 'AMDSMI_VOLT_TYPE_FIRST', 'AMDSMI_VOLT_TYPE_INVALID', - 'AMDSMI_VOLT_TYPE_LAST', 'AMDSMI_VOLT_TYPE_VDDGFX', - 'AMDSMI_VOLT_TYPE_VDDBOARD', - 'AMDSMI_VRAM_TYPE_DDR2', 'AMDSMI_VRAM_TYPE_DDR3', - 'AMDSMI_VRAM_TYPE_DDR4', 'AMDSMI_VRAM_TYPE_GDDR1', - 'AMDSMI_VRAM_TYPE_GDDR2', 'AMDSMI_VRAM_TYPE_GDDR3', - 'AMDSMI_VRAM_TYPE_GDDR4', 'AMDSMI_VRAM_TYPE_GDDR5', - 'AMDSMI_VRAM_TYPE_GDDR6', 'AMDSMI_VRAM_TYPE_GDDR7', - 'AMDSMI_VRAM_TYPE_HBM', 'AMDSMI_VRAM_TYPE_HBM2', - 'AMDSMI_VRAM_TYPE_HBM2E', 'AMDSMI_VRAM_TYPE_HBM3', - 'AMDSMI_VRAM_TYPE_UNKNOWN', 'AMDSMI_VRAM_TYPE__MAX', - 'AMDSMI_XGMI_LINK_DISABLE', 'AMDSMI_XGMI_LINK_DOWN', - 'AMDSMI_XGMI_LINK_UP', 'AMDSMI_XGMI_STATUS_ERROR', - 'AMDSMI_XGMI_STATUS_MULTIPLE_ERRORS', + 'AMDSMI_VOLT_TYPE_LAST', 'AMDSMI_VOLT_TYPE_VDDBOARD', + 'AMDSMI_VOLT_TYPE_VDDGFX', 'AMDSMI_VRAM_TYPE_DDR2', + 'AMDSMI_VRAM_TYPE_DDR3', 'AMDSMI_VRAM_TYPE_DDR4', + 'AMDSMI_VRAM_TYPE_GDDR1', 'AMDSMI_VRAM_TYPE_GDDR2', + 'AMDSMI_VRAM_TYPE_GDDR3', 'AMDSMI_VRAM_TYPE_GDDR4', + 'AMDSMI_VRAM_TYPE_GDDR5', 'AMDSMI_VRAM_TYPE_GDDR6', + 'AMDSMI_VRAM_TYPE_GDDR7', 'AMDSMI_VRAM_TYPE_HBM', + 'AMDSMI_VRAM_TYPE_HBM2', 'AMDSMI_VRAM_TYPE_HBM2E', + 
'AMDSMI_VRAM_TYPE_HBM3', 'AMDSMI_VRAM_TYPE_UNKNOWN', + 'AMDSMI_VRAM_TYPE__MAX', 'AMDSMI_XGMI_LINK_DISABLE', + 'AMDSMI_XGMI_LINK_DOWN', 'AMDSMI_XGMI_LINK_UP', + 'AMDSMI_XGMI_STATUS_ERROR', 'AMDSMI_XGMI_STATUS_MULTIPLE_ERRORS', 'AMDSMI_XGMI_STATUS_NO_ERRORS', 'CLK_LIMIT_MAX', 'CLK_LIMIT_MIN', 'RD_BW0', 'WR_BW0', 'amd_metrics_table_header_t', 'amdsmi_accelerator_partition_profile_config_t', diff --git a/src/amd_smi/amd_smi_gpu_device.cc b/src/amd_smi/amd_smi_gpu_device.cc index 602041674d..a20ae44f2b 100644 --- a/src/amd_smi/amd_smi_gpu_device.cc +++ b/src/amd_smi/amd_smi_gpu_device.cc @@ -100,6 +100,35 @@ uint32_t AMDSmiGPUDevice::get_drm_render_minor() { return this->drm_render_minor_; } +uint64_t AMDSmiGPUDevice::get_kfd_gpu_id() { + std::ostringstream ss; + // Should never return not_supported, but just in case + rsmi_status_t ret = rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED; + uint32_t gpu_index = this->get_gpu_id(); + rsmi_device_identifiers_t identifiers = rsmi_device_identifiers_t{}; + ret = rsmi_dev_device_identifiers_get(gpu_index, &identifiers); + if (ret != rsmi_status_t::RSMI_STATUS_SUCCESS) { + this->kfd_gpu_id_ = std::numeric_limits::max(); + } else { + this->kfd_gpu_id_ = identifiers.kfd_gpu_id; + } + + ss << __PRETTY_FUNCTION__ + << " | rsmi_dev_identifiers_get status: " << getRSMIStatusString(ret, false) << "\n" + << " | gpu_id_: " << gpu_id_ << "\n" + << " | identifiers.card_index: " << identifiers.card_index << "\n" + << " | identifiers.drm_render_minor: " << identifiers.drm_render_minor << "\n" + << " | identifiers.bdfid: " << std::hex << "0x" << identifiers.bdfid << "\n" + << " | identifiers.kfd_gpu_id: " << std::dec << identifiers.kfd_gpu_id << "\n" + << " | identifiers.partition_id: " << identifiers.partition_id << "\n" + << " | identifiers.smi_device_id: " << identifiers.smi_device_id << "\n" + << " | returning kfd_gpu_id_: " + << this->kfd_gpu_id_ << std::endl; + // std::cout << ss.str(); + LOG_DEBUG(ss); + return this->kfd_gpu_id_; +} + 
uint32_t AMDSmiGPUDevice::get_gpu_fd() const { return fd_; } @@ -159,7 +188,6 @@ pthread_mutex_t* AMDSmiGPUDevice::get_mutex() { return amd::smi::GetMutex(gpu_id_); } - int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t& compute_process_list, ComputeProcessListType_t list_type) { @@ -231,6 +259,41 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t& // Copy the cu occupancy from rsmi_process_info_t to amdsmi_proc_info_t asmi_proc_info.cu_occupancy = rsmi_proc_info.cu_occupancy; + // Safely handle KFD file access + uint64_t kfd_gpu_id = get_kfd_gpu_id(); + std::string kfd_path = "/sys/class/kfd/kfd/proc/" + + std::to_string(rsmi_proc_info.process_id) + + "/vram_" + std::to_string(kfd_gpu_id); + + // Check if the file exists before attempting to open it + if (access(kfd_path.c_str(), R_OK) == 0) { + std::ifstream kfd_file(kfd_path.c_str()); + if (kfd_file.is_open()) { + std::string line; + if (std::getline(kfd_file, line)) { + try { + uint64_t vram_bytes = std::stoull(line); + asmi_proc_info.mem = vram_bytes; // Already in bytes + asmi_proc_info.memory_usage.vram_mem = vram_bytes; // Already in bytes + } catch (const std::exception& e) { + // Handle conversion error gracefully + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | Failed to parse VRAM value from KFD: " << e.what(); + LOG_DEBUG(ss); + } + } + kfd_file.close(); + } else { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | Failed to open KFD file: " << kfd_path; + LOG_DEBUG(ss); + } + } else { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | KFD file not accessible: " << kfd_path; + LOG_DEBUG(ss); + } + return status_code; }; diff --git a/src/amd_smi/fdinfo.cc b/src/amd_smi/fdinfo.cc index 13454a3b29..1dd43a9382 100644 --- a/src/amd_smi/fdinfo.cc +++ b/src/amd_smi/fdinfo.cc @@ -166,26 +166,26 @@ amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid, if (it == pasids.end()) pasids.push_back(pasid); } 
else if (line.find("drm-memory-gtt:") != std::string::npos) { unsigned long mem; - if (sscanf(line.c_str(), "drm-memory-gtt: %lu", &mem) != 1) continue; + if (sscanf(line.c_str(), "drm-memory-gtt: %" PRIu32, &mem) != 1) continue; info.mem += mem * 1000; info.memory_usage.gtt_mem += mem * 1000; } else if (line.find("drm-memory-cpu:") != std::string::npos) { unsigned long mem; - if (sscanf(line.c_str(), "drm-memory-cpu: %lu", &mem) != 1) continue; + if (sscanf(line.c_str(), "drm-memory-cpu: %" PRIu32, &mem) != 1) continue; info.mem += mem * 1000; info.memory_usage.cpu_mem += mem * 1000; } else if (line.find("drm-memory-vram:") != std::string::npos) { unsigned long mem; - if (sscanf(line.c_str(), "drm-memory-vram: %lu", &mem) != 1) continue;\ + if (sscanf(line.c_str(), "drm-memory-vram: %" PRIu32, &mem) != 1) continue; info.mem += mem * 1000; info.memory_usage.vram_mem += mem * 1000; } else if (line.find("drm-engine-gfx") != std::string::npos) { uint64_t engine_gfx; - if (sscanf(line.c_str(), "drm-engine-gfx: %lu", &engine_gfx) != 1) continue; + if (sscanf(line.c_str(), "drm-engine-gfx: %" PRIu32, &engine_gfx) != 1) continue; info.engine_usage.gfx = engine_gfx; } else if (line.find("drm-engine-enc") != std::string::npos) { uint64_t engine_enc; - if (sscanf(line.c_str(), "drm-engine-enc: %lu", &engine_enc) != 1) continue; + if (sscanf(line.c_str(), "drm-engine-enc: %" PRIu32, &engine_enc) != 1) continue; info.engine_usage.enc = engine_enc; } }