From 8b53e7812f39bdce89718bb8fc9c893e20dab2ea Mon Sep 17 00:00:00 2001 From: Ramesh Errabolu Date: Tue, 22 Sep 2020 08:56:12 -0500 Subject: [PATCH] Update ROCm SMI library with ability to read CU occupancy Change-Id: Ib9882fa2d81c13604af282279bfa116bc2fd05a4 [ROCm/rocm_smi_lib commit: 328878343ca18c5c0d347111d9683fba0f799cb7] --- .../rocm-smi-lib/include/rocm_smi/rocm_smi.h | 9 +-- .../include/rocm_smi/rocm_smi_kfd.h | 2 + .../python_smi_tools/rsmiBindings.py | 3 +- projects/rocm-smi-lib/src/rocm_smi_kfd.cc | 65 +++++++++++++++++-- .../functional/process_info_read.cc | 2 + 5 files changed, 69 insertions(+), 12 deletions(-) diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index d20c480a35..115918e9bb 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -859,10 +859,11 @@ typedef struct { * @brief This structure contains information specific to a process. */ typedef struct { - uint32_t process_id; //!< Process ID - uint32_t pasid; //!< PASID - uint64_t vram_usage; //!< VRAM usage - uint64_t sdma_usage; //!< SDMA usage in microseconds + uint32_t process_id; //!< Process ID + uint32_t pasid; //!< PASID + uint64_t vram_usage; //!< VRAM usage + uint64_t sdma_usage; //!< SDMA usage in microseconds + uint32_t cu_occupancy; //!< Compute Unit usage in percent } rsmi_process_info_t; diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h index 4ad44a001f..2142e4bd16 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h @@ -70,6 +70,7 @@ class KFDNode { uint32_t numa_node_number(void) const {return numa_node_number_;} uint64_t numa_node_weight(void) const {return numa_node_weight_;} uint64_t xgmi_hive_id(void) const {return xgmi_hive_id_;} + uint32_t cu_count(void) const {return cu_count_;} IO_LINK_TYPE numa_node_type(void) const {return numa_node_type_;} int get_io_link_type(uint32_t node_to, IO_LINK_TYPE *type); int get_io_link_weight(uint32_t node_to, uint64_t *weight); @@ -86,6 +87,7 @@ class KFDNode { uint64_t numa_node_weight_; IO_LINK_TYPE numa_node_type_; uint64_t xgmi_hive_id_; + uint32_t cu_count_; std::map io_link_type_; std::map io_link_weight_; std::map> io_link_map_; diff --git a/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py b/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py index 4a7c8f69dc..0c2ca1f22a 100644 --- a/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py +++ b/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py @@ -461,7 +461,8 @@ class rsmi_process_info_t(Structure): _fields_ = [('process_id', c_uint32), ('pasid', c_uint32), ('vram_usage', c_uint64), - ('sdma_usage', c_uint64)] + ('sdma_usage', c_uint64), + ('cu_occupancy', c_uint32)] class rsmi_func_id_iter_handle(Structure): diff --git a/projects/rocm-smi-lib/src/rocm_smi_kfd.cc b/projects/rocm-smi-lib/src/rocm_smi_kfd.cc index fdb189ed20..70939c5b56 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_kfd.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_kfd.cc @@ -59,6 +59,7 @@ #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_device.h" +#include "rocm_smi/rocm_smi_main.h" namespace amd { namespace smi { @@ -83,13 +84,13 @@ static const char *kKFDPasidFName = "pasid"; // static const char *kKFDNodePropGDS_SIZE_IN_KBStr = "gds_size_in_kb"; // static const char *kKFDNodePropNUM_GWSStr = "num_gws"; // static const char *kKFDNodePropWAVE_FRONT_SIZEStr = "wave_front_size"; -// static const char *kKFDNodePropARRAY_COUNTStr = "array_count"; -// static const char *kKFDNodePropSIMD_ARRAYS_PER_ENGINEStr = -// "simd_arrays_per_engine"; -// static const char *kKFDNodePropCU_PER_SIMD_ARRAYStr = "cu_per_simd_array"; -// static const char *kKFDNodePropSIMD_PER_CUStr = "simd_per_cu"; -// static const char *kKFDNodePropMAX_SLOTS_SCRATCH_CUStr = -// "max_slots_scratch_cu"; + +static const char *kKFDNodePropARRAY_COUNTStr = "array_count"; +static const char *kKFDNodePropSIMD_ARRAYS_PER_ENGINEStr = "simd_arrays_per_engine"; +static const char *kKFDNodePropCU_PER_SIMD_ARRAYStr = "cu_per_simd_array"; +// static const char *kKFDNodePropSIMD_PER_CUStr = "simd_per_cu"; +// static const char *kKFDNodePropMAX_SLOTS_SCRATCH_CUStr = "max_slots_scratch_cu"; + // static const char *kKFDNodePropVENDOR_IDStr = "vendor_id"; // static const char *kKFDNodePropDEVICE_IDStr = "device_id"; static const char *kKFDNodePropLOCATION_IDStr = "location_id"; @@ -434,6 +435,11 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, proc->vram_usage = 0; proc->sdma_usage = 0; + proc->cu_occupancy = 0; + + uint32_t cu_count = 0; + static amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); + static std::map>& kfd_node_map = smi.kfd_node_map(); for (itr = gpu_set->begin(); itr != gpu_set->end(); itr++) { uint64_t gpu_id = (*itr); @@ -467,6 +473,29 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, } proc->sdma_usage += std::stoull(tmp); + + // Build the path and read from Sysfs file, info that + // encodes Compute Unit usage by a process of interest + std::string cu_occupancy_path = proc_str_path; + cu_occupancy_path += "/stats_"; + cu_occupancy_path += std::to_string(gpu_id); + cu_occupancy_path += "/cu_occupancy"; + err = ReadSysfsStr(cu_occupancy_path, &tmp); + if (err == 0) { + if (!is_number(tmp)) { + return EINVAL; + } + // Update CU usage by the process + proc->cu_occupancy += std::stoi(tmp); + + // Collect count of compute units + cu_count += kfd_node_map[gpu_id]->cu_count(); + } + } + + // Adjust CU occupancy to percent. + if (cu_count > 0) { + proc->cu_occupancy = ((proc->cu_occupancy * 100) / cu_count); } return 0; @@ -640,6 +669,28 @@ KFDNode::Initialize(void) { io_link_weight_[node_to] = link->weight(); } } + + // Pre-compute the total number of compute units a device has + uint64_t tmp_val; + ret = get_property_value(kKFDNodePropSIMD_ARRAYS_PER_ENGINEStr, &tmp_val); + if (ret != 0) { + throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR, + "Failed to initialize rocm_smi library (get number of shader arrays per engine)."); + } + cu_count_ = uint32_t(tmp_val); + ret = get_property_value(kKFDNodePropARRAY_COUNTStr, &tmp_val); + if (ret != 0) { + throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR, + "Failed to initialize rocm_smi library (get number of shader arrays)."); + } + cu_count_ = cu_count_ * uint32_t(tmp_val); + ret = get_property_value(kKFDNodePropCU_PER_SIMD_ARRAYStr, &tmp_val); + if (ret != 0) { + throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR, + "Failed to initialize rocm_smi library (get number of CU's per array)."); + } + cu_count_ = cu_count_ * uint32_t(tmp_val); + return ret; } diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/process_info_read.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/process_info_read.cc index 7159fdd197..f82d8ff402 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/process_info_read.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/process_info_read.cc @@ -210,6 +210,8 @@ void TestProcInfoRead::Run(void) { proc_info.vram_usage << " SDMA Usage: " << proc_info.sdma_usage << + " Compute Unit Usage: " << + proc_info.cu_occupancy << std::endl; } }