From 1a233f93fb10da726d4e5844bc5394b2a63c5bbe Mon Sep 17 00:00:00 2001
From: "Bill(Shuzhou) Liu"
Date: Tue, 3 Oct 2023 11:25:33 -0500
Subject: [PATCH] APIs for the cache level and size

Read the cache level and size from topology sysfs file.

Change-Id: Id3c558c95bcb79139a19e4adbaa7ff333d06098f
---
 example/amd_smi_drm_example.cc           | 11 ++++
 include/amd_smi/amdsmi.h                 | 24 +++++++++
 rocm_smi/include/rocm_smi/rocm_smi.h     | 32 ++++++++++++
 rocm_smi/include/rocm_smi/rocm_smi_kfd.h |  3 ++
 rocm_smi/src/rocm_smi.cc                 | 20 ++++++++
 rocm_smi/src/rocm_smi_kfd.cc             | 67 ++++++++++++++++++++++++
 src/amd_smi/amd_smi.cc                   | 27 ++++++++++
 7 files changed, 184 insertions(+)

diff --git a/example/amd_smi_drm_example.cc b/example/amd_smi_drm_example.cc
index 2ee0bb3127..7b6997404b 100644
--- a/example/amd_smi_drm_example.cc
+++ b/example/amd_smi_drm_example.cc
@@ -306,6 +306,17 @@ int main() {
             printf("\tVBios Version String: %s\n\n",
                    vbios_info.version);
 
+            // Get Cache info
+            amdsmi_gpu_cache_info_t cache_info = {};
+            ret = amdsmi_get_gpu_cache_info(processor_handles[j], &cache_info);
+            CHK_AMDSMI_RET(ret)
+            printf(" Output of amdsmi_get_gpu_cache_info:\n");
+            for (unsigned int i = 0 ; i < cache_info.num_cache_types; i++) {
+                printf("\tCache Level: %u, Cache Size: %u KB\n",
+                       cache_info.cache[i].cache_level,
+                       cache_info.cache[i].cache_size_kb);
+            }
+
             // Get power measure
             amdsmi_power_info_t power_measure = {};
             ret = amdsmi_get_power_info(processor_handles[j], &power_measure);
diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h
index ce70d58f30..13897a9f83 100644
--- a/include/amd_smi/amdsmi.h
+++ b/include/amd_smi/amdsmi.h
@@ -84,6 +84,7 @@ typedef enum {
 #define AMDSMI_MAX_DRIVER_VERSION_LENGTH 80
 #define AMDSMI_PRODUCT_NAME_LENGTH 128
 #define AMDSMI_MAX_CONTAINER_TYPE 2
+#define AMDSMI_MAX_CACHE_TYPES 10
 
 #define AMDSMI_GPU_UUID_SIZE 38
 
@@ -415,6 +416,16 @@ typedef struct {
   uint32_t reserved[16];
 } amdsmi_vbios_info_t;
 
+typedef struct {
+  uint32_t num_cache_types;
+  struct {
+    uint32_t cache_size_kb; /* In KB */
+    uint32_t cache_level;
+    uint32_t reserved[3];
+  } cache[AMDSMI_MAX_CACHE_TYPES];
+  uint32_t reserved[15];
+} amdsmi_gpu_cache_info_t;
+
 typedef struct {
   uint8_t num_fw_info;
   struct fw_info_list_ {
@@ -2158,6 +2169,19 @@ amdsmi_status_t amdsmi_get_temp_metric(amdsmi_processor_handle processor_handle
                     amdsmi_temperature_type_t sensor_type,
                     amdsmi_temperature_metric_t metric, int64_t *temperature);
 
+/**
+ *  @brief Returns gpu cache info.
+ *
+ *  @param[in] processor_handle PF of a processor for which to query
+ *
+ *  @param[out] info reference to the cache info struct.
+ *  Must be allocated by user.
+ *
+ *  @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
+ */
+amdsmi_status_t amdsmi_get_gpu_cache_info(
+      amdsmi_processor_handle processor_handle, amdsmi_gpu_cache_info_t *info);
+
 /**
  *  @brief Get the voltage metric value for the specified metric, from the
  *  specified voltage sensor on the specified device. It is not supported on
diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h
index 31064f3cc9..c041eb24a5 100755
--- a/rocm_smi/include/rocm_smi/rocm_smi.h
+++ b/rocm_smi/include/rocm_smi/rocm_smi.h
@@ -845,6 +845,22 @@ typedef struct {
 /// \cond Ignore in docs.
 typedef rsmi_version_t rsmi_version;
 /// \endcond
+
+/**
+ * @brief This structure represents the cache size and level
+ */
+#define RSMI_MAX_CACHE_TYPES 10
+typedef struct {
+  uint32_t num_cache_types;
+  struct {
+    uint32_t cache_size_kb; /* In KB */
+    uint32_t cache_level;
+  } cache[RSMI_MAX_CACHE_TYPES];
+} rsmi_gpu_cache_info_t;
+/// \cond Ignore in docs.
+typedef rsmi_gpu_cache_info_t rsmi_gpu_cache_info;
+/// \endcond
+
 /**
  * @brief This structure represents a range (e.g., frequencies or voltages).
  */
@@ -2035,6 +2051,22 @@ rsmi_status_t
 rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
                                                             uint64_t *total);
 
+/**
+ *  @brief Get gpu cache info.
+ *
+ *  @details Given a device index @p dv_ind, and a pointer to a cache
+ *  info @p info, this function will write the cache size and level
+ *  to the location pointed to by @p info.
+ *  @param[in] dv_ind a device index
+ *
+ *  @param[inout] info reference to the cache info struct.
+ *  Must be allocated by user.
+ *
+ *  @return ::rsmi_status_t | ::RSMI_STATUS_SUCCESS on success, non-zero on fail
+ */
+rsmi_status_t rsmi_dev_cache_info_get(
+      uint32_t dv_ind, rsmi_gpu_cache_info_t *info);
+
 /**
  *  @brief Get the current memory usage
  *
diff --git a/rocm_smi/include/rocm_smi/rocm_smi_kfd.h b/rocm_smi/include/rocm_smi/rocm_smi_kfd.h
index 90c7f6ff3b..7e11130bd4 100755
--- a/rocm_smi/include/rocm_smi/rocm_smi_kfd.h
+++ b/rocm_smi/include/rocm_smi/rocm_smi_kfd.h
@@ -84,6 +84,9 @@ class KFDNode {
   int get_total_memory(uint64_t* total);
   int get_used_memory(uint64_t* used);
 
+  // Get cache info from kfd
+  int get_cache_info(rsmi_gpu_cache_info_t *info);
+
  private:
   uint32_t node_indx_;
   uint32_t amdgpu_dev_index_;
diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc
index 10cca04b8d..1d9da410d1 100755
--- a/rocm_smi/src/rocm_smi.cc
+++ b/rocm_smi/src/rocm_smi.cc
@@ -3282,6 +3282,26 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
   return ret;
   CATCH
 }
+
+
+rsmi_status_t rsmi_dev_cache_info_get(
+      uint32_t dv_ind, rsmi_gpu_cache_info_t *info) {
+  TRY
+  rsmi_status_t ret;
+  std::ostringstream ss;
+  ss << __PRETTY_FUNCTION__ << "| ======= start =======";
+  LOG_TRACE(ss);
+
+  if (info == nullptr) return RSMI_STATUS_INVALID_ARGS;
+
+  GET_DEV_AND_KFDNODE_FROM_INDX
+
+  if (kfd_node->get_cache_info(info) == 0) return RSMI_STATUS_SUCCESS;
+
+  return RSMI_STATUS_NOT_SUPPORTED;
+  CATCH
+}
+
 rsmi_status_t
 rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
                                                               uint64_t *used) {
diff --git a/rocm_smi/src/rocm_smi_kfd.cc b/rocm_smi/src/rocm_smi_kfd.cc
index 7fe9004cc3..00c7404711 100755
--- a/rocm_smi/src/rocm_smi_kfd.cc
+++ b/rocm_smi/src/rocm_smi_kfd.cc
@@ -130,6 +130,25 @@ static std::string KFDDevicePath(uint32_t dev_id) {
   return node_path;
 }
 
+// Extract the value of a property from a "<name><value>" per-line text file.
+// Returns the text that follows property_name on the first line starting
+// with property_name; returns an empty string if the file cannot be opened
+// or no line starts with property_name.
+static std::string get_properties_from_file(const std::string& file_name,
+    const std::string& property_name) {
+  std::ifstream infile(file_name);
+  if (!infile) return "";
+  std::string line;
+  while (std::getline(infile, line)) {
+    // rfind(..., 0) can only match at position 0, i.e. this is a prefix test
+    if (line.rfind(property_name, 0) == 0) {
+      return line.substr(property_name.length());
+    }
+  }
+  return "";
+}
+
+
 static int OpenKFDNodeFile(uint32_t dev_id, std::string node_file,
                                                         std::ifstream *fs) {
   std::string line;
@@ -874,6 +893,54 @@ int KFDNode::get_used_memory(uint64_t* used) {
   return 1;
 }
 
+// Collect one (level, size in KB) entry per distinct cache level listed
+// under this node's caches/ directory. Returns 0 on success and a non-zero
+// value on a null info, an unreadable caches_count or too many levels.
+int KFDNode::get_cache_info(rsmi_gpu_cache_info_t *info) {
+  if (info == nullptr) return EINVAL;
+  uint64_t caches_count = 0;
+  int ret = get_property_value("caches_count", &caches_count);
+  if (ret != 0) return ret;
+
+  // e.g. /sys/class/kfd/kfd/topology/nodes/1/caches/0/properties
+  std::string f_path = kKFDNodesPathRoot;
+  f_path += "/";
+  f_path += std::to_string(node_indx_);
+  f_path += "/";
+  f_path += "caches/";
+
+  info->num_cache_types = 0;
+  for (unsigned int cache_id = 0; cache_id < caches_count; cache_id++) {
+    const auto prop_file = f_path + std::to_string(cache_id) + "/properties";
+    std::string level = get_properties_from_file(prop_file, "level ");
+    try {
+      int cache_level = std::stoi(level);
+      if (cache_level < 0 ) continue;
+
+      // report each level once: skip it if any stored entry already has it
+      bool is_count_already = false;
+      for (unsigned int i=0; i < info->num_cache_types; i++) {
+        if (info->cache[i].cache_level == static_cast<uint32_t>(cache_level)) {
+          is_count_already = true;
+          break;
+        }
+      }
+      if (is_count_already) continue;
+
+      if (info->num_cache_types >= RSMI_MAX_CACHE_TYPES) return 1;
+      std::string size = get_properties_from_file(prop_file, "size ");
+      int cache_size = std::stoi(size);
+      if (cache_size <= 0) continue;
+      info->cache[info->num_cache_types].cache_level = cache_level;
+      info->cache[info->num_cache_types].cache_size_kb = cache_size;
+      info->num_cache_types++;
+    } catch (...) {
+      continue;
+    }
+  }
+  return 0;
+}
+
 // /sys/class/kfd/kfd/topology/nodes/*/properties
 int read_node_properties(uint32_t node, std::string property_name,
                                                         uint64_t *val) {
diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc
index 7040860e06..473d7595a1 100644
--- a/src/amd_smi/amd_smi.cc
+++ b/src/amd_smi/amd_smi.cc
@@ -478,6 +478,33 @@ amdsmi_status_t amdsmi_get_gpu_board_info(amdsmi_processor_handle processor_hand
     return AMDSMI_STATUS_SUCCESS;
 }
 
+amdsmi_status_t amdsmi_get_gpu_cache_info(
+      amdsmi_processor_handle processor_handle, amdsmi_gpu_cache_info_t *info) {
+    AMDSMI_CHECK_INIT();
+    if (info == nullptr) {
+        return AMDSMI_STATUS_INVAL;
+    }
+
+    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
+    amdsmi_status_t status = get_gpu_device_from_handle(
+                            processor_handle, &gpu_device);
+    if (status != AMDSMI_STATUS_SUCCESS)
+        return status;
+
+    rsmi_gpu_cache_info_t rsmi_info = {};
+    status = rsmi_wrapper(rsmi_dev_cache_info_get,
+                            processor_handle, &rsmi_info);
+    if (status != AMDSMI_STATUS_SUCCESS)
+        return status;
+
+    info->num_cache_types = rsmi_info.num_cache_types;
+    for (unsigned int i =0; i < rsmi_info.num_cache_types; i++) {
+        info->cache[i].cache_size_kb = rsmi_info.cache[i].cache_size_kb;
+        info->cache[i].cache_level = rsmi_info.cache[i].cache_level;
+    }
+    return AMDSMI_STATUS_SUCCESS;
+}
+
 amdsmi_status_t amdsmi_get_temp_metric(amdsmi_processor_handle processor_handle,
                     amdsmi_temperature_type_t sensor_type,
                     amdsmi_temperature_metric_t metric, int64_t *temperature) {