From d41364d1cfd6ef2515bd703970b9b346d1391097 Mon Sep 17 00:00:00 2001 From: Ori Messinger Date: Thu, 7 Jan 2021 05:35:17 -0500 Subject: [PATCH] ROCm SMI Python CLI & LIB: Add GPU Reset Functionality The purpose of this patch is to implement GPU reset functionality in the LIB, and to call it from the rocm_smi python CLI. Signed-off-by: Ori Messinger Change-Id: Iaf525f7016f8354a7fd93af0209ca2e97ef4fd56 [ROCm/amdsmi commit: 80f629b9bec44a4778f89b1ef68f49acde7b49f5] --- projects/amdsmi/include/rocm_smi/rocm_smi.h | 15 ++++ .../amdsmi/include/rocm_smi/rocm_smi_device.h | 6 +- projects/amdsmi/python_smi_tools/rocm_smi.py | 18 +++-- projects/amdsmi/src/rocm_smi.cc | 16 +++++ projects/amdsmi/src/rocm_smi_device.cc | 69 ++++++++++++++++++- 5 files changed, 112 insertions(+), 12 deletions(-) diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/include/rocm_smi/rocm_smi.h index 601814c333..76165ddd24 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi.h @@ -2150,6 +2150,21 @@ rsmi_status_t rsmi_dev_overdrive_level_get(uint32_t dv_ind, uint32_t *od); rsmi_status_t rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind, rsmi_clk_type_t clk_type, rsmi_frequencies_t *f); +/** + * @brief Reset the gpu associated with the device with provided device index + * + * @details Given a device index @p dv_ind, this function will reset the GPU + * + * @param[in] dv_ind a device index + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * + */ +rsmi_status_t rsmi_dev_gpu_reset(int32_t dv_ind); + /** * @brief This function retrieves the voltage/frequency curve information * diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h index 34d2f7f215..9f8553c233 
100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h @@ -155,7 +155,8 @@ enum DevInfoTypes { kDevSerialNumber, kDevMemPageBad, kDevNumaNode, - kDevGpuMetrics + kDevGpuMetrics, + kDevGpuReset }; typedef struct { @@ -216,8 +217,11 @@ class Device { uint32_t card_indx_; // This index corresponds to the drm index (ie, card#) uint32_t drm_render_minor_; const RocmSMI_env_vars *env_; + template <typename T> int openDebugFileStream(DevInfoTypes type, T *fs, + const char *str = nullptr); template <typename T> int openSysfsFileStream(DevInfoTypes type, T *fs, const char *str = nullptr); + int readDebugInfoStr(DevInfoTypes type, std::string *retStr); int readDevInfoStr(DevInfoTypes type, std::string *retStr); int readDevInfoMultiLineStr(DevInfoTypes type, std::vector<std::string> *retVec); diff --git a/projects/amdsmi/python_smi_tools/rocm_smi.py b/projects/amdsmi/python_smi_tools/rocm_smi.py index c6fedd4027..8569e76a27 100755 --- a/projects/amdsmi/python_smi_tools/rocm_smi.py +++ b/projects/amdsmi/python_smi_tools/rocm_smi.py @@ -934,24 +934,21 @@ def setPerfDeterminism(deviceList, value): def resetGpu(device): """ Perform a GPU reset on the specified device - Parameters: - device -- DRM Device identifier + @param device: DRM device identifier """ - # TODO: Implement GPU reset function in the LIB printLogSpacer(' Reset GPU ') global RETCODE - if len(device) > 1: + if len(device) > 1: logging.error('GPU Reset can only be performed on one GPU per call') RETCODE = 1 return resetDev = int(device[0]) - filePath = '/sys/kernel/debug/dri/%d/amdgpu_gpu_recover' % (resetDev) - if os.path.isfile(filePath): - with open(filePath, 'r') as fileContents: - fileValue = fileContents.read() - printLog(resetDev, 'GPU[%d]\t: Reset was successful' % (resetDev), None) + ret = rocmsmi.rsmi_dev_gpu_reset(resetDev) + if rsmi_ret_ok(ret, resetDev): + printLog(resetDev, 'Successfully reset GPU %d' % (resetDev), None) else: - printErrLog(resetDev, 'Unable to reset device 
%d' % (resetDev)) + printErrLog(resetDev, 'Unable to reset GPU %d' % (resetDev)) + logging.debug('GPU reset failed with return value of %d' % ret) printLogSpacer() @@ -2719,6 +2716,7 @@ if __name__ == '__main__': if args.gpureset: if not args.device: logging.error('No device specified. One device must be specified for GPU reset') + printLogSpacer() sys.exit(1) logging.debug('Only executing GPU reset, no other commands will be executed') resetGpu(args.device) diff --git a/projects/amdsmi/src/rocm_smi.cc b/projects/amdsmi/src/rocm_smi.cc index ea1ebff78e..b2d11ff425 100755 --- a/projects/amdsmi/src/rocm_smi.cc +++ b/projects/amdsmi/src/rocm_smi.cc @@ -2240,6 +2240,22 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) { CATCH } +rsmi_status_t +rsmi_dev_gpu_reset(int32_t dv_ind) { + TRY + REQUIRE_ROOT_ACCESS + DEVICE_MUTEX + + rsmi_status_t ret; + uint64_t status_code = 0; + + // Read amdgpu_gpu_recover to reset it + ret = get_dev_value_int(amd::smi::kDevGpuReset, dv_ind, &status_code); + return ret; + + CATCH +} + rsmi_status_t rsmi_dev_od_volt_curve_regions_get(uint32_t dv_ind, uint32_t *num_regions, rsmi_freq_volt_region_t *buffer) { TRY diff --git a/projects/amdsmi/src/rocm_smi_device.cc b/projects/amdsmi/src/rocm_smi_device.cc index 1d5cd5828f..6b26660374 100755 --- a/projects/amdsmi/src/rocm_smi_device.cc +++ b/projects/amdsmi/src/rocm_smi_device.cc @@ -70,6 +70,12 @@ namespace amd { namespace smi { +// Debug root file path +static const char *kPathDebugRootFName = "/sys/kernel/debug/dri/"; + +// Device debugfs file names +static const char *kDevGpuResetFName = "amdgpu_gpu_recover"; + // Device sysfs file names static const char *kDevPerfLevelFName = "power_dpm_force_performance_level"; static const char *kDevDevProdNameFName = "product_name"; @@ -273,6 +279,7 @@ static const std::map kDevAttribNameMap = { {kDevMemPageBad, kDevMemPageBadFName}, {kDevNumaNode, kDevNumaNodeFName}, {kDevGpuMetrics, kDevGpuMetricsFName}, + {kDevGpuReset, 
kDevGpuResetFName}, }; static const std::map kDevPerfLvlMap = { @@ -389,6 +396,7 @@ static const std::map kDevFuncDependsMap = { {"rsmi_dev_memory_reserved_pages_get", {{kDevMemPageBadFName}, {}}}, {"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}}, {"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}}, + {"rsmi_dev_gpu_reset", {{kDevGpuResetFName}, {}}}, // These functions with variants, but no sensors/units. (May or may not // have mandatory dependencies.) @@ -499,6 +507,33 @@ Device:: ~Device() { shared_mutex_close(mutex_); } +template <typename T> +int Device::openDebugFileStream(DevInfoTypes type, T *fs, const char *str) { + std::string debugfs_path; + + debugfs_path = kPathDebugRootFName; + debugfs_path += std::to_string(index()); + debugfs_path += "/"; + debugfs_path += kDevAttribNameMap.at(type); + + DBG_FILE_ERROR(debugfs_path, str); + bool reg_file; + int ret = isRegularFile(debugfs_path, &reg_file); + + if (ret != 0) { + return ret; + } + if (!reg_file) { + return ENOENT; + } + + fs->open(debugfs_path); + if (!fs->is_open()) { + return errno; + } + return 0; +} + template <typename T> int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { auto sysfs_path = path_; @@ -537,6 +572,28 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { return 0; } +int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) { + std::ifstream fs; + std::string line; + int ret = 0; + + assert(retStr != nullptr); + + ret = openDebugFileStream(type, &fs); + if (ret != 0) { + return ret; + } + + if (!(fs.peek() == std::ifstream::traits_type::eof())) { + getline(fs, line); + *retStr = line; + } + + fs.close(); + + return 0; +} + int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) { std::ifstream fs; int ret = 0; @@ -762,6 +819,11 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { *val = std::stoul(tempStr, 0, 16); break; + case kDevGpuReset: + ret = readDebugInfoStr(type, &tempStr); + RET_IF_NONZERO(ret); + 
break; + default: return EINVAL; } @@ -899,7 +961,12 @@ void Device::fillSupportedFuncs(void) { mand_depends_met = true; for (; dep != it->second.mandatory_depends.end(); dep++) { std::string dep_path = dev_rt + "/" + *dep; - if (!FileExists(dep_path.c_str())) { + std::string debugfs_path; + debugfs_path = kPathDebugRootFName; + debugfs_path += std::to_string(index()); + debugfs_path += "/"; + debugfs_path += *dep; + if (!FileExists(dep_path.c_str()) && !FileExists(debugfs_path.c_str())) { mand_depends_met = false; break; }