ROCm SMI Python CLI & LIB: Add GPU Reset Functionality

The purpose of this patch is to implement GPU reset functionality
in the LIB, and to call it from the rocm_smi python CLI.

Signed-off-by: Ori Messinger <Ori.Messinger@amd.com>
Change-Id: Iaf525f7016f8354a7fd93af0209ca2e97ef4fd56


[ROCm/amdsmi commit: 80f629b9be]
Этот коммит содержится в:
Ori Messinger
2021-01-07 05:35:17 -05:00
родитель a5fee40cbb
Коммит d41364d1cf
5 изменённых файлов: 112 добавлений и 12 удалений
+15
Просмотреть файл
@@ -2150,6 +2150,21 @@ rsmi_status_t rsmi_dev_overdrive_level_get(uint32_t dv_ind, uint32_t *od);
rsmi_status_t rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind,
rsmi_clk_type_t clk_type, rsmi_frequencies_t *f);
/**
* @brief Reset the GPU associated with the device with the provided device
* index
*
* @details Given a device index @p dv_ind, this function will reset the GPU.
* The implementation triggers the reset by reading the device's
* `amdgpu_gpu_recover` debugfs file; the value read back is not returned to
* the caller. The implementation invokes REQUIRE_ROOT_ACCESS before touching
* the device, so the caller presumably needs root privileges (the status
* returned otherwise is not visible here -- confirm against that macro).
*
* @param[in] dv_ind a device index
*
* @retval ::RSMI_STATUS_SUCCESS call was successful
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function with the given arguments
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
*
*/
// NOTE(review): neighbouring rsmi_dev_* declarations take `uint32_t dv_ind`;
// this one takes `int32_t`. Looks unintentional -- confirm before changing,
// since the definition must be updated together with this declaration.
rsmi_status_t rsmi_dev_gpu_reset(int32_t dv_ind);
/**
* @brief This function retrieves the voltage/frequency curve information
*
+5 -1
Просмотреть файл
@@ -155,7 +155,8 @@ enum DevInfoTypes {
kDevSerialNumber,
kDevMemPageBad,
kDevNumaNode,
kDevGpuMetrics
kDevGpuMetrics,
kDevGpuReset
};
typedef struct {
@@ -216,8 +217,11 @@ class Device {
uint32_t card_indx_; // This index corresponds to the drm index (ie, card#)
uint32_t drm_render_minor_;
const RocmSMI_env_vars *env_;
template <typename T> int openDebugFileStream(DevInfoTypes type, T *fs,
const char *str = nullptr);
template <typename T> int openSysfsFileStream(DevInfoTypes type, T *fs,
const char *str = nullptr);
int readDebugInfoStr(DevInfoTypes type, std::string *retStr);
int readDevInfoStr(DevInfoTypes type, std::string *retStr);
int readDevInfoMultiLineStr(DevInfoTypes type,
std::vector<std::string> *retVec);
+8 -10
Просмотреть файл
@@ -934,24 +934,21 @@ def setPerfDeterminism(deviceList, value):
def resetGpu(device):
""" Perform a GPU reset on the specified device
Parameters:
device -- DRM Device identifier
@param device: DRM device identifier
"""
# TODO: Implement GPU reset function in the LIB
printLogSpacer(' Reset GPU ')
global RETCODE
if len(device) > 1:
if len(device) > 1:
logging.error('GPU Reset can only be performed on one GPU per call')
RETCODE = 1
return
resetDev = int(device[0])
filePath = '/sys/kernel/debug/dri/%d/amdgpu_gpu_recover' % (resetDev)
if os.path.isfile(filePath):
with open(filePath, 'r') as fileContents:
fileValue = fileContents.read()
printLog(resetDev, 'GPU[%d]\t: Reset was successful' % (resetDev), None)
ret = rocmsmi.rsmi_dev_gpu_reset(resetDev)
if rsmi_ret_ok(ret, resetDev):
printLog(resetDev, 'Successfully reset GPU %d' % (resetDev), None)
else:
printErrLog(resetDev, 'Unable to reset device %d' % (resetDev))
printErrLog(resetDev, 'Unable to reset GPU %d' % (resetDev))
logging.debug('GPU reset failed with return value of %d' % ret)
printLogSpacer()
@@ -2719,6 +2716,7 @@ if __name__ == '__main__':
if args.gpureset:
if not args.device:
logging.error('No device specified. One device must be specified for GPU reset')
printLogSpacer()
sys.exit(1)
logging.debug('Only executing GPU reset, no other commands will be executed')
resetGpu(args.device)
+16
Просмотреть файл
@@ -2240,6 +2240,22 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) {
CATCH
}
rsmi_status_t
rsmi_dev_gpu_reset(int32_t dv_ind) {
  TRY
  REQUIRE_ROOT_ACCESS
  DEVICE_MUTEX

  // The reset is triggered by the act of reading amdgpu_gpu_recover; the
  // value that comes back is only a scratch result and is not reported.
  uint64_t recover_value = 0;
  return get_dev_value_int(amd::smi::kDevGpuReset, dv_ind, &recover_value);
  CATCH
}
rsmi_status_t rsmi_dev_od_volt_curve_regions_get(uint32_t dv_ind,
uint32_t *num_regions, rsmi_freq_volt_region_t *buffer) {
TRY
+68 -1
Просмотреть файл
@@ -70,6 +70,12 @@
namespace amd {
namespace smi {
// Debug root file path.
// Device::openDebugFileStream() appends "<device index>/<file name>" directly
// to this prefix, so the trailing slash is required.
static const char *kPathDebugRootFName = "/sys/kernel/debug/dri/";
// Device debugfs file names.
// Mapped to kDevGpuReset; rsmi_dev_gpu_reset() reads this file to reset the
// GPU.
static const char *kDevGpuResetFName = "amdgpu_gpu_recover";
// Device sysfs file names
static const char *kDevPerfLevelFName = "power_dpm_force_performance_level";
static const char *kDevDevProdNameFName = "product_name";
@@ -273,6 +279,7 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
{kDevMemPageBad, kDevMemPageBadFName},
{kDevNumaNode, kDevNumaNodeFName},
{kDevGpuMetrics, kDevGpuMetricsFName},
{kDevGpuReset, kDevGpuResetFName},
};
static const std::map<rsmi_dev_perf_level, const char *> kDevPerfLvlMap = {
@@ -389,6 +396,7 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
{"rsmi_dev_memory_reserved_pages_get", {{kDevMemPageBadFName}, {}}},
{"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}},
{"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}},
{"rsmi_dev_gpu_reset", {{kDevGpuResetFName}, {}}},
// These functions with variants, but no sensors/units. (May or may not
// have mandatory dependencies.)
@@ -499,6 +507,33 @@ Device:: ~Device() {
shared_mutex_close(mutex_);
}
// Open the debugfs file that backs `type` for this device on stream `fs`.
//
// The path is <debugfs root><index()>/<file name mapped to `type`>.
// `str` is only forwarded to DBG_FILE_ERROR.
//
// Returns 0 on success, the non-zero result of isRegularFile() if that check
// fails, ENOENT if the path is not a regular file, or errno if the stream
// could not be opened.
template <typename T>
int Device::openDebugFileStream(DevInfoTypes type, T *fs, const char *str) {
  std::string debug_file = std::string(kPathDebugRootFName) +
                           std::to_string(index()) + "/" +
                           kDevAttribNameMap.at(type);

  DBG_FILE_ERROR(debug_file, str);

  bool is_regular = false;
  const int check_status = isRegularFile(debug_file, &is_regular);
  if (check_status != 0) {
    return check_status;
  }
  if (!is_regular) {
    return ENOENT;
  }

  fs->open(debug_file);
  return fs->is_open() ? 0 : errno;
}
template <typename T>
int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
auto sysfs_path = path_;
@@ -537,6 +572,28 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
return 0;
}
// Read the first line of the debugfs file that backs `type` into *retStr.
//
// Returns the non-zero status from openDebugFileStream() if the file cannot
// be opened, otherwise 0. When the file is empty, *retStr is left untouched.
int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) {
  assert(retStr != nullptr);

  std::ifstream debug_stream;
  const int open_status = openDebugFileStream(type, &debug_stream);
  if (open_status != 0) {
    return open_status;
  }

  // Only consume a line when there is at least one character to read.
  const bool has_content =
      debug_stream.peek() != std::ifstream::traits_type::eof();
  if (has_content) {
    std::string first_line;
    getline(debug_stream, first_line);
    *retStr = first_line;
  }

  debug_stream.close();
  return 0;
}
int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) {
std::ifstream fs;
int ret = 0;
@@ -762,6 +819,11 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
*val = std::stoul(tempStr, 0, 16);
break;
case kDevGpuReset:
ret = readDebugInfoStr(type, &tempStr);
RET_IF_NONZERO(ret);
break;
default:
return EINVAL;
}
@@ -899,7 +961,12 @@ void Device::fillSupportedFuncs(void) {
mand_depends_met = true;
for (; dep != it->second.mandatory_depends.end(); dep++) {
std::string dep_path = dev_rt + "/" + *dep;
if (!FileExists(dep_path.c_str())) {
std::string debugfs_path;
debugfs_path = kPathDebugRootFName;
debugfs_path += std::to_string(index());
debugfs_path += "/";
debugfs_path += *dep;
if (!FileExists(dep_path.c_str()) && !FileExists(debugfs_path.c_str())) {
mand_depends_met = false;
break;
}