ROCm SMI Python CLI & LIB: Add GPU Reset Functionality
The purpose of this patch is to implement GPU reset functionality
in the LIB, and to call it from the rocm_smi python CLI.
Signed-off-by: Ori Messinger <Ori.Messinger@amd.com>
Change-Id: Iaf525f7016f8354a7fd93af0209ca2e97ef4fd56
[ROCm/amdsmi commit: 80f629b9be]
Этот коммит содержится в:
@@ -2150,6 +2150,21 @@ rsmi_status_t rsmi_dev_overdrive_level_get(uint32_t dv_ind, uint32_t *od);
|
||||
rsmi_status_t rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind,
|
||||
rsmi_clk_type_t clk_type, rsmi_frequencies_t *f);
|
||||
|
||||
/**
|
||||
* @brief Reset the gpu associated with the device with provided device index
|
||||
*
|
||||
* @details Given a device index @p dv_ind, this function will reset the GPU
|
||||
*
|
||||
* @param[in] dv_ind a device index
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS call was successful
|
||||
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
|
||||
* support this function with the given arguments
|
||||
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
|
||||
*
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_gpu_reset(int32_t dv_ind);
|
||||
|
||||
/**
|
||||
* @brief This function retrieves the voltage/frequency curve information
|
||||
*
|
||||
|
||||
@@ -155,7 +155,8 @@ enum DevInfoTypes {
|
||||
kDevSerialNumber,
|
||||
kDevMemPageBad,
|
||||
kDevNumaNode,
|
||||
kDevGpuMetrics
|
||||
kDevGpuMetrics,
|
||||
kDevGpuReset
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
@@ -216,8 +217,11 @@ class Device {
|
||||
uint32_t card_indx_; // This index corresponds to the drm index (ie, card#)
|
||||
uint32_t drm_render_minor_;
|
||||
const RocmSMI_env_vars *env_;
|
||||
template <typename T> int openDebugFileStream(DevInfoTypes type, T *fs,
|
||||
const char *str = nullptr);
|
||||
template <typename T> int openSysfsFileStream(DevInfoTypes type, T *fs,
|
||||
const char *str = nullptr);
|
||||
int readDebugInfoStr(DevInfoTypes type, std::string *retStr);
|
||||
int readDevInfoStr(DevInfoTypes type, std::string *retStr);
|
||||
int readDevInfoMultiLineStr(DevInfoTypes type,
|
||||
std::vector<std::string> *retVec);
|
||||
|
||||
@@ -934,24 +934,21 @@ def setPerfDeterminism(deviceList, value):
|
||||
def resetGpu(device):
|
||||
""" Perform a GPU reset on the specified device
|
||||
|
||||
Parameters:
|
||||
device -- DRM Device identifier
|
||||
@param device: DRM device identifier
|
||||
"""
|
||||
# TODO: Implement GPU reset function in the LIB
|
||||
printLogSpacer(' Reset GPU ')
|
||||
global RETCODE
|
||||
if len(device) > 1:
|
||||
if len(device) > 1:
|
||||
logging.error('GPU Reset can only be performed on one GPU per call')
|
||||
RETCODE = 1
|
||||
return
|
||||
resetDev = int(device[0])
|
||||
filePath = '/sys/kernel/debug/dri/%d/amdgpu_gpu_recover' % (resetDev)
|
||||
if os.path.isfile(filePath):
|
||||
with open(filePath, 'r') as fileContents:
|
||||
fileValue = fileContents.read()
|
||||
printLog(resetDev, 'GPU[%d]\t: Reset was successful' % (resetDev), None)
|
||||
ret = rocmsmi.rsmi_dev_gpu_reset(resetDev)
|
||||
if rsmi_ret_ok(ret, resetDev):
|
||||
printLog(resetDev, 'Successfully reset GPU %d' % (resetDev), None)
|
||||
else:
|
||||
printErrLog(resetDev, 'Unable to reset device %d' % (resetDev))
|
||||
printErrLog(resetDev, 'Unable to reset GPU %d' % (resetDev))
|
||||
logging.debug('GPU reset failed with return value of %d' % ret)
|
||||
printLogSpacer()
|
||||
|
||||
|
||||
@@ -2719,6 +2716,7 @@ if __name__ == '__main__':
|
||||
if args.gpureset:
|
||||
if not args.device:
|
||||
logging.error('No device specified. One device must be specified for GPU reset')
|
||||
printLogSpacer()
|
||||
sys.exit(1)
|
||||
logging.debug('Only executing GPU reset, no other commands will be executed')
|
||||
resetGpu(args.device)
|
||||
|
||||
@@ -2240,6 +2240,22 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) {
|
||||
CATCH
|
||||
}
|
||||
|
||||
rsmi_status_t
|
||||
rsmi_dev_gpu_reset(int32_t dv_ind) {
|
||||
TRY
|
||||
REQUIRE_ROOT_ACCESS
|
||||
DEVICE_MUTEX
|
||||
|
||||
rsmi_status_t ret;
|
||||
uint64_t status_code = 0;
|
||||
|
||||
// Read amdgpu_gpu_recover to reset it
|
||||
ret = get_dev_value_int(amd::smi::kDevGpuReset, dv_ind, &status_code);
|
||||
return ret;
|
||||
|
||||
CATCH
|
||||
}
|
||||
|
||||
rsmi_status_t rsmi_dev_od_volt_curve_regions_get(uint32_t dv_ind,
|
||||
uint32_t *num_regions, rsmi_freq_volt_region_t *buffer) {
|
||||
TRY
|
||||
|
||||
@@ -70,6 +70,12 @@
|
||||
namespace amd {
|
||||
namespace smi {
|
||||
|
||||
// Debug root file path
|
||||
static const char *kPathDebugRootFName = "/sys/kernel/debug/dri/";
|
||||
|
||||
// Device debugfs file names
|
||||
static const char *kDevGpuResetFName = "amdgpu_gpu_recover";
|
||||
|
||||
// Device sysfs file names
|
||||
static const char *kDevPerfLevelFName = "power_dpm_force_performance_level";
|
||||
static const char *kDevDevProdNameFName = "product_name";
|
||||
@@ -273,6 +279,7 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
|
||||
{kDevMemPageBad, kDevMemPageBadFName},
|
||||
{kDevNumaNode, kDevNumaNodeFName},
|
||||
{kDevGpuMetrics, kDevGpuMetricsFName},
|
||||
{kDevGpuReset, kDevGpuResetFName},
|
||||
};
|
||||
|
||||
static const std::map<rsmi_dev_perf_level, const char *> kDevPerfLvlMap = {
|
||||
@@ -389,6 +396,7 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
|
||||
{"rsmi_dev_memory_reserved_pages_get", {{kDevMemPageBadFName}, {}}},
|
||||
{"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}},
|
||||
{"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}},
|
||||
{"rsmi_dev_gpu_reset", {{kDevGpuResetFName}, {}}},
|
||||
|
||||
// These functions with variants, but no sensors/units. (May or may not
|
||||
// have mandatory dependencies.)
|
||||
@@ -499,6 +507,33 @@ Device:: ~Device() {
|
||||
shared_mutex_close(mutex_);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
int Device::openDebugFileStream(DevInfoTypes type, T *fs, const char *str) {
|
||||
std::string debugfs_path;
|
||||
|
||||
debugfs_path = kPathDebugRootFName;
|
||||
debugfs_path += std::to_string(index());
|
||||
debugfs_path += "/";
|
||||
debugfs_path += kDevAttribNameMap.at(type);
|
||||
|
||||
DBG_FILE_ERROR(debugfs_path, str);
|
||||
bool reg_file;
|
||||
int ret = isRegularFile(debugfs_path, &reg_file);
|
||||
|
||||
if (ret != 0) {
|
||||
return ret;
|
||||
}
|
||||
if (!reg_file) {
|
||||
return ENOENT;
|
||||
}
|
||||
|
||||
fs->open(debugfs_path);
|
||||
if (!fs->is_open()) {
|
||||
return errno;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
|
||||
auto sysfs_path = path_;
|
||||
@@ -537,6 +572,28 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) {
|
||||
std::ifstream fs;
|
||||
std::string line;
|
||||
int ret = 0;
|
||||
|
||||
assert(retStr != nullptr);
|
||||
|
||||
ret = openDebugFileStream(type, &fs);
|
||||
if (ret != 0) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (!(fs.peek() == std::ifstream::traits_type::eof())) {
|
||||
getline(fs, line);
|
||||
*retStr = line;
|
||||
}
|
||||
|
||||
fs.close();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) {
|
||||
std::ifstream fs;
|
||||
int ret = 0;
|
||||
@@ -762,6 +819,11 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
|
||||
*val = std::stoul(tempStr, 0, 16);
|
||||
break;
|
||||
|
||||
case kDevGpuReset:
|
||||
ret = readDebugInfoStr(type, &tempStr);
|
||||
RET_IF_NONZERO(ret);
|
||||
break;
|
||||
|
||||
default:
|
||||
return EINVAL;
|
||||
}
|
||||
@@ -899,7 +961,12 @@ void Device::fillSupportedFuncs(void) {
|
||||
mand_depends_met = true;
|
||||
for (; dep != it->second.mandatory_depends.end(); dep++) {
|
||||
std::string dep_path = dev_rt + "/" + *dep;
|
||||
if (!FileExists(dep_path.c_str())) {
|
||||
std::string debugfs_path;
|
||||
debugfs_path = kPathDebugRootFName;
|
||||
debugfs_path += std::to_string(index());
|
||||
debugfs_path += "/";
|
||||
debugfs_path += *dep;
|
||||
if (!FileExists(dep_path.c_str()) && !FileExists(debugfs_path.c_str())) {
|
||||
mand_depends_met = false;
|
||||
break;
|
||||
}
|
||||
|
||||
Ссылка в новой задаче
Block a user