[SWDEV-381630] Add reset partition functionality

Updates:
    * Added rsmi_dev_compute_partition_reset & rsmi_dev_nps_mode_reset
    * Added --resetcomputepartition and --resetnpsmode python smi calls
    * Added temp data files rocmsmi_boot_compute_partition_<device num>
      & rocmsmi_boot_nps_mode_partition_<device num>, writes UNKNOWN
      if data cannot be read or device does not support
    * Cleaned up NPS & compute API documentation
    * Added creation and reading of API temp files (used in reset
      functionality)
    * Cleaned up output of rocm_smi_example
    * Updated rocm_smi_example to check if running with sudo permission
      before executing write API calls (cleans up erroneous output)
    * Added template specialization for storing temp data, requires
specific rsmi_type_t enums (restricts what data can be stored)
    * Added storage of temp data, if temp files do not exist
    * Updated google tests for NPS & compute to include reset API calls

Change-Id: I69895a466b97107617e6dbb355737b84499a76c9
Signed-off-by: Charis Poag <Charis.Poag@amd.com>


[ROCm/amdsmi commit: 77c950a4bf]
Этот коммит содержится в:
Charis Poag
2023-02-14 17:06:03 -06:00
родитель 863f58a2d8
Коммит 02ca598e70
12 изменённых файлов: 577 добавлений и 46 удалений
+41 -4
Просмотреть файл
@@ -3540,12 +3540,13 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst,
* which the device's current compute partition will be written to.
*
* @param[in] len the length of the caller provided buffer @p compute_partition
* , suggested length is 4 or greater.
*
* @retval ::RSMI_STATUS_SUCCESS call was successful
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
* @retval ::RSMI_STATUS_UNEXPECTED_DATA data provided to function is not valid
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function with the given arguments
* support this function
* @retval ::RSMI_STATUS_INSUFFICIENT_SIZE is returned if @p len bytes is not
* large enough to hold the entire compute partition value. In this case,
* only @p len bytes will be written.
@@ -3572,13 +3573,30 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition,
* @retval ::RSMI_STATUS_PERMISSION function requires root access
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function with the given arguments
* support this function
*
*/
rsmi_status_t
rsmi_dev_compute_partition_set(uint32_t dv_ind,
rsmi_compute_partition_type_t compute_partition);
/**
* @brief Reverts a selected device's compute partition setting back to its
* boot state.
*
* @details Given a device index @p dv_ind , this function will attempt to
* revert its compute partition setting back to its boot state.
*
* @param[in] dv_ind a device index
*
* @retval ::RSMI_STATUS_SUCCESS call was successful
* @retval ::RSMI_STATUS_PERMISSION function requires root access
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function
*
*/
rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind);
/** @} */ // end of ComputePartition
/*****************************************************************************/
@@ -3609,7 +3627,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
* @retval ::RSMI_STATUS_UNEXPECTED_DATA data provided to function is not valid
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function with the given arguments
* support this function
* @retval ::RSMI_STATUS_INSUFFICIENT_SIZE is returned if @p len bytes is not
* large enough to hold the entire nps mode value. In this case,
* only @p len bytes will be written.
@@ -3634,7 +3652,7 @@ rsmi_dev_nps_mode_get(uint32_t dv_ind, char *nps_mode, uint32_t len);
* @retval ::RSMI_STATUS_PERMISSION function requires root access
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function with the given arguments
* support this function
* @retval ::RSMI_STATUS_AMDGPU_RESTART_ERR could not successfully restart
* the amdgpu driver
*
@@ -3642,6 +3660,25 @@ rsmi_dev_nps_mode_get(uint32_t dv_ind, char *nps_mode, uint32_t len);
rsmi_status_t
rsmi_dev_nps_mode_set(uint32_t dv_ind, rsmi_nps_mode_type_t nps_mode);
/**
* @brief Reverts a selected device's NPS mode setting back to its
* boot state.
*
* @details Given a device index @p dv_ind , this function will attempt to
* revert its NPS mode setting back to its boot state.
*
* @param[in] dv_ind a device index
*
* @retval ::RSMI_STATUS_SUCCESS call was successful
* @retval ::RSMI_STATUS_PERMISSION function requires root access
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function
* @retval ::RSMI_STATUS_AMDGPU_RESTART_ERR could not successfully restart
* the amdgpu driver
*
*/
rsmi_status_t rsmi_dev_nps_mode_reset(uint32_t dv_ind);
/** @} */ // end of NPSMode
/*****************************************************************************/
+2
Просмотреть файл
@@ -217,6 +217,8 @@ class Device {
bool DeviceAPISupported(std::string name, uint64_t variant,
uint64_t sub_variant);
rsmi_status_t restartAMDGpuDriver(void);
rsmi_status_t storeDevicePartitions(uint32_t dv_ind);
template <typename T> std::string readBootPartitionState(uint32_t dv_ind);
private:
std::shared_ptr<Monitor> monitor_;
+11 -6
Просмотреть файл
@@ -66,18 +66,23 @@ namespace amd {
namespace smi {
pthread_mutex_t *GetMutex(uint32_t dv_ind);
int SameFile(const std::string fileA, const std::string fileB);
bool FileExists(char const *filename);
int isRegularFile(std::string fname, bool *is_reg);
int ReadSysfsStr(std::string path, std::string *retStr);
int WriteSysfsStr(std::string path, std::string val);
bool IsInteger(const std::string & n_str);
std::pair<bool, std::string> executeCommand(std::string command, bool stdOut = true);
std::pair<bool, std::string> executeCommand(std::string command,
bool stdOut = true);
rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName,
std::string stateName, std::string storageData);
std::vector<std::string> getListOfAppTmpFiles();
bool containsString(std::string originalString, std::string substring);
std::tuple<bool, std::string> readTmpFile(
uint32_t dv_ind,
std::string stateName,
std::string parameterName);
void displayAppTmpFilesContent(void);
rsmi_status_t handleException();
rsmi_status_t
GetDevValueVec(amd::smi::DevInfoTypes type,
+93 -3
Просмотреть файл
@@ -411,6 +411,30 @@ def getVersion(deviceList, component):
return None
def getComputePartition(device):
    """ Return the current compute partition of a given device

    Falls back to the string "UNKNOWN" when the query does not succeed or
    when the library hands back an empty string.

    @param device: DRM device identifier
    """
    partitionBuffer = create_string_buffer(256)
    ret = rocmsmi.rsmi_dev_compute_partition_get(device, partitionBuffer, 256)
    # Only look at the buffer once the call itself reported success
    if not rsmi_ret_ok(ret, device, silent=True):
        return "UNKNOWN"
    if not partitionBuffer.value.decode():
        return "UNKNOWN"
    return str(partitionBuffer.value.decode())
def getMemoryPartition(device):
    """ Return the current memory partition (NPS mode) of a given device

    Falls back to the string "UNKNOWN" when the query does not succeed or
    when the library hands back an empty string.

    @param device: DRM device identifier
    """
    npsModeBuffer = create_string_buffer(256)
    ret = rocmsmi.rsmi_dev_nps_mode_get(device, npsModeBuffer, 256)
    # Only look at the buffer once the call itself reported success
    if not rsmi_ret_ok(ret, device, silent=True):
        return "UNKNOWN"
    if not npsModeBuffer.value.decode():
        return "UNKNOWN"
    return str(npsModeBuffer.value.decode())
def print2DArray(dataArray):
""" Print 2D Array with uniform spacing """
global PRINT_JSON
@@ -773,6 +797,66 @@ def resetPerfDeterminism(deviceList):
printLogSpacer()
def resetComputePartition(deviceList):
    """ Reset Compute Partition to its boot state

    For every device the partition is read before the reset and, when the
    reset succeeds, read again so both values can be reported.

    @param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(" Reset compute partition to its boot state ")
    for device in deviceList:
        partitionBefore = getComputePartition(device)
        ret = rocmsmi.rsmi_dev_compute_partition_reset(device)
        if rsmi_ret_ok(ret, device, silent=True):
            partitionAfter = getComputePartition(device)
            successMsg = ("Successfully reset compute partition (" +
                          partitionBefore + ") to boot state (" +
                          partitionAfter + ")")
            printLog(device, successMsg, None)
            continue
        if ret == rsmi_status_t.RSMI_STATUS_PERMISSION:
            printLog(device, 'Permission denied', None)
            continue
        if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
            printLog(device, 'Not supported on the given system', None)
            continue
        # Any other status: run the non-silent check, then log the failure
        rsmi_ret_ok(ret, device)
        printErrLog(device, 'Failed to reset the compute partition to boot state')
    printLogSpacer()
def resetNpsMode(deviceList):
    """ Reset NPS mode to its boot state

    For every device the current NPS mode is read, the library reset call is
    issued while a progress bar runs in a separate process, and the outcome
    is logged.

    @param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(" Reset nps mode to its boot state ")
    for device in deviceList:
        # Remember the mode in effect before the reset so it can be reported
        originalPartition = getMemoryPartition(device)
        # Progress bar runs in its own process while the reset call blocks.
        # NOTE(review): 13 is presumably the progress bar duration - confirm
        # against showProgressbar.
        t1 = multiprocessing.Process(target=showProgressbar,
                            args=("Resetting NPS mode",13,))
        t1.start()
        addExtraLine=True
        start=time.time()
        ret = rocmsmi.rsmi_dev_nps_mode_reset(device)
        stop=time.time()
        duration=stop-start
        # Stop the progress bar if it is still running once the call returned
        if t1.is_alive():
            t1.terminate()
            t1.join()
        if duration < float(0.1): # For longer runs, add extra line before output
            addExtraLine=False # This is to prevent overriding progress bar
        if rsmi_ret_ok(ret, device, silent=True):
            # Re-read the mode so the message shows what the reset produced
            resetBootState = getMemoryPartition(device)
            printLog(device, "Successfully reset nps mode (" +
                originalPartition + ") to boot state (" +
                resetBootState + ")", None, addExtraLine)
        elif ret == rsmi_status_t.RSMI_STATUS_PERMISSION:
            printLog(device, 'Permission denied', None, addExtraLine)
        elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
            printLog(device, 'Not supported on the given system', None, addExtraLine)
        else:
            # Any other status: run the non-silent check, then log the failure
            rsmi_ret_ok(ret, device)
            printErrLog(device, 'Failed to reset nps mode to boot state')
    printLogSpacer()
def setClockRange(deviceList, clkType, minvalue, maxvalue, autoRespond):
""" Set the range for the specified clktype in the PowerPlay table for a list of devices.
@@ -3228,7 +3312,7 @@ if __name__ == '__main__':
action='store_true')
groupDisplay.add_argument('--shownodesbw', help='Shows the numa nodes ', action='store_true')
groupDisplay.add_argument('--showcomputepartition', help='Shows current compute partitioning ', action='store_true')
groupDisplay.add_argument('--shownpsmode', help='Shows current nps mode ', action='store_true')
groupDisplay.add_argument('--shownpsmode', help='Shows current NPS mode ', action='store_true')
groupActionReset.add_argument('-r', '--resetclocks', help='Reset clocks and OverDrive to default',
action='store_true')
@@ -3238,7 +3322,9 @@ if __name__ == '__main__':
help='Set the maximum GPU power back to the device deafult state',
action='store_true')
groupActionReset.add_argument('--resetxgmierr', help='Reset XGMI error count', action='store_true')
groupAction.add_argument('--resetperfdeterminism', help='Disable performance determinism', action='store_true')
groupActionReset.add_argument('--resetperfdeterminism', help='Disable performance determinism', action='store_true')
groupActionReset.add_argument('--resetcomputepartition', help='Resets to boot compute partition state', action='store_true')
groupActionReset.add_argument('--resetnpsmode', help='Resets to boot NPS mode state', action='store_true')
groupAction.add_argument('--setclock',
help='Set Clock Frequency Level(s) for specified clock (requires manual Perf level)',
metavar=('TYPE','LEVEL'), nargs=2)
@@ -3317,7 +3403,7 @@ if __name__ == '__main__':
or args.setpoweroverdrive or args.resetpoweroverdrive or args.rasenable or args.rasdisable or \
args.rasinject or args.gpureset or args.setperfdeterminism or args.setslevel or args.setmlevel or \
args.setvc or args.setsrange or args.setmrange or args.setclock or \
args.setcomputepartition or args.setnpsmode:
args.setcomputepartition or args.setnpsmode or args.resetcomputepartition or args.resetnpsmode:
relaunchAsSudo()
# If there is one or more device specified, use that for all commands, otherwise use a
@@ -3561,6 +3647,10 @@ if __name__ == '__main__':
resetXgmiErr(deviceList)
if args.resetperfdeterminism:
resetPerfDeterminism(deviceList)
if args.resetcomputepartition:
resetComputePartition(deviceList)
if args.resetnpsmode:
resetNpsMode(deviceList)
if args.rasenable:
setRas(deviceList, 'enable', args.rasenable[0], args.rasenable[1])
if args.rasdisable:
Двоичный файл не отображается.
+30 -21
Просмотреть файл
@@ -83,11 +83,13 @@
} \
}
#define CHK_RSMI_PERM_RET(RET) { \
#define CHK_FILE_PERMISSIONS(RET) { \
if ((RET) == RSMI_STATUS_PERMISSION) { \
std::cout << "This command requires root access." << std::endl; \
if (isFileWritable(RET)) { \
CHK_RSMI_RET(RET) \
} \
} else { \
CHK_RSMI_RET_I(RET) \
CHK_RSMI_RET(RET) \
} \
}
@@ -229,7 +231,7 @@ static bool isUserRunningAsSudo() {
bool isRunningWithSudo = false;
auto myUID = getuid();
auto myPrivledges = geteuid();
if (myUID == myPrivledges) {
if ((myUID == myPrivledges) && (myPrivledges == 0)) {
isRunningWithSudo = true;
}
return isRunningWithSudo;
@@ -243,7 +245,6 @@ static bool isFileWritable(rsmi_status_t response) {
// response situation.
bool fileWritable = true;
if (isUserRunningAsSudo() && (response == RSMI_STATUS_PERMISSION)) {
PRINT_RSMI_ERR(response)
std::cout << "[WARN] User is running with sudo "
<< "permissions, file is not writable." << std::endl;
fileWritable = false;
@@ -511,18 +512,18 @@ static rsmi_status_t test_set_freq(uint32_t dv_ind) {
" to 0b" << freq_bm_str << " ..." << std::endl;
ret = rsmi_dev_gpu_clk_freq_set(dv_ind, rsmi_clk, freq_bitmask);
isFileWritable(ret);
CHK_FILE_PERMISSIONS(ret)
ret = rsmi_dev_gpu_clk_freq_get(dv_ind, rsmi_clk, &f);
CHK_AND_PRINT_RSMI_ERR_RET(ret)
CHK_FILE_PERMISSIONS(ret)
std::cout << "Frequency is now index " << f.current << std::endl;
std::cout << "Resetting mask to all frequencies." << std::endl;
ret = rsmi_dev_gpu_clk_freq_set(dv_ind, rsmi_clk, 0xFFFFFFFF);
isFileWritable(ret);
CHK_FILE_PERMISSIONS(ret)
ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_AUTO);
isFileWritable(ret);
CHK_FILE_PERMISSIONS(ret)
}
std::cout << std::endl;
return RSMI_STATUS_SUCCESS;
@@ -576,15 +577,20 @@ static rsmi_status_t test_set_compute_partitioning(uint32_t dv_ind) {
std::cout << std::endl << std::endl;
}
std::cout << "About to initate compute partition reset..." << std::endl;
ret = rsmi_dev_compute_partition_reset(dv_ind);
CHK_RSMI_NOT_SUPPORTED_RET(ret)
std::cout << "Done resetting compute partition." << std::endl;
std::string myComputePartition = originalComputePartition;
if (myComputePartition.empty() == false) {
std::cout << "Resetting compute partition to " << originalComputePartition
<< "... " << std::endl;
std::cout << "Resetting back to original compute partition to "
<< originalComputePartition << "... " << std::endl;
rsmi_compute_partition_type origComputePartitionType
= mapStringToRSMIComputePartitionTypes[originalComputePartition];
ret = rsmi_dev_compute_partition_set(dv_ind, origComputePartitionType);
CHK_RSMI_NOT_SUPPORTED_RET(ret)
std::cout << "Done" << std::endl;
ret = rsmi_dev_compute_partition_set(dv_ind, origComputePartitionType);
}
return RSMI_STATUS_SUCCESS;
}
@@ -629,15 +635,20 @@ static rsmi_status_t test_set_nps_mode(uint32_t dv_ind) {
std::cout << std::endl << std::endl;
}
std::cout << "About to initate nps mode reset..." << std::endl;
ret = rsmi_dev_nps_mode_reset(dv_ind);
CHK_RSMI_NOT_SUPPORTED_RET(ret)
std::cout << "Done resetting nps mode." << std::endl;
std::string myNpsMode = originalNpsMode;
if (myNpsMode.empty() == false) {
std::cout << "Resetting compute partition to " << originalNpsMode
<< "... " << std::endl;
rsmi_nps_mode_type_t origNpsModeType
= mapStringToRSMINpsModeTypes[originalNpsMode];
ret = rsmi_dev_nps_mode_set(dv_ind, origNpsModeType);
CHK_RSMI_NOT_SUPPORTED_RET(ret)
std::cout << "Done" << std::endl;
ret = rsmi_dev_nps_mode_set(dv_ind, origNpsModeType);
}
return RSMI_STATUS_SUCCESS;
}
@@ -664,10 +675,6 @@ int main() {
CHK_RSMI_RET_I(ret)
std::cout << "\t**Device ID: 0x" << std::hex << val_ui64 << std::endl;
std::cout << std::endl << std::endl;
std::cout << "Starting to call "
<< "rsmi_dev_compute_partition_get()..."
<< std::endl;
char current_compute_partition[256];
current_compute_partition[0] = '\0';
ret = rsmi_dev_compute_partition_get(i, current_compute_partition, 256);
@@ -679,10 +686,6 @@ int main() {
? "UNKNOWN" : current_compute_partition)
<< std::endl;
std::cout << std::endl << std::endl;
std::cout << "Starting to call "
<< "rsmi_dev_nps_mode_get()..."
<< std::endl;
uint32_t len = 5;
char nps_mode[len];
nps_mode[0] = '\0';
@@ -764,6 +767,12 @@ int main() {
}
std::cout << "***** Testing write api's" << std::endl;
if (isUserRunningAsSudo() == false) {
std::cout << "Write APIs require users to execute with sudo. "
<< "Cannot proceed." << std::endl;
return 0;
}
for (uint32_t i = 0; i< num_monitor_devs; ++i) {
ret = test_set_overdrive(i);
CHK_AND_PRINT_RSMI_ERR_RET(ret)
+41
Просмотреть файл
@@ -3986,6 +3986,47 @@ rsmi_dev_nps_mode_get(uint32_t dv_ind, char *nps_mode,
CATCH
}
// Reverts a device's compute partition to the value recorded as its boot
// state.
//
// dv_ind - device index
//
// Returns:
//   RSMI_STATUS_NOT_SUPPORTED when the recorded boot state is "UNKNOWN" or is
//   not a key of mapStringToRSMIComputePartitionTypes (for example an empty
//   string when nothing usable was recorded); otherwise whatever
//   rsmi_dev_compute_partition_set() returns for the recorded partition.
//
// TRY/CATCH, REQUIRE_ROOT_ACCESS, DEVICE_MUTEX and GET_DEV_FROM_INDX are
// project macros defined outside this block; `dev` is presumably introduced
// by GET_DEV_FROM_INDX.
rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind) {
  TRY
  REQUIRE_ROOT_ACCESS
  DEVICE_MUTEX
  GET_DEV_FROM_INDX

  // read temp file
  const std::string bootState =
      dev->readBootPartitionState<rsmi_compute_partition_type_t>(dv_ind);

  // If bootState is UNKNOWN, we cannot reset - likely due to the device not
  // supporting partitioning.
  if (bootState == "UNKNOWN") {
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  // Use find() rather than operator[]: operator[] would silently insert a
  // default entry for an unrecognised string and the setter would then be
  // called with a value-initialised partition type.
  const auto bootEntry = mapStringToRSMIComputePartitionTypes.find(bootState);
  if (bootEntry == mapStringToRSMIComputePartitionTypes.end()) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  // Initiate reset
  return rsmi_dev_compute_partition_set(dv_ind, bootEntry->second);
  CATCH
}
// Reverts a device's NPS mode to the value recorded as its boot state.
//
// dv_ind - device index
//
// Returns:
//   RSMI_STATUS_NOT_SUPPORTED when the recorded boot state is "UNKNOWN" or is
//   not a key of mapStringToNPSModeTypes (for example an empty string when
//   nothing usable was recorded); otherwise whatever rsmi_dev_nps_mode_set()
//   returns for the recorded mode.
//
// TRY/CATCH, REQUIRE_ROOT_ACCESS, DEVICE_MUTEX and GET_DEV_FROM_INDX are
// project macros defined outside this block; `dev` is presumably introduced
// by GET_DEV_FROM_INDX.
rsmi_status_t rsmi_dev_nps_mode_reset(uint32_t dv_ind) {
  TRY
  REQUIRE_ROOT_ACCESS
  DEVICE_MUTEX
  GET_DEV_FROM_INDX

  // read temp file
  const std::string bootState =
      dev->readBootPartitionState<rsmi_nps_mode_type_t>(dv_ind);

  // If bootState is UNKNOWN, we cannot reset - likely due to the device not
  // supporting NPS modes.
  if (bootState == "UNKNOWN") {
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  // Use find() rather than operator[]: operator[] would silently insert a
  // default entry for an unrecognised string and the setter would then be
  // called with a value-initialised NPS mode.
  const auto bootEntry = mapStringToNPSModeTypes.find(bootState);
  if (bootEntry == mapStringToNPSModeTypes.end()) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  // Initiate reset
  return rsmi_dev_nps_mode_set(dv_ind, bootEntry->second);
  CATCH
}
enum iterator_handle_type {
FUNC_ITER = 0,
VARIANT_ITER,
+123
Просмотреть файл
@@ -48,6 +48,7 @@
#include <assert.h>
#include <sys/stat.h>
#include <stdint.h>
#include <string>
#include <map>
#include <fstream>
@@ -1146,6 +1147,128 @@ rsmi_status_t Device::restartAMDGpuDriver(void) {
RSMI_STATUS_AMDGPU_RESTART_ERR);
}
template <typename T> rsmi_status_t storeParameter(uint32_t dv_ind);

// Stores a device's current compute partition as its "boot" value, unless a
// boot value has already been stored for that device.
// Uses template specialization to restrict which rsmi types can be stored
// and to select the calls needed to read the value.
// typename - restricted to
//            rsmi_compute_partition_type_t or rsmi_nps_mode_type_t
// dv_ind - device index
// Returns:
//   RSMI_STATUS_SUCCESS if a value is already stored, or if the value was
//   read (or reported as not supported) and stored. "UNKNOWN" is stored
//   whenever the value could not be read.
//   A storeTmpFile() failure takes precedence over a read failure.
template <>
rsmi_status_t storeParameter<rsmi_compute_partition_type_t>(uint32_t dv_ind) {
  bool doesFileExist;
  std::tie(doesFileExist, std::ignore) = readTmpFile(dv_ind, "boot",
                                                     "compute_partition");
  // if temporary file exists -> we do not need to store anything new
  if (doesFileExist) {
    return RSMI_STATUS_SUCCESS;
  }

  // Compile-time bound: a runtime-sized char array is a non-standard
  // extension in C++. Zero-filled so it always holds a terminated string.
  const uint32_t kLength = 128;
  char data[kLength] = {'\0'};
  const rsmi_status_t ret =
      rsmi_dev_compute_partition_get(dv_ind, data, kLength);
  const bool wasRead = (ret == RSMI_STATUS_SUCCESS);

  const rsmi_status_t storeRet = storeTmpFile(
      dv_ind, "compute_partition", "boot", wasRead ? data : "UNKNOWN");
  if (storeRet != RSMI_STATUS_SUCCESS) {
    // file storage err takes precedence over other errors
    return storeRet;
  }
  if (wasRead || ret == RSMI_STATUS_NOT_SUPPORTED) {
    // not supported is ok
    return RSMI_STATUS_SUCCESS;
  }
  return ret;
}
// Stores a device's current NPS mode as its "boot" value, unless a boot
// value has already been stored for that device.
// Uses template specialization to restrict which rsmi types can be stored
// and to select the calls needed to read the value.
// typename - restricted to
//            rsmi_compute_partition_type_t or rsmi_nps_mode_type_t
// dv_ind - device index
// Returns:
//   RSMI_STATUS_SUCCESS if a value is already stored, or if the value was
//   read (or reported as not supported) and stored. "UNKNOWN" is stored
//   whenever the value could not be read.
//   A storeTmpFile() failure takes precedence over a read failure.
template <> rsmi_status_t storeParameter<rsmi_nps_mode_type_t>(uint32_t dv_ind) {
  bool doesFileExist;
  std::tie(doesFileExist, std::ignore) = readTmpFile(dv_ind, "boot",
                                                     "nps_mode");
  // if temporary file exists -> we do not need to store anything new
  if (doesFileExist) {
    return RSMI_STATUS_SUCCESS;
  }

  // Compile-time bound: a runtime-sized char array is a non-standard
  // extension in C++. Zero-filled so it always holds a terminated string.
  const uint32_t kLength = 128;
  char data[kLength] = {'\0'};
  const rsmi_status_t ret = rsmi_dev_nps_mode_get(dv_ind, data, kLength);
  const bool wasRead = (ret == RSMI_STATUS_SUCCESS);

  const rsmi_status_t storeRet = storeTmpFile(
      dv_ind, "nps_mode", "boot", wasRead ? data : "UNKNOWN");
  if (storeRet != RSMI_STATUS_SUCCESS) {
    // file storage err takes precedence over other errors
    return storeRet;
  }
  if (wasRead || ret == RSMI_STATUS_NOT_SUPPORTED) {
    // not supported is ok
    return RSMI_STATUS_SUCCESS;
  }
  return ret;
}
// Stores both the compute partition and the NPS mode boot values for a
// device. Both stores are always attempted; when both fail, the compute
// partition status (the earlier one) is the one reported.
// dv_ind - device index
rsmi_status_t Device::storeDevicePartitions(uint32_t dv_ind) {
  const rsmi_status_t computeRet =
      storeParameter<rsmi_compute_partition_type_t>(dv_ind);
  const rsmi_status_t npsRet = storeParameter<rsmi_nps_mode_type_t>(dv_ind);
  // only record earliest error
  return (computeRet != RSMI_STATUS_SUCCESS) ? computeRet : npsRet;
}
// Returns the text stored as the boot compute partition for a device.
// Template specialization limits which rsmi types may be requested and
// selects which stored parameter is read.
// typename - restricted to rsmi_compute_partition_type_t
//            or rsmi_nps_mode_type_t
// dv_ind - device index
template <>
std::string Device::readBootPartitionState<rsmi_compute_partition_type_t>(
    uint32_t dv_ind) {
  // Only the content half of the (exists, content) result is of interest
  return std::get<1>(readTmpFile(dv_ind, "boot", "compute_partition"));
}
// Returns the text stored as the boot NPS mode for a device.
// Template specialization limits which rsmi types may be requested and
// selects which stored parameter is read.
// typename - restricted to rsmi_compute_partition_type_t
//            or rsmi_nps_mode_type_t
// dv_ind - device index
template <>
std::string Device::readBootPartitionState<rsmi_nps_mode_type_t>(
    uint32_t dv_ind) {
  // Only the content half of the (exists, content) result is of interest
  return std::get<1>(readTmpFile(dv_ind, "boot", "nps_mode"));
}
#undef RET_IF_NONZERO
} // namespace smi
} // namespace amd
+6
Просмотреть файл
@@ -373,6 +373,7 @@ RocmSMI::Initialize(uint64_t flags) {
// 1. construct kfd_node_map_ with gpu_id as key and *Device as value
// 2. for each kfd node, write the corresponding dv_ind
// 3. for each amdgpu device, write the corresponding gpu_id
// 4. for each amdgpu device, attempt to store its boot partition
for (uint32_t dv_ind = 0; dv_ind < devices_.size(); ++dv_ind) {
dev = devices_[dv_ind];
uint64_t bdfid = dev->bdfid();
@@ -387,7 +388,12 @@ RocmSMI::Initialize(uint64_t flags) {
uint64_t gpu_id = tmp_map[bdfid]->gpu_id();
dev->set_kfd_gpu_id(gpu_id);
kfd_node_map_[gpu_id] = tmp_map[bdfid];
// store each device boot partition state, if file doesn't exist
dev->storeDevicePartitions(dv_ind);
}
// Leaving below to help debug temp file issues
// displayAppTmpFilesContent();
}
void
+138
Просмотреть файл
@@ -43,6 +43,8 @@
#include <assert.h>
#include <errno.h>
#include <sys/stat.h>
#include <unistd.h>
#include <dirent.h>
#include <fstream>
#include <string>
@@ -61,6 +63,7 @@
namespace amd {
namespace smi {
const std::string kTmpFilePrefix = "rocmsmi_";
// Return 0 if same file, 1 if not, and -1 for error
int SameFile(const std::string fileA, const std::string fileB) {
@@ -298,5 +301,140 @@ std::pair<bool, std::string> executeCommand(std::string command, bool stdOut) {
return std::make_pair(successfulRun, stdoutAndErr);
}
// Reports whether one string occurs anywhere inside another.
// originalString - text to be searched
// substring - text to look for
// Returns true when substring is found within originalString.
bool containsString(std::string originalString, std::string substring) {
  const std::string::size_type foundAt = originalString.find(substring);
  return foundAt != std::string::npos;
}
// Creates and stores supplied data into a temporary file (within /tmp/).
// All temporary files are removed upon reboot.
// Allows all users/groups to read the temporary file.
//
// For more detail, refer to mkstemp manpage:
// https://man7.org/linux/man-pages/man3/mkstemp.3.html
//
// Temporary file name format:
// <app prefix>_<state name>_<parameter name>_<device id>
// <app prefix> - prefix for our application's identifier (see kTmpFilePrefix)
// <parameter name> - name of parameter being stored
// <state name> - state at which the stored value captures
// <device index> - device identifier
//
// dv_ind - device index
// parameterName - name of parameter stored
// stateName - state at which the stored value captures
// storageData - string value of data to be stored
//
// Returns:
//   RSMI_STATUS_SUCCESS if a matching file already exists (nothing is
//   written) or if the data was stored completely.
//   RSMI_STATUS_FILE_ERROR if the file could not be created, made readable,
//   fully written or closed; a file left incomplete is removed again.
//
// NOTE(review): the file lives in /tmp and is later located by name through
// readTmpFile(); confirm that trusting content found there is acceptable.
rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName,
                           std::string stateName, std::string storageData) {
  // Required tags needed to store our files
  // Files name format:
  // <app prefix>_<stateName>_<parameterName>_<device id>
  std::string fullFileName = kTmpFilePrefix + stateName + "_" +
                             parameterName + "_" + std::to_string(dv_ind);
  bool doesFileExist;
  std::tie(doesFileExist, std::ignore) =
      readTmpFile(dv_ind, stateName, parameterName);
  if (doesFileExist) {
    // do not store, if file already exists
    return RSMI_STATUS_SUCCESS;
  }

  // template for our file; mkstemp replaces the trailing XXXXXX in place
  std::string fullTempFilePath = "/tmp/" + fullFileName + ".XXXXXX";
  char *fileName = &fullTempFilePath[0];
  int fd = mkstemp(fileName);
  if (fd == -1) {
    return RSMI_STATUS_FILE_ERROR;
  }

  // Change the mode through the descriptor so it applies to the file that
  // was just created. The descriptor stays writable after this call.
  bool stored = (fchmod(fd, S_IRUSR|S_IRGRP|S_IROTH) == 0);

  // write() may store fewer bytes than requested, so loop until done
  const char *cursor = storageData.c_str();
  size_t remaining = storageData.size();
  while (stored && remaining > 0) {
    const ssize_t written = write(fd, cursor, remaining);
    if (written > 0) {
      cursor += written;
      remaining -= static_cast<size_t>(written);
    } else if (written < 0 && errno == EINTR) {
      continue;  // interrupted before any data was written, retry
    } else {
      stored = false;
    }
  }
  if (close(fd) != 0) {
    stored = false;
  }

  if (!stored) {
    // Existing files are never overwritten, so a partial file would be
    // treated as the stored value from now on - remove it instead.
    unlink(fileName);
    return RSMI_STATUS_FILE_ERROR;
  }
  return RSMI_STATUS_SUCCESS;
}
// Lists the entries directly under /tmp whose name contains the
// application's temporary file prefix (see kTmpFilePrefix).
// Returns:
//   full paths ("/tmp/<entry name>") of all matching entries; empty if
//   /tmp could not be opened or nothing matched.
std::vector<std::string> getListOfAppTmpFiles() {
  const std::string path = "/tmp";
  std::vector<std::string> tmpFiles;

  DIR *dir = opendir(path.c_str());
  if (dir == nullptr) {
    return tmpFiles;
  }

  // captures all files & directories under specified path
  struct dirent *ent;
  while ((ent = readdir(dir)) != nullptr) {
    std::string fileDirName = ent->d_name;
    // we only want our app specific files
    if (containsString(fileDirName, kTmpFilePrefix)) {
      tmpFiles.emplace_back(path + "/" + fileDirName);
    }
  }

  // release the directory stream; it was previously never closed
  closedir(dir);
  return tmpFiles;
}
// Reads the first whitespace-delimited token from the file at `path`.
// If the file cannot be opened, an empty string is returned.
// If the file opens, the extracted token is returned (which could be an
// empty string).
std::string readTemporaryFile(std::string path) {
  std::ifstream source(path);
  if (!source.is_open()) {
    return std::string();
  }
  std::string token;
  source >> token;
  return token;
}
// Debugging aid: prints every application temporary file (identified by
// kTmpFilePrefix) together with the content read from it, or a single
// notice when no such file is found.
void displayAppTmpFilesContent() {
  const std::vector<std::string> appFiles = getListOfAppTmpFiles();
  if (appFiles.empty()) {
    std::cout << __PRETTY_FUNCTION__ << " | No temporary files were found"
              << std::endl;
    return;
  }
  for (const auto &filePath : appFiles) {
    const std::string content = readTemporaryFile(filePath);
    std::cout << __PRETTY_FUNCTION__ << " | Temporary file: " << filePath
              << "; Contained content: " << content << std::endl;
  }
}
// Attempts to read application specific temporary file
// This method is to be used for reading (or determining if it exists),
// in order to keep file naming scheme consistent.
//
// dv_ind - device index
// stateName - state at which the stored value captures
// parameterName - name of parameter stored
// Returns:
//   boolean - if temporary file exists
//   string - content of temporary file, if it exists (otherwise, an empty
//            string is returned)
std::tuple<bool, std::string> readTmpFile(uint32_t dv_ind,
                                          std::string stateName,
                                          std::string parameterName) {
  // Temporary files are created as
  //   /tmp/<prefix><state>_<parameter>_<device index>.XXXXXX
  // so anchor the search on the leading '/' and the '.' that follows the
  // device index. Without the '.', device 1 would also match the files
  // belonging to devices 10-19.
  const std::string tmpFileTag = "/" + kTmpFilePrefix + stateName + "_" +
                                 parameterName + "_" +
                                 std::to_string(dv_ind) + ".";

  const std::vector<std::string> tmpFiles = getListOfAppTmpFiles();
  for (const auto &candidate : tmpFiles) {
    if (containsString(candidate, tmpFileTag)) {
      // first match wins
      return std::make_tuple(true, readTemporaryFile(candidate));
    }
  }
  return std::make_tuple(false, std::string());
}
} // namespace smi
} // namespace amd
+42 -2
Просмотреть файл
@@ -269,13 +269,13 @@ void TestComputePartitionReadWrite::Run(void) {
<< computePartitionString(new_computePartition)
<< " ===============" << std::endl;
}
ret = rsmi_dev_compute_partition_set(dv_ind, new_computePartition);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Attempting to set compute partition to: "
<< computePartitionString(new_computePartition) << std::endl;
}
ret = rsmi_dev_compute_partition_set(dv_ind, new_computePartition);
CHK_ERR_ASRT(ret)
ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition,
255);
CHK_ERR_ASRT(ret)
@@ -290,6 +290,46 @@ void TestComputePartitionReadWrite::Run(void) {
current_char_computePartition);
}
/* TEST RETURN TO BOOT COMPUTE PARTITION SETTING */
IF_VERB(STANDARD) {
std::cout << std::endl;
std::cout << "\t**"
<< "=========== TEST RETURN TO BOOT COMPUTE PARTITION SETTING "
<< "========" << std::endl;
}
std::string oldPartition = current_char_computePartition;
bool wasResetSuccess = false;
ret = rsmi_dev_compute_partition_reset(dv_ind);
ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) ||
(ret == RSMI_STATUS_NOT_SUPPORTED));
if (ret == RSMI_STATUS_SUCCESS) {
wasResetSuccess = true;
}
ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition,
255);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Current compute partition: " << current_char_computePartition << std::endl;
}
if (wasResetSuccess) {
ASSERT_STRNE(oldPartition.c_str(), current_char_computePartition);
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Confirmed prior partition (" << oldPartition << ") is not "
<< "equal to current partition ("
<< current_char_computePartition << ")" << std::endl;
}
} else {
ASSERT_STREQ(oldPartition.c_str(), current_char_computePartition);
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Confirmed prior partition (" << oldPartition << ") is equal"
<< " to current partition ("
<< current_char_computePartition << ")" << std::endl;
}
}
/* TEST RETURN TO ORIGINAL COMPUTE PARTITIONING SETTING */
IF_VERB(STANDARD) {
std::cout << std::endl;
+50 -10
Просмотреть файл
@@ -221,11 +221,11 @@ void TestNPSModeReadWrite::Run(void) {
EXPECT_TRUE((err == RSMI_STATUS_INVALID_ARGS) ||
(err == RSMI_STATUS_NOT_SUPPORTED) ||
(err == RSMI_STATUS_PERMISSION));
if (err == RSMI_STATUS_INVALID_ARGS) {
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Confirmed RSMI_STATUS_INVALID_ARGS was returned."
<< std::endl;
if (err == RSMI_STATUS_INVALID_ARGS) {
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Confirmed RSMI_STATUS_INVALID_ARGS was returned."
<< std::endl;
} else if (err == RSMI_STATUS_PERMISSION) {
DISPLAY_RSMI_ERR(err)
// tests should not continue if err is a permission issue
@@ -251,13 +251,14 @@ void TestNPSModeReadWrite::Run(void) {
<< npsModeString(new_nps_mode)
<< " ===============" << std::endl;
}
ret = rsmi_dev_nps_mode_set(dv_ind, new_nps_mode);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Attempting to set nps mode to: "
<< npsModeString(new_nps_mode) << std::endl;
}
ret = rsmi_dev_nps_mode_set(dv_ind, new_nps_mode);
CHK_ERR_ASRT(ret)
ret = rsmi_dev_nps_mode_get(dv_ind, current_nps_mode, 255);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
@@ -268,6 +269,45 @@ void TestNPSModeReadWrite::Run(void) {
EXPECT_STREQ(npsModeString(new_nps_mode).c_str(), current_nps_mode);
}
/* TEST RETURN TO BOOT NPS MODE SETTING */
IF_VERB(STANDARD) {
std::cout << std::endl;
std::cout << "\t**"
<< "=========== TEST RETURN TO BOOT NPS MODE SETTING "
<< "========" << std::endl;
}
std::string oldMode = current_nps_mode;
bool wasResetSuccess = false;
ret = rsmi_dev_nps_mode_reset(dv_ind);
ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) ||
(ret == RSMI_STATUS_NOT_SUPPORTED));
if (ret == RSMI_STATUS_SUCCESS) {
wasResetSuccess = true;
}
ret = rsmi_dev_nps_mode_get(dv_ind, current_nps_mode, 255);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Current nps mode: " << current_nps_mode << std::endl;
}
if (wasResetSuccess) {
ASSERT_STRNE(oldMode.c_str(), current_nps_mode);
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Confirmed prior nps mode (" << oldMode << ") is not "
<< "equal to current nps mode ("
<< current_nps_mode << ")" << std::endl;
}
} else {
ASSERT_STREQ(oldMode.c_str(), current_nps_mode);
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Confirmed prior nps mode (" << oldMode << ") is equal"
<< " to current nps mode ("
<< current_nps_mode << ")" << std::endl;
}
}
/* TEST RETURN TO ORIGINAL NPS MODE SETTING */
IF_VERB(STANDARD) {
std::cout << std::endl;
@@ -277,18 +317,18 @@ void TestNPSModeReadWrite::Run(void) {
}
new_nps_mode
= mapStringToRSMINpsModeTypes.at(orig_nps_mode);
ret = rsmi_dev_nps_mode_set(dv_ind, new_nps_mode);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "\t**" << "Returning nps mode to: "
<< npsModeString(new_nps_mode) << std::endl;
}
ret = rsmi_dev_nps_mode_set(dv_ind, new_nps_mode);
CHK_ERR_ASRT(ret)
ret = rsmi_dev_nps_mode_get(dv_ind, current_nps_mode, 255);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "\t**" << "Attempted to set nps mode: "
<< npsModeString(new_nps_mode) << std::endl
<< "\t**" << "Current compute partition: " << current_nps_mode
<< "\t**" << "Current nps mode: " << current_nps_mode
<< std::endl;
}
EXPECT_EQ(RSMI_STATUS_SUCCESS, ret);