diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index c478c045b8..3a80112264 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -3540,12 +3540,13 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst, * which the device's current compute partition will be written to. * * @param[in] len the length of the caller provided buffer @p compute_partition + * , suggested length is 4 or greater. * * @retval ::RSMI_STATUS_SUCCESS call was successful * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid * @retval ::RSMI_STATUS_UNEXPECTED_DATA data provided to function is not valid * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not - * support this function with the given arguments + * support this function * @retval ::RSMI_STATUS_INSUFFICIENT_SIZE is returned if @p len bytes is not * large enough to hold the entire compute partition value. In this case, * only @p len bytes will be written. @@ -3572,13 +3573,30 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition, * @retval ::RSMI_STATUS_PERMISSION function requires root access * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not - * support this function with the given arguments + * support this function * */ rsmi_status_t rsmi_dev_compute_partition_set(uint32_t dv_ind, rsmi_compute_partition_type_t compute_partition); +/** + * @brief Reverts a selected device's compute partition setting back to its + * boot state. + * + * @details Given a device index @p dv_ind , this function will attempt to + * revert its compute partition setting back to its boot state. 
+ * + * @param[in] dv_ind a device index + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_PERMISSION function requires root access + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + * + */ +rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind); + /** @} */ // end of ComputePartition /*****************************************************************************/ @@ -3609,7 +3627,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid * @retval ::RSMI_STATUS_UNEXPECTED_DATA data provided to function is not valid * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not - * support this function with the given arguments + * support this function * @retval ::RSMI_STATUS_INSUFFICIENT_SIZE is returned if @p len bytes is not * large enough to hold the entire nps mode value. In this case, * only @p len bytes will be written. @@ -3634,7 +3652,7 @@ rsmi_dev_nps_mode_get(uint32_t dv_ind, char *nps_mode, uint32_t len); * @retval ::RSMI_STATUS_PERMISSION function requires root access * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not - * support this function with the given arguments + * support this function * @retval ::RSMI_STATUS_AMDGPU_RESTART_ERR could not successfully restart * the amdgpu driver * @@ -3642,6 +3660,25 @@ rsmi_dev_nps_mode_get(uint32_t dv_ind, char *nps_mode, uint32_t len); rsmi_status_t rsmi_dev_nps_mode_set(uint32_t dv_ind, rsmi_nps_mode_type_t nps_mode); +/** + * @brief Reverts a selected device's NPS mode setting back to its + * boot state. + * + * @details Given a device index @p dv_ind , this function will attempt to + * revert its NPS mode setting back to its boot state. 
+ * + * @param[in] dv_ind a device index + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_PERMISSION function requires root access + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + * @retval ::RSMI_STATUS_AMDGPU_RESTART_ERR could not successfully restart + * the amdgpu driver + * + */ +rsmi_status_t rsmi_dev_nps_mode_reset(uint32_t dv_ind); + /** @} */ // end of NPSMode /*****************************************************************************/ diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h index 6b7b3baeca..e3c94164ac 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h @@ -217,6 +217,8 @@ class Device { bool DeviceAPISupported(std::string name, uint64_t variant, uint64_t sub_variant); rsmi_status_t restartAMDGpuDriver(void); + rsmi_status_t storeDevicePartitions(uint32_t dv_ind); + template std::string readBootPartitionState(uint32_t dv_ind); private: std::shared_ptr monitor_; diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h index 527342d544..f2f8d6ab8f 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h @@ -66,18 +66,23 @@ namespace amd { namespace smi { pthread_mutex_t *GetMutex(uint32_t dv_ind); - int SameFile(const std::string fileA, const std::string fileB); bool FileExists(char const *filename); int isRegularFile(std::string fname, bool *is_reg); - int ReadSysfsStr(std::string path, std::string *retStr); int WriteSysfsStr(std::string path, std::string val); - bool IsInteger(const std::string & n_str); - -std::pair executeCommand(std::string command, bool stdOut = true); - +std::pair executeCommand(std::string command, + bool stdOut = true); 
+rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName, + std::string stateName, std::string storageData); +std::vector getListOfAppTmpFiles(); +bool containsString(std::string originalString, std::string substring); +std::tuple readTmpFile( + uint32_t dv_ind, + std::string stateName, + std::string parameterName); +void displayAppTmpFilesContent(void); rsmi_status_t handleException(); rsmi_status_t GetDevValueVec(amd::smi::DevInfoTypes type, diff --git a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py index d235738703..cae7e0cef0 100755 --- a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py +++ b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py @@ -411,6 +411,30 @@ def getVersion(deviceList, component): return None +def getComputePartition(device): + """ Return the current compute partition of a given device + + @param device: DRM device identifier + """ + currentComputePartition = create_string_buffer(256) + ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, 256) + if rsmi_ret_ok(ret, device, silent=True) and currentComputePartition.value.decode(): + return str(currentComputePartition.value.decode()) + return "UNKNOWN" + + +def getMemoryPartition(device): + """ Return the current memory partition of a given device + + @param device: DRM device identifier + """ + currentNPSMode = create_string_buffer(256) + ret = rocmsmi.rsmi_dev_nps_mode_get(device, currentNPSMode, 256) + if rsmi_ret_ok(ret, device, silent=True) and currentNPSMode.value.decode(): + return str(currentNPSMode.value.decode()) + return "UNKNOWN" + + def print2DArray(dataArray): """ Print 2D Array with uniform spacing """ global PRINT_JSON @@ -773,6 +797,66 @@ def resetPerfDeterminism(deviceList): printLogSpacer() +def resetComputePartition(deviceList): + """ Reset Compute Partition to its boot state + + @param deviceList: List of DRM devices (can be a single-item list) + """ + printLogSpacer(" Reset 
compute partition to its boot state ") + for device in deviceList: + originalPartition = getComputePartition(device) + ret = rocmsmi.rsmi_dev_compute_partition_reset(device) + if rsmi_ret_ok(ret, device, silent=True): + resetBootState = getComputePartition(device) + printLog(device, "Successfully reset compute partition (" + + originalPartition + ") to boot state (" + resetBootState + + ")", None) + elif ret == rsmi_status_t.RSMI_STATUS_PERMISSION: + printLog(device, 'Permission denied', None) + elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: + printLog(device, 'Not supported on the given system', None) + else: + rsmi_ret_ok(ret, device) + printErrLog(device, 'Failed to reset the compute partition to boot state') + printLogSpacer() + + +def resetNpsMode(deviceList): + """ Reset NPS mode to its boot state + + @param deviceList: List of DRM devices (can be a single-item list) + """ + printLogSpacer(" Reset nps mode to its boot state ") + for device in deviceList: + originalPartition = getMemoryPartition(device) + t1 = multiprocessing.Process(target=showProgressbar, + args=("Resetting NPS mode",13,)) + t1.start() + addExtraLine=True + start=time.time() + ret = rocmsmi.rsmi_dev_nps_mode_reset(device) + stop=time.time() + duration=stop-start + if t1.is_alive(): + t1.terminate() + t1.join() + if duration < float(0.1): # For longer runs, add extra line before output + addExtraLine=False # This is to prevent overriding progress bar + if rsmi_ret_ok(ret, device, silent=True): + resetBootState = getMemoryPartition(device) + printLog(device, "Successfully reset nps mode (" + + originalPartition + ") to boot state (" + + resetBootState + ")", None, addExtraLine) + elif ret == rsmi_status_t.RSMI_STATUS_PERMISSION: + printLog(device, 'Permission denied', None, addExtraLine) + elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: + printLog(device, 'Not supported on the given system', None, addExtraLine) + else: + rsmi_ret_ok(ret, device) + printErrLog(device, 'Failed to 
reset nps mode to boot state') + printLogSpacer() + + def setClockRange(deviceList, clkType, minvalue, maxvalue, autoRespond): """ Set the range for the specified clktype in the PowerPlay table for a list of devices. @@ -3228,7 +3312,7 @@ if __name__ == '__main__': action='store_true') groupDisplay.add_argument('--shownodesbw', help='Shows the numa nodes ', action='store_true') groupDisplay.add_argument('--showcomputepartition', help='Shows current compute partitioning ', action='store_true') - groupDisplay.add_argument('--shownpsmode', help='Shows current nps mode ', action='store_true') + groupDisplay.add_argument('--shownpsmode', help='Shows current NPS mode ', action='store_true') groupActionReset.add_argument('-r', '--resetclocks', help='Reset clocks and OverDrive to default', action='store_true') @@ -3238,7 +3322,9 @@ if __name__ == '__main__': help='Set the maximum GPU power back to the device deafult state', action='store_true') groupActionReset.add_argument('--resetxgmierr', help='Reset XGMI error count', action='store_true') - groupAction.add_argument('--resetperfdeterminism', help='Disable performance determinism', action='store_true') + groupActionReset.add_argument('--resetperfdeterminism', help='Disable performance determinism', action='store_true') + groupActionReset.add_argument('--resetcomputepartition', help='Resets to boot compute partition state', action='store_true') + groupActionReset.add_argument('--resetnpsmode', help='Resets to boot NPS mode state', action='store_true') groupAction.add_argument('--setclock', help='Set Clock Frequency Level(s) for specified clock (requires manual Perf level)', metavar=('TYPE','LEVEL'), nargs=2) @@ -3317,7 +3403,7 @@ if __name__ == '__main__': or args.setpoweroverdrive or args.resetpoweroverdrive or args.rasenable or args.rasdisable or \ args.rasinject or args.gpureset or args.setperfdeterminism or args.setslevel or args.setmlevel or \ args.setvc or args.setsrange or args.setmrange or args.setclock or \ - 
args.setcomputepartition or args.setnpsmode: + args.setcomputepartition or args.setnpsmode or args.resetcomputepartition or args.resetnpsmode: relaunchAsSudo() # If there is one or more device specified, use that for all commands, otherwise use a @@ -3561,6 +3647,10 @@ if __name__ == '__main__': resetXgmiErr(deviceList) if args.resetperfdeterminism: resetPerfDeterminism(deviceList) + if args.resetcomputepartition: + resetComputePartition(deviceList) + if args.resetnpsmode: + resetNpsMode(deviceList) if args.rasenable: setRas(deviceList, 'enable', args.rasenable[0], args.rasenable[1]) if args.rasdisable: diff --git a/projects/rocm-smi-lib/rocm_smi/docs/ROCm_SMI_Manual.pdf b/projects/rocm-smi-lib/rocm_smi/docs/ROCm_SMI_Manual.pdf index 05cf23122e..523a91b2ba 100644 Binary files a/projects/rocm-smi-lib/rocm_smi/docs/ROCm_SMI_Manual.pdf and b/projects/rocm-smi-lib/rocm_smi/docs/ROCm_SMI_Manual.pdf differ diff --git a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc index a5001a47ef..a9fcd6c801 100755 --- a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc +++ b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc @@ -83,11 +83,13 @@ } \ } -#define CHK_RSMI_PERM_RET(RET) { \ +#define CHK_FILE_PERMISSIONS(RET) { \ if ((RET) == RSMI_STATUS_PERMISSION) { \ - std::cout << "This command requires root access." << std::endl; \ + if (isFileWritable(RET)) { \ + CHK_RSMI_RET(RET) \ + } \ } else { \ - CHK_RSMI_RET_I(RET) \ + CHK_RSMI_RET(RET) \ } \ } @@ -229,7 +231,7 @@ static bool isUserRunningAsSudo() { bool isRunningWithSudo = false; auto myUID = getuid(); auto myPrivledges = geteuid(); - if (myUID == myPrivledges) { + if ((myUID == myPrivledges) && (myPrivledges == 0)) { isRunningWithSudo = true; } return isRunningWithSudo; @@ -243,7 +245,6 @@ static bool isFileWritable(rsmi_status_t response) { // response situation. 
bool fileWritable = true; if (isUserRunningAsSudo() && (response == RSMI_STATUS_PERMISSION)) { - PRINT_RSMI_ERR(response) std::cout << "[WARN] User is running with sudo " << "permissions, file is not writable." << std::endl; fileWritable = false; @@ -511,18 +512,18 @@ static rsmi_status_t test_set_freq(uint32_t dv_ind) { " to 0b" << freq_bm_str << " ..." << std::endl; ret = rsmi_dev_gpu_clk_freq_set(dv_ind, rsmi_clk, freq_bitmask); - isFileWritable(ret); + CHK_FILE_PERMISSIONS(ret) ret = rsmi_dev_gpu_clk_freq_get(dv_ind, rsmi_clk, &f); - CHK_AND_PRINT_RSMI_ERR_RET(ret) + CHK_FILE_PERMISSIONS(ret) std::cout << "Frequency is now index " << f.current << std::endl; std::cout << "Resetting mask to all frequencies." << std::endl; ret = rsmi_dev_gpu_clk_freq_set(dv_ind, rsmi_clk, 0xFFFFFFFF); - isFileWritable(ret); + CHK_FILE_PERMISSIONS(ret) ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_AUTO); - isFileWritable(ret); + CHK_FILE_PERMISSIONS(ret) } std::cout << std::endl; return RSMI_STATUS_SUCCESS; @@ -576,15 +577,20 @@ static rsmi_status_t test_set_compute_partitioning(uint32_t dv_ind) { std::cout << std::endl << std::endl; } + std::cout << "About to initate compute partition reset..." << std::endl; + ret = rsmi_dev_compute_partition_reset(dv_ind); + CHK_RSMI_NOT_SUPPORTED_RET(ret) + std::cout << "Done resetting compute partition." << std::endl; + std::string myComputePartition = originalComputePartition; if (myComputePartition.empty() == false) { - std::cout << "Resetting compute partition to " << originalComputePartition - << "... " << std::endl; + std::cout << "Resetting back to original compute partition to " + << originalComputePartition << "... 
" << std::endl; rsmi_compute_partition_type origComputePartitionType = mapStringToRSMIComputePartitionTypes[originalComputePartition]; + ret = rsmi_dev_compute_partition_set(dv_ind, origComputePartitionType); CHK_RSMI_NOT_SUPPORTED_RET(ret) std::cout << "Done" << std::endl; - ret = rsmi_dev_compute_partition_set(dv_ind, origComputePartitionType); } return RSMI_STATUS_SUCCESS; } @@ -629,15 +635,20 @@ static rsmi_status_t test_set_nps_mode(uint32_t dv_ind) { std::cout << std::endl << std::endl; } + std::cout << "About to initate nps mode reset..." << std::endl; + ret = rsmi_dev_nps_mode_reset(dv_ind); + CHK_RSMI_NOT_SUPPORTED_RET(ret) + std::cout << "Done resetting nps mode." << std::endl; + std::string myNpsMode = originalNpsMode; if (myNpsMode.empty() == false) { std::cout << "Resetting compute partition to " << originalNpsMode << "... " << std::endl; rsmi_nps_mode_type_t origNpsModeType = mapStringToRSMINpsModeTypes[originalNpsMode]; + ret = rsmi_dev_nps_mode_set(dv_ind, origNpsModeType); CHK_RSMI_NOT_SUPPORTED_RET(ret) std::cout << "Done" << std::endl; - ret = rsmi_dev_nps_mode_set(dv_ind, origNpsModeType); } return RSMI_STATUS_SUCCESS; } @@ -664,10 +675,6 @@ int main() { CHK_RSMI_RET_I(ret) std::cout << "\t**Device ID: 0x" << std::hex << val_ui64 << std::endl; - std::cout << std::endl << std::endl; - std::cout << "Starting to call " - << "rsmi_dev_compute_partition_get()..." - << std::endl; char current_compute_partition[256]; current_compute_partition[0] = '\0'; ret = rsmi_dev_compute_partition_get(i, current_compute_partition, 256); @@ -679,10 +686,6 @@ int main() { ? "UNKNOWN" : current_compute_partition) << std::endl; - std::cout << std::endl << std::endl; - std::cout << "Starting to call " - << "rsmi_dev_nps_mode_get()..." 
- << std::endl; uint32_t len = 5; char nps_mode[len]; nps_mode[0] = '\0'; @@ -764,6 +767,12 @@ int main() { } std::cout << "***** Testing write api's" << std::endl; + if (isUserRunningAsSudo() == false) { + std::cout << "Write APIs require users to execute with sudo. " + << "Cannot proceed." << std::endl; + return 0; + } + for (uint32_t i = 0; i< num_monitor_devs; ++i) { ret = test_set_overdrive(i); CHK_AND_PRINT_RSMI_ERR_RET(ret) diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index 645e1f6e6d..5bf50f2b87 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -3986,6 +3986,47 @@ rsmi_dev_nps_mode_get(uint32_t dv_ind, char *nps_mode, CATCH } +rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind) { + TRY + REQUIRE_ROOT_ACCESS + DEVICE_MUTEX + GET_DEV_FROM_INDX + rsmi_status_t ret = RSMI_STATUS_NOT_SUPPORTED; + // read temp file + std::string bootState = + dev->readBootPartitionState(dv_ind); + // Initiate reset + // If bootState is UNKNOWN or empty (no readable boot state file), we cannot + // reset - return RSMI_STATUS_NOT_SUPPORTED + if (!bootState.empty() && bootState != "UNKNOWN") { + rsmi_compute_partition_type_t compute_partition = + mapStringToRSMIComputePartitionTypes[bootState]; + ret = rsmi_dev_compute_partition_set(dv_ind, compute_partition); + } + return ret; + CATCH +} + +rsmi_status_t rsmi_dev_nps_mode_reset(uint32_t dv_ind) { + TRY + REQUIRE_ROOT_ACCESS + DEVICE_MUTEX + GET_DEV_FROM_INDX + rsmi_status_t ret = RSMI_STATUS_NOT_SUPPORTED; + // read temp file + std::string bootState = + dev->readBootPartitionState(dv_ind); + // Initiate reset + // If bootState is UNKNOWN or empty (no readable boot state file), we cannot + // reset - return RSMI_STATUS_NOT_SUPPORTED + if (!bootState.empty() && bootState != "UNKNOWN") { + rsmi_nps_mode_type_t nps_mode = mapStringToNPSModeTypes[bootState]; + ret = rsmi_dev_nps_mode_set(dv_ind, nps_mode); + } + return ret; + CATCH +} + enum iterator_handle_type { FUNC_ITER = 
0, VARIANT_ITER, diff --git a/projects/rocm-smi-lib/src/rocm_smi_device.cc b/projects/rocm-smi-lib/src/rocm_smi_device.cc index 3c9fba6287..e8e89aa9cb 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_device.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_device.cc @@ -48,6 +48,7 @@ #include #include #include + #include #include #include @@ -1146,6 +1147,128 @@ rsmi_status_t Device::restartAMDGpuDriver(void) { RSMI_STATUS_AMDGPU_RESTART_ERR); } +template rsmi_status_t storeParameter(uint32_t dv_ind); + +// Stores parameters depending on which rsmi type is provided. +// Uses template specialization, to restrict types to identify +// calls needed to complete the function. +// typename - restricted to +// rsmi_compute_partition_type_t or rsmi_nps_mode_type_t +// dv_ind - device index +// returns - RSMI_STATUS_SUCCESS if already stored or stored now; else an error +template <> +rsmi_status_t storeParameter(uint32_t dv_ind) { + rsmi_status_t returnStatus = RSMI_STATUS_SUCCESS; + bool doesFileExist; + std::tie(doesFileExist, std::ignore) = readTmpFile(dv_ind, "boot", + "compute_partition"); + // if temporary file exists -> we do not need to store anything new + // if not, read & store the state value + if (doesFileExist) { + return returnStatus; + } + const uint32_t length = 128; + char data[length]; + rsmi_status_t ret = rsmi_dev_compute_partition_get(dv_ind, data, length); + rsmi_status_t storeRet; + + if (ret == RSMI_STATUS_SUCCESS) { + storeRet = storeTmpFile(dv_ind, "compute_partition", "boot", data); + } else if (ret == RSMI_STATUS_NOT_SUPPORTED) { + // not supported is ok + storeRet = storeTmpFile(dv_ind, "compute_partition", "boot", "UNKNOWN"); + } else { + storeRet = storeTmpFile(dv_ind, "compute_partition", "boot", "UNKNOWN"); + returnStatus = ret; + } + + if (storeRet != RSMI_STATUS_SUCCESS) { + // file storage err takes precedence over other errors + returnStatus = storeRet; + } + return returnStatus; +} + +// Stores parameters depending on which rsmi type is provided. 
+// Uses template specialization, to restrict types to identify +// calls needed to complete the function. +// typename - restricted to +// rsmi_compute_partition_type_t or rsmi_nps_mode_type_t +// dv_ind - device index +// returns - RSMI_STATUS_SUCCESS if already stored or stored now; else an error +template <> rsmi_status_t storeParameter(uint32_t dv_ind) { + rsmi_status_t returnStatus = RSMI_STATUS_SUCCESS; + const uint32_t length = 128; + char data[length]; + bool doesFileExist; + std::tie(doesFileExist, std::ignore) = readTmpFile(dv_ind, "boot", + "nps_mode"); + // if temporary file exists -> we do not need to store anything new + // if not, read & store the state value + if (doesFileExist) { + return returnStatus; + } + rsmi_status_t ret = rsmi_dev_nps_mode_get(dv_ind, data, length); + rsmi_status_t storeRet; + + if (ret == RSMI_STATUS_SUCCESS) { + storeRet = storeTmpFile(dv_ind, "nps_mode", "boot", data); + } else if (ret == RSMI_STATUS_NOT_SUPPORTED) { + // not supported is ok + storeRet = storeTmpFile(dv_ind, "nps_mode", "boot", "UNKNOWN"); + } else { + storeRet = storeTmpFile(dv_ind, "nps_mode", "boot", "UNKNOWN"); + returnStatus = ret; + } + + if (storeRet != RSMI_STATUS_SUCCESS) { + // file storage err takes precedence over other errors + returnStatus = storeRet; + } + return returnStatus; +} + +rsmi_status_t Device::storeDevicePartitions(uint32_t dv_ind) { + rsmi_status_t returnStatus = RSMI_STATUS_SUCCESS; + returnStatus = storeParameter(dv_ind); + rsmi_status_t npsRet = storeParameter(dv_ind); + if (returnStatus == RSMI_STATUS_SUCCESS) { // only record earliest error + returnStatus = npsRet; + } + return returnStatus; +} + +// Reads a device's boot partition state, depending on which rsmi type is +// provided and device index. +// Uses template specialization, to restrict types to identify +// calls needed to complete the function. 
+// typename - restricted to rsmi_compute_partition_type_t +// or rsmi_nps_mode_type_t +// dv_ind - device index +template <> +std::string Device::readBootPartitionState( + uint32_t dv_ind) { + std::string boot_state; + std::tie(std::ignore, boot_state) = readTmpFile(dv_ind, "boot", + "compute_partition"); + return boot_state; +} + +// Reads a device's boot partition state, depending on which rsmi type is +// provided and device index. +// Uses template specialization, to restrict types to identify +// calls needed to complete the function. +// typename - restricted to rsmi_compute_partition_type_t +// or rsmi_nps_mode_type_t +// dv_ind - device index +template <> +std::string Device::readBootPartitionState( + uint32_t dv_ind) { + std::string boot_state; + std::tie(std::ignore, boot_state) = readTmpFile(dv_ind, "boot", "nps_mode"); + return boot_state; +} + #undef RET_IF_NONZERO } // namespace smi } // namespace amd diff --git a/projects/rocm-smi-lib/src/rocm_smi_main.cc b/projects/rocm-smi-lib/src/rocm_smi_main.cc index ba3649f66b..187b25241e 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_main.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_main.cc @@ -373,6 +373,7 @@ RocmSMI::Initialize(uint64_t flags) { // 1. construct kfd_node_map_ with gpu_id as key and *Device as value // 2. for each kfd node, write the corresponding dv_ind // 3. for each amdgpu device, write the corresponding gpu_id + // 4. 
for each amdgpu device, attempt to store its boot partition for (uint32_t dv_ind = 0; dv_ind < devices_.size(); ++dv_ind) { dev = devices_[dv_ind]; uint64_t bdfid = dev->bdfid(); @@ -387,7 +388,12 @@ RocmSMI::Initialize(uint64_t flags) { uint64_t gpu_id = tmp_map[bdfid]->gpu_id(); dev->set_kfd_gpu_id(gpu_id); kfd_node_map_[gpu_id] = tmp_map[bdfid]; + + // store each device boot partition state, if file doesn't exist + dev->storeDevicePartitions(dv_ind); } + // Leaving below to help debug temp file issues + // displayAppTmpFilesContent(); } void diff --git a/projects/rocm-smi-lib/src/rocm_smi_utils.cc b/projects/rocm-smi-lib/src/rocm_smi_utils.cc index 8c9fe39567..3a2443ddd4 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_utils.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_utils.cc @@ -43,6 +43,8 @@ #include #include #include +#include +#include #include #include @@ -61,6 +63,7 @@ namespace amd { namespace smi { +const std::string kTmpFilePrefix = "rocmsmi_"; // Return 0 if same file, 1 if not, and -1 for error int SameFile(const std::string fileA, const std::string fileB) { @@ -298,5 +301,140 @@ std::pair executeCommand(std::string command, bool stdOut) { return std::make_pair(successfulRun, stdoutAndErr); } +// originalString - string to search for substring +// substring - string looking to find +bool containsString(std::string originalString, std::string substring) { + if (originalString.find(substring) != std::string::npos) { + return true; + } else { + return false; + } +} + +// Creates and stores supplied data into a temporary file (within /tmp/). +// All temporary files are removed upon reboot. +// Allows all users/groups to read the temporary file. 
+// +// For more detail, refer to mkstemp manpage: +// https://man7.org/linux/man-pages/man3/mkstemp.3.html +// +// Temporary file name format: +// [app prefix][state name]_[parameter name]_[device id].XXXXXX +// [app prefix] - prefix for our application's identifier (see kTmpFilePrefix) +// [parameter name] - name of parameter being stored +// [state name] - state at which the stored value captures +// [device id] - device identifier +// +// dv_ind - device index +// parameterName - name of parameter stored +// stateName - state at which the stored value captures +// storageData - string value of data to be stored +rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName, + std::string stateName, std::string storageData) { + // Required tags needed to store our files + // Files name format: + // [app prefix][state name]_[parameter name]_[device id] + std::string fullFileName = kTmpFilePrefix + stateName + "_" + + parameterName + "_" + std::to_string(dv_ind); + bool doesFileExist; + std::tie(doesFileExist, std::ignore) = + readTmpFile(dv_ind, stateName, parameterName); + if (doesFileExist) { + // do not store, if file already exists + return RSMI_STATUS_SUCCESS; + } + // template for our file + std::string fullTempFilePath = "/tmp/" + fullFileName + ".XXXXXX"; + char *fileName = &fullTempFilePath[0]; + int fd = mkstemp(fileName); + if (fd == -1) { + return RSMI_STATUS_FILE_ERROR; + } + // fchmod acts on the descriptor we own, so the path is not resolved again + fchmod(fd, S_IRUSR|S_IRGRP|S_IROTH); + ssize_t written = write(fd, storageData.c_str(), storageData.size()); + close(fd); + return (written < 0) ? RSMI_STATUS_FILE_ERROR : RSMI_STATUS_SUCCESS; +} + +std::vector getListOfAppTmpFiles() { + std::string path = "/tmp"; + DIR *dir; + struct dirent *ent; + std::vector tmpFiles; + + if ((dir = opendir(path.c_str())) != nullptr) { + // captures all files & directories under specified path + while ((ent = readdir(dir)) != nullptr) { + std::string fileDirName = ent->d_name; + // we only want our app specific files + if (containsString(fileDirName, kTmpFilePrefix)) { + tmpFiles.emplace_back(path + "/" + fileDirName); + } + } + // release the directory stream so the descriptor is not leaked + closedir(dir); + } + return tmpFiles; +} + +// Reads a temporary file in path provided +// If file does not 
exist, returns an empty string +// If file exists, returns its first whitespace-delimited token (may be empty) +std::string readTemporaryFile(std::string path) { + std::string fileContent; + std::ifstream inFileStream(path); + if (inFileStream.is_open()) { + inFileStream >> fileContent; + } + return fileContent; +} + +// Used to debug application temporary files (identified by kTmpFilePrefix) +// and their content +void displayAppTmpFilesContent() { + std::vector tmpFiles = getListOfAppTmpFiles(); + if (tmpFiles.empty() == false) { + for (auto &x: tmpFiles) { + std::string out = readTemporaryFile(x); + std::cout << __PRETTY_FUNCTION__ << " | Temporary file: " << x + << "; Contained content: " << out << std::endl; + } + } else { + std::cout << __PRETTY_FUNCTION__ << " | No temporary files were found" + << std::endl; + } +} + +// Attempts to read application specific temporary file +// This method is to be used for reading (or determining if it exists), +// in order to keep file naming scheme consistent. 
+// +// dv_ind - device index +// parameterName - name of parameter stored +// stateName - state at which the stored value captures +// Returns: +// boolean - if temporary file exists +// string - content of temporary file, if it exists (otherwise, an empty +// string is returned) +std::tuple readTmpFile(uint32_t dv_ind, + std::string stateName, + std::string parameterName) { + bool fileExists = false; + std::string tmpFileName = kTmpFilePrefix + stateName + "_" +parameterName + + "_" + std::to_string(dv_ind); + std::string fileContent; + std::vector tmpFiles = getListOfAppTmpFiles(); + // Match the '.' that separates the name from the mkstemp suffix, so that + // device index 1 does not also match files stored for index 10, 11, ... + for (auto &x: tmpFiles) { + if (containsString(x, tmpFileName + ".")) { + fileContent = readTemporaryFile(x); + fileExists = true; + break; + } + } + return std::make_tuple(fileExists, fileContent); +} } // namespace smi } // namespace amd diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc index 2e71322270..660866df68 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc @@ -269,13 +269,13 @@ void TestComputePartitionReadWrite::Run(void) { << computePartitionString(new_computePartition) << " ===============" << std::endl; } - ret = rsmi_dev_compute_partition_set(dv_ind, new_computePartition); - CHK_ERR_ASRT(ret) IF_VERB(STANDARD) { std::cout << "\t**" << "Attempting to set compute partition to: " << computePartitionString(new_computePartition) << std::endl; } + ret = rsmi_dev_compute_partition_set(dv_ind, new_computePartition); + CHK_ERR_ASRT(ret) ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition, 255); CHK_ERR_ASRT(ret) @@ -290,6 +290,46 @@ void TestComputePartitionReadWrite::Run(void) { current_char_computePartition); } + /* TEST RETURN TO BOOT COMPUTE PARTITION 
SETTING */ + IF_VERB(STANDARD) { + std::cout << std::endl; + std::cout << "\t**" + << "=========== TEST RETURN TO BOOT COMPUTE PARTITION SETTING " + << "========" << std::endl; + } + std::string oldPartition = current_char_computePartition; + bool wasResetSuccess = false; + ret = rsmi_dev_compute_partition_reset(dv_ind); + ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) || + (ret == RSMI_STATUS_NOT_SUPPORTED)); + if (ret == RSMI_STATUS_SUCCESS) { + wasResetSuccess = true; + } + ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition, + 255); + CHK_ERR_ASRT(ret) + IF_VERB(STANDARD) { + std::cout << "\t**" + << "Current compute partition: " << current_char_computePartition << std::endl; + } + if (wasResetSuccess) { + ASSERT_STRNE(oldPartition.c_str(), current_char_computePartition); + IF_VERB(STANDARD) { + std::cout << "\t**" + << "Confirmed prior partition (" << oldPartition << ") is not " + << "equal to current partition (" + << current_char_computePartition << ")" << std::endl; + } + } else { + ASSERT_STREQ(oldPartition.c_str(), current_char_computePartition); + IF_VERB(STANDARD) { + std::cout << "\t**" + << "Confirmed prior partition (" << oldPartition << ") is equal" + << " to current partition (" + << current_char_computePartition << ")" << std::endl; + } + } + /* TEST RETURN TO ORIGINAL COMPUTE PARTITIONING SETTING */ IF_VERB(STANDARD) { std::cout << std::endl; diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/npsmode_read_write.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/npsmode_read_write.cc index e2356ea5f4..f5bcabf048 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/npsmode_read_write.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/npsmode_read_write.cc @@ -221,11 +221,11 @@ void TestNPSModeReadWrite::Run(void) { EXPECT_TRUE((err == RSMI_STATUS_INVALID_ARGS) || (err == RSMI_STATUS_NOT_SUPPORTED) || (err == RSMI_STATUS_PERMISSION)); - if (err == RSMI_STATUS_INVALID_ARGS) { - 
IF_VERB(STANDARD) { - std::cout << "\t**" - << "Confirmed RSMI_STATUS_INVALID_ARGS was returned." - << std::endl; + if (err == RSMI_STATUS_INVALID_ARGS) { + IF_VERB(STANDARD) { + std::cout << "\t**" + << "Confirmed RSMI_STATUS_INVALID_ARGS was returned." + << std::endl; } else if (err == RSMI_STATUS_PERMISSION) { DISPLAY_RSMI_ERR(err) // tests should not continue if err is a permission issue @@ -251,13 +251,14 @@ void TestNPSModeReadWrite::Run(void) { << npsModeString(new_nps_mode) << " ===============" << std::endl; } - ret = rsmi_dev_nps_mode_set(dv_ind, new_nps_mode); - CHK_ERR_ASRT(ret) IF_VERB(STANDARD) { std::cout << "\t**" << "Attempting to set nps mode to: " << npsModeString(new_nps_mode) << std::endl; } + ret = rsmi_dev_nps_mode_set(dv_ind, new_nps_mode); + CHK_ERR_ASRT(ret) + ret = rsmi_dev_nps_mode_get(dv_ind, current_nps_mode, 255); CHK_ERR_ASRT(ret) IF_VERB(STANDARD) { @@ -268,6 +269,45 @@ void TestNPSModeReadWrite::Run(void) { EXPECT_STREQ(npsModeString(new_nps_mode).c_str(), current_nps_mode); } + /* TEST RETURN TO BOOT NPS MODE SETTING */ + IF_VERB(STANDARD) { + std::cout << std::endl; + std::cout << "\t**" + << "=========== TEST RETURN TO BOOT NPS MODE SETTING " + << "========" << std::endl; + } + std::string oldMode = current_nps_mode; + bool wasResetSuccess = false; + ret = rsmi_dev_nps_mode_reset(dv_ind); + ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) || + (ret == RSMI_STATUS_NOT_SUPPORTED)); + if (ret == RSMI_STATUS_SUCCESS) { + wasResetSuccess = true; + } + ret = rsmi_dev_nps_mode_get(dv_ind, current_nps_mode, 255); + CHK_ERR_ASRT(ret) + IF_VERB(STANDARD) { + std::cout << "\t**" + << "Current nps mode: " << current_nps_mode << std::endl; + } + if (wasResetSuccess) { + ASSERT_STRNE(oldMode.c_str(), current_nps_mode); + IF_VERB(STANDARD) { + std::cout << "\t**" + << "Confirmed prior nps mode (" << oldMode << ") is not " + << "equal to current nps mode (" + << current_nps_mode << ")" << std::endl; + } + } else { + ASSERT_STREQ(oldMode.c_str(), 
current_nps_mode); + IF_VERB(STANDARD) { + std::cout << "\t**" + << "Confirmed prior nps mode (" << oldMode << ") is equal" + << " to current nps mode (" + << current_nps_mode << ")" << std::endl; + } + } + /* TEST RETURN TO ORIGINAL NPS MODE SETTING */ IF_VERB(STANDARD) { std::cout << std::endl; @@ -277,18 +317,18 @@ void TestNPSModeReadWrite::Run(void) { } new_nps_mode = mapStringToRSMINpsModeTypes.at(orig_nps_mode); - ret = rsmi_dev_nps_mode_set(dv_ind, new_nps_mode); - CHK_ERR_ASRT(ret) IF_VERB(STANDARD) { std::cout << "\t**" << "Returning nps mode to: " << npsModeString(new_nps_mode) << std::endl; } + ret = rsmi_dev_nps_mode_set(dv_ind, new_nps_mode); + CHK_ERR_ASRT(ret) ret = rsmi_dev_nps_mode_get(dv_ind, current_nps_mode, 255); CHK_ERR_ASRT(ret) IF_VERB(STANDARD) { std::cout << "\t**" << "Attempted to set nps mode: " << npsModeString(new_nps_mode) << std::endl - << "\t**" << "Current compute partition: " << current_nps_mode + << "\t**" << "Current nps mode: " << current_nps_mode << std::endl; } EXPECT_EQ(RSMI_STATUS_SUCCESS, ret);