diff --git a/projects/rocm-smi-lib/CHANGELOG.md b/projects/rocm-smi-lib/CHANGELOG.md index 56097897a8..14db5ea8ed 100644 --- a/projects/rocm-smi-lib/CHANGELOG.md +++ b/projects/rocm-smi-lib/CHANGELOG.md @@ -30,7 +30,15 @@ Users can now view GPU metrics from our new `rocm-smi --showmetrics`. Unlike AMD ### Removals -- N/A +- **Removed `--resetcomputepartition` and `--resetmemorypartition` options and associated APIs**. + - This change is part of the partition feature redesign. + - The related APIs `rsmi_dev_compute_partition_reset()` and `rsmi_dev_memory_partition_reset()` have been removed as well. + +- **Temporarily disabled C++ tests for `memorypartition_read_write`**. + - This change is part of the partition feature redesign. + - SMI's workflow needs to be adjusted in order to accommodate incoming driver changes to enable + the dynamic memory partition feature. We plan on re-enabling testing for this feature during ROCm + 6.4. ### Optimizations @@ -47,7 +55,11 @@ Users can now view GPU metrics from our new `rocm-smi --showmetrics`. Unlike AMD ### Upcoming changes -- N/A +- **Re-enable C++ tests for `memorypartition_read_write`**. + - This change is part of the partition feature redesign. + - SMI's workflow needs to be adjusted in order to accommodate incoming driver changes to enable + the dynamic memory partition feature. We plan on re-enabling testing for this feature during ROCm + 6.4. ## rocm_smi_lib for ROCm 6.2.1 diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index bd56f887ee..54c9624743 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -4116,25 +4116,6 @@ rsmi_status_t rsmi_dev_compute_partition_set(uint32_t dv_ind, rsmi_compute_partition_type_t compute_partition); -/** - * @brief Reverts a selected device's compute partition setting back to its - * boot state. 
- * - * @details Given a device index @p dv_ind , this function will attempt to - * revert its compute partition setting back to its boot state. - * - * @param[in] dv_ind a device index - * - * @retval ::RSMI_STATUS_SUCCESS call was successful - * @retval ::RSMI_STATUS_PERMISSION function requires root access - * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not - * support this function - * @retval ::RSMI_STATUS_BUSY A resource or mutex could not be acquired - * because it is already being used - device is busy - * - */ -rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind); - /** * @brief Retrieves the partition_id for a desired device * @@ -4227,27 +4208,6 @@ rsmi_status_t rsmi_dev_memory_partition_set(uint32_t dv_ind, rsmi_memory_partition_type_t memory_partition); -/** - * @brief Reverts a selected device's memory partition setting back to its - * boot state. - * - * @details Given a device index @p dv_ind , this function will attempt to - * revert its current memory partition setting back to its boot state. 
- * - * @param[in] dv_ind a device index - * - * @retval ::RSMI_STATUS_SUCCESS call was successful - * @retval ::RSMI_STATUS_PERMISSION function requires root access - * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not - * support this function - * @retval ::RSMI_STATUS_AMDGPU_RESTART_ERR could not successfully restart - * the amdgpu driver - * @retval ::RSMI_STATUS_BUSY A resource or mutex could not be acquired - * because it is already being used - device is busy - * - */ -rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind); - /** @} */ // end of memory_partition /*****************************************************************************/ diff --git a/projects/rocm-smi-lib/python_smi_tools/README.md b/projects/rocm-smi-lib/python_smi_tools/README.md index a03dd90078..81d9175ba0 100644 --- a/projects/rocm-smi-lib/python_smi_tools/README.md +++ b/projects/rocm-smi-lib/python_smi_tools/README.md @@ -15,8 +15,8 @@ LD_LIBRARY_PATH should be set to the folder containing librocm_smi64. ## Version -The SMI will report two "versions", ROCM-SMI version and other is ROCM-SMI-LIB version. -- ROCM-SMI version is the CLI/tool version number with commit ID appended after + sign. +The SMI will report two "versions", ROCM-SMI version and other is ROCM-SMI-LIB version. +- ROCM-SMI version is the CLI/tool version number with commit ID appended after + sign. - ROCM-SMI-LIB version is the library package version number. 
``` ROCM-SMI version: 2.0.0+8e78352 @@ -44,7 +44,7 @@ usage: rocm-smi [-h] [-V] [-d DEVICE [DEVICE ...]] [--alldevices] [--showhw] [-a [--showtopoaccess] [--showtopoweight] [--showtopohops] [--showtopotype] [--showtoponuma] [--showenergycounter] [--shownodesbw] [--showcomputepartition] [--showmemorypartition] [-r] [--resetfans] [--resetprofile] [--resetpoweroverdrive] [--resetxgmierr] [--resetperfdeterminism] - [--resetcomputepartition] [--resetmemorypartition] [--setclock TYPE LEVEL] [--setsclk LEVEL [LEVEL ...]] + [--setclock TYPE LEVEL] [--setsclk LEVEL [LEVEL ...]] [--setmclk LEVEL [LEVEL ...]] [--setpcie LEVEL [LEVEL ...]] [--setslevel SCLKLEVEL SCLK SVOLT] [--setmlevel MCLKLEVEL MCLK MVOLT] [--setvc POINT SCLK SVOLT] [--setsrange SCLKMIN SCLKMAX] [--setextremum min|max sclk|mclk CLK] [--setmrange MCLKMIN MCLKMAX] [--setfan LEVEL] @@ -185,8 +185,7 @@ Reset options: state --resetxgmierr Reset XGMI error count --resetperfdeterminism Disable performance determinism - --resetcomputepartition Resets to boot compute partition state - --resetmemorypartition Resets to boot memory partition state + Auto-response options: --autorespond RESPONSE Response to automatically provide for all prompts @@ -200,8 +199,8 @@ Output options: ``` ## Detailed Option Descriptions -`--setextremum ` -Provided ASIC support, users can now set a maximum or minimum sclk or mclk value through our Python CLI tool (`rocm-smi --setextremum max sclk 1500`). See example below. +`--setextremum ` +Provided ASIC support, users can now set a maximum or minimum sclk or mclk value through our Python CLI tool (`rocm-smi --setextremum max sclk 1500`). See example below. 
```shell $ sudo /opt/rocm/bin/rocm-smi --setextremum max sclk 2100 diff --git a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py index 93435fbaaa..d0eec6a503 100755 --- a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py +++ b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py @@ -1158,72 +1158,6 @@ def resetPerfDeterminism(deviceList): printLogSpacer() -def resetComputePartition(deviceList): - """ Reset Compute Partition to its boot state - - :param deviceList: List of DRM devices (can be a single-item list) - """ - printLogSpacer(" Reset compute partition to its boot state ") - for device in deviceList: - originalPartition = getComputePartition(device) - ret = rocmsmi.rsmi_dev_compute_partition_reset(device) - if rsmi_ret_ok(ret, device, 'reset_compute_partition', silent=True): - resetBootState = getComputePartition(device) - printLog(device, "Successfully reset compute partition (" + - originalPartition + ") to boot state (" + resetBootState + - ")", None) - elif ret == rsmi_status_t.RSMI_STATUS_PERMISSION: - printLog(device, 'Permission denied', None) - elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: - printLog(device, 'Not supported on the given system', None) - elif ret == rsmi_status_t.RSMI_STATUS_BUSY: - printLog(device, 'Device is currently busy, try again later', - None) - else: - rsmi_ret_ok(ret, device, 'reset_compute_partition') - printErrLog(device, 'Failed to reset the compute partition to boot state') - printLogSpacer() - - -def resetMemoryPartition(deviceList): - """ Reset current memory partition to its boot state - - :param deviceList: List of DRM devices (can be a single-item list) - """ - printLogSpacer(" Reset memory partition to its boot state ") - for device in deviceList: - originalPartition = getMemoryPartition(device) - t1 = multiprocessing.Process(target=showProgressbar, - args=("Resetting memory partition",13,)) - t1.start() - addExtraLine=True - start=time.time() - ret = 
rocmsmi.rsmi_dev_memory_partition_reset(device) - stop=time.time() - duration=stop-start - if t1.is_alive(): - t1.terminate() - t1.join() - if duration < float(0.1): # For longer runs, add extra line before output - addExtraLine=False # This is to prevent overriding progress bar - if rsmi_ret_ok(ret, device, 'reset_memory_partition', silent=True): - resetBootState = getMemoryPartition(device) - printLog(device, "Successfully reset memory partition (" + - originalPartition + ") to boot state (" + - resetBootState + ")", None, addExtraLine) - elif ret == rsmi_status_t.RSMI_STATUS_PERMISSION: - printLog(device, 'Permission denied', None, addExtraLine) - elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: - printLog(device, 'Not supported on the given system', None, addExtraLine) - elif ret == rsmi_status_t.RSMI_STATUS_BUSY: - printLog(device, 'Device is currently busy, try again later', - None) - else: - rsmi_ret_ok(ret, device, 'reset_memory_partition') - printErrLog(device, 'Failed to reset memory partition to boot state') - printLogSpacer() - - def setClockRange(deviceList, clkType, minvalue, maxvalue, autoRespond): """ Set the range for the specified clktype in the PowerPlay table for a list of devices. 
@@ -4240,8 +4174,6 @@ if __name__ == '__main__': action='store_true') groupActionReset.add_argument('--resetxgmierr', help='Reset XGMI error count', action='store_true') groupActionReset.add_argument('--resetperfdeterminism', help='Disable performance determinism', action='store_true') - groupActionReset.add_argument('--resetcomputepartition', help='Resets to boot compute partition state', action='store_true') - groupActionReset.add_argument('--resetmemorypartition', help='Resets to boot memory partition state', action='store_true') groupAction.add_argument('--setclock', help='Set Clock Frequency Level(s) for specified clock (requires manual Perf level)', metavar=('TYPE','LEVEL'), nargs=2) @@ -4329,7 +4261,7 @@ if __name__ == '__main__': or args.setpoweroverdrive or args.resetpoweroverdrive or args.rasenable or args.rasdisable or \ args.rasinject or args.gpureset or args.setperfdeterminism or args.setslevel or args.setmlevel or \ args.setvc or args.setsrange or args.setextremum or args.setmrange or args.setclock or \ - args.setcomputepartition or args.setmemorypartition or args.resetcomputepartition or args.resetmemorypartition: + args.setcomputepartition or args.setmemorypartition: relaunchAsSudo() # If there is one or more device specified, use that for all commands, otherwise use a @@ -4579,10 +4511,6 @@ if __name__ == '__main__': resetXgmiErr(deviceList) if args.resetperfdeterminism: resetPerfDeterminism(deviceList) - if args.resetcomputepartition: - resetComputePartition(deviceList) - if args.resetmemorypartition: - resetMemoryPartition(deviceList) if args.rasenable: setRas(deviceList, 'enable', args.rasenable[0], args.rasenable[1]) if args.rasdisable: diff --git a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc index 93b004b083..e1528739b6 100755 --- a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc +++ b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc @@ -652,11 
+652,6 @@ static rsmi_status_t test_set_compute_partitioning(uint32_t dv_ind) { std::cout << "\n" << "\n"; } - std::cout << "About to initate compute partition reset..." << "\n"; - ret = rsmi_dev_compute_partition_reset(dv_ind); - CHK_RSMI_NOT_SUPPORTED_RET(ret) - std::cout << "Done resetting compute partition." << "\n"; - std::string myComputePartition = originalComputePartition; if (myComputePartition.empty() == false) { std::cout << "Resetting back to original compute partition to " @@ -709,11 +704,6 @@ static rsmi_status_t test_set_memory_partition(uint32_t dv_ind) { << "." << "\n\n\n"; } - std::cout << "About to initate memory partition reset...\n"; - ret = rsmi_dev_memory_partition_reset(dv_ind); - CHK_RSMI_NOT_SUPPORTED_RET(ret) - std::cout << "Done resetting memory partition.\n"; - std::string myMemPart = originalMemoryPartition; if (myMemPart.empty() == false) { std::cout << "Resetting memory partition to " << originalMemoryPartition diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index 1baef2def5..274fa3c113 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -758,7 +758,7 @@ rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) { * Add domain to full pci_id: * BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((PARTITION_ID & 0xF) << 28) | * ((BUS & 0xFF) << 8) | ((DEVICE & 0x1F) <<3 ) | (FUNCTION & 0x7) - * + * * bits [63:32] = domain * bits [31:28] or bits [2:0] = partition id * bits [27:16] = reserved @@ -5099,84 +5099,6 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition, CATCH } -rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind) { - TRY - std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; - LOG_TRACE(ss); - REQUIRE_ROOT_ACCESS - DEVICE_MUTEX - GET_DEV_FROM_INDX - rsmi_status_t ret = RSMI_STATUS_NOT_SUPPORTED; - - // Only use 1st index, rest are there in-case of future issues - // NOTE: 
Partitions sets cause rocm-smi indexes to fluctuate - // since the nodes are grouped in respect to primary node - why we only use - // 1st node/device id to reset - std::string bootState = - dev->readBootPartitionState(0); - - // Initiate reset - // If bootState is UNKNOWN, we cannot reset - return RSMI_STATUS_NOT_SUPPORTED - // Likely due to device not supporting it - if (bootState != "UNKNOWN") { - rsmi_compute_partition_type_t compute_partition = - mapStringToRSMIComputePartitionTypes.at(bootState); - ret = rsmi_dev_compute_partition_set(dv_ind, compute_partition); - } - ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Success - if original boot state was not unknown or valid setting" - << " | Device #: " << dv_ind - << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevComputePartition) - << " | Data: " << bootState - << " | Returning = " - << getRSMIStatusString(ret) << " |"; - LOG_TRACE(ss); - return ret; - CATCH -} - -rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind) { - TRY - std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start =======, " << dv_ind; - LOG_TRACE(ss); - REQUIRE_ROOT_ACCESS - DEVICE_MUTEX - GET_DEV_FROM_INDX - rsmi_status_t ret = RSMI_STATUS_NOT_SUPPORTED; - - // Only use 1st index, rest are there in-case of future issues - // NOTE: Partitions sets cause rocm-smi indexes to fluctuate. 
- // Since the nodes are grouped in respect to primary node - why we only use - // 1st node/device id to reset - std::string bootState = - dev->readBootPartitionState(0); - - // Initiate reset - // If bootState is UNKNOWN, we cannot reset - return RSMI_STATUS_NOT_SUPPORTED - // Likely due to device not supporting it - if (bootState != "UNKNOWN") { - rsmi_memory_partition_type_t memory_partition = - mapStringToMemoryPartitionTypes.at(bootState); - ret = rsmi_dev_memory_partition_set(dv_ind, memory_partition); - } - ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Success - if original boot state was not unknown or valid setting" - << " | Device #: " << dv_ind - << " | Type: " - << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition) - << " | Data: " << bootState - << " | Returning = " - << getRSMIStatusString(ret) << " |"; - LOG_TRACE(ss); - return ret; - CATCH -} - rsmi_status_t rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) { TRY diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc index 0170ae6f8d..2863f33843 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc @@ -428,7 +428,7 @@ void TestComputePartitionReadWrite::Run(void) { * [0:SPX, 1:CPX, 2:CPX, 3:CPX, 4:CPX, 5:CPX, 6:SPX, 7:SPX] <- set 1 to CPX * [0:SPX, 1:SPX, 2:SPX, 3:SPX] <- reset(1) * ... 
- * + * */ std::string final_partition_state = "UNKNOWN"; @@ -564,29 +564,29 @@ void TestComputePartitionReadWrite::Run(void) { << "========" << std::endl; } std::string oldPartition = current_char_computePartition; - bool wasResetSuccess = false; - ret = rsmi_dev_compute_partition_reset(dv_ind); - IF_VERB(STANDARD) { - std::cout << "\t**" - << "rsmi_dev_compute_partition_reset(" << dv_ind << "): " - << amd::smi::getRSMIStatusString(ret, false) << "\n"; + rsmi_compute_partition_type_t updatePartition = + static_cast( + mapStringToRSMIComputePartitionTypes.at( + std::string(orig_char_computePartition))); + ret = rsmi_dev_compute_partition_set(dv_ind, updatePartition); + + + ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition, 255); + if (strcmp(oldPartition.c_str(), current_char_computePartition) != + 0) { + devicePartitionUpdated = true; + final_partition_state = current_char_computePartition; + } else { + devicePartitionUpdated = false; } - ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) || - (ret == RSMI_STATUS_NOT_SUPPORTED) || - (ret == RSMI_STATUS_BUSY)); - if (ret == RSMI_STATUS_SUCCESS) { - wasResetSuccess = true; - } - ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition, - 255); CHK_ERR_ASRT(ret) IF_VERB(STANDARD) { std::cout << "\t**" << "Current compute partition: " << current_char_computePartition << "\n" + << "\t**" << "Old Partition partition (before setting to original): " + << oldPartition << "\n" << "\t**" << "Original compute partition: " << orig_char_computePartition << "\n" - << "\t**" << "Reset Successful: " - << (wasResetSuccess ? "TRUE" : "FALSE") << "\n" << "\t**" << "Partitions Updated: " << (devicePartitionUpdated ? 
"TRUE" : "FALSE") << "\n"; } @@ -598,7 +598,7 @@ void TestComputePartitionReadWrite::Run(void) { checkPartitionIdChanges(dv_ind, std::string(current_char_computePartition), isVerbose, false); } - if (wasResetSuccess && devicePartitionUpdated) { + if (devicePartitionUpdated) { ASSERT_STRNE(oldPartition.c_str(), current_char_computePartition); IF_VERB(STANDARD) { std::cout << "\t**" diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/memorypartition_read_write.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/memorypartition_read_write.cc index c4d2c761ee..2dbc46007a 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/memorypartition_read_write.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/memorypartition_read_write.cc @@ -111,6 +111,8 @@ mapStringToRSMIMemoryPartitionTypes { }; void TestMemoryPartitionReadWrite::Run(void) { + GTEST_SKIP_("Temporarily disabled"); // Skipped due to SWDEV-491215 - + // will be re-enabled in rocm 6.4 rsmi_status_t ret, err; char orig_memory_partition[255]; char current_memory_partition[255]; @@ -302,13 +304,7 @@ void TestMemoryPartitionReadWrite::Run(void) { << "SETTING ========" << std::endl; } std::string oldMode = current_memory_partition; - bool wasResetSuccess = false; - ret = rsmi_dev_memory_partition_reset(dv_ind); - ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) || - (ret == RSMI_STATUS_NOT_SUPPORTED)); - if (ret == RSMI_STATUS_SUCCESS) { - wasResetSuccess = true; - } + ret = rsmi_dev_memory_partition_get(dv_ind, current_memory_partition, 255); CHK_ERR_ASRT(ret) IF_VERB(STANDARD) { @@ -316,7 +312,7 @@ void TestMemoryPartitionReadWrite::Run(void) { << "Current memory partition: " << current_memory_partition << std::endl; } - if (wasResetSuccess && wasSetSuccess) { + if (wasSetSuccess) { ASSERT_STRNE(oldMode.c_str(), current_memory_partition); IF_VERB(STANDARD) { std::cout << "\t**"