Merge amd-staging into amd-master 20241022

Change-Id: I823ffdba9f1db614542658a2af61df917a44c07a
Signed-off-by: Charis Poag <Charis.Poag@amd.com>


[ROCm/rocm_smi_lib commit: 7504cd04eb]
This commit is contained in:
Charis Poag
2024-10-22 18:22:34 -05:00
8 changed files with 44 additions and 237 deletions
+14 -2
View File
@@ -30,7 +30,15 @@ Users can now view GPU metrics from our new `rocm-smi --showmetrics`. Unlike AMD
### Removals
- N/A
- **Removed `--resetcomputepartition`, and `--resetmemorypartition` options and associated APIs**.
- This change is part of the partition feature redesign.
- The related APIs, `rsmi_dev_compute_partition_reset()` and `rsmi_dev_memory_partition_reset()`, have also been removed.
- **Temporarily disabled C++ tests for `memorypartition_read_write`**.
- This change is part of the partition feature redesign.
- SMI's workflow needs to be adjusted in order to accommodate incoming driver changes to enable
the dynamic memory partition feature. We plan on re-enabling testing for this feature during ROCm
6.4.
### Optimizations
@@ -47,7 +55,11 @@ Users can now view GPU metrics from our new `rocm-smi --showmetrics`. Unlike AMD
### Upcoming changes
- N/A
- **Re-enable C++ tests for `memorypartition_read_write`**.
- This change is part of the partition feature redesign.
- SMI's workflow needs to be adjusted in order to accommodate incoming driver changes to enable
the dynamic memory partition feature. We plan on re-enabling testing for this feature during ROCm
6.4.
## rocm_smi_lib for ROCm 6.2.1
@@ -4116,25 +4116,6 @@ rsmi_status_t
rsmi_dev_compute_partition_set(uint32_t dv_ind,
rsmi_compute_partition_type_t compute_partition);
/**
* @brief Reverts a selected device's compute partition setting back to its
* boot state.
*
* @details Given a device index @p dv_ind , this function will attempt to
* revert its compute partition setting back to its boot state.
*
* @param[in] dv_ind a device index
*
* @retval ::RSMI_STATUS_SUCCESS call was successful
* @retval ::RSMI_STATUS_PERMISSION function requires root access
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function
* @retval ::RSMI_STATUS_BUSY A resource or mutex could not be acquired
* because it is already being used - device is busy
*
*/
rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind);
/**
* @brief Retrieves the partition_id for a desired device
*
@@ -4227,27 +4208,6 @@ rsmi_status_t
rsmi_dev_memory_partition_set(uint32_t dv_ind,
rsmi_memory_partition_type_t memory_partition);
/**
* @brief Reverts a selected device's memory partition setting back to its
* boot state.
*
* @details Given a device index @p dv_ind , this function will attempt to
* revert its current memory partition setting back to its boot state.
*
* @param[in] dv_ind a device index
*
* @retval ::RSMI_STATUS_SUCCESS call was successful
* @retval ::RSMI_STATUS_PERMISSION function requires root access
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function
* @retval ::RSMI_STATUS_AMDGPU_RESTART_ERR could not successfully restart
* the amdgpu driver
* @retval ::RSMI_STATUS_BUSY A resource or mutex could not be acquired
* because it is already being used - device is busy
*
*/
rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind);
/** @} */ // end of memory_partition
/*****************************************************************************/
@@ -15,8 +15,8 @@ LD_LIBRARY_PATH should be set to the folder containing librocm_smi64.
## Version
The SMI will report two "versions", ROCM-SMI version and other is ROCM-SMI-LIB version.
- ROCM-SMI version is the CLI/tool version number with commit ID appended after + sign.
The SMI will report two "versions", ROCM-SMI version and other is ROCM-SMI-LIB version.
- ROCM-SMI version is the CLI/tool version number with commit ID appended after + sign.
- ROCM-SMI-LIB version is the library package version number.
```
ROCM-SMI version: 2.0.0+8e78352
@@ -44,7 +44,7 @@ usage: rocm-smi [-h] [-V] [-d DEVICE [DEVICE ...]] [--alldevices] [--showhw] [-a
[--showtopoaccess] [--showtopoweight] [--showtopohops] [--showtopotype] [--showtoponuma]
[--showenergycounter] [--shownodesbw] [--showcomputepartition] [--showmemorypartition] [-r]
[--resetfans] [--resetprofile] [--resetpoweroverdrive] [--resetxgmierr] [--resetperfdeterminism]
[--resetcomputepartition] [--resetmemorypartition] [--setclock TYPE LEVEL] [--setsclk LEVEL [LEVEL ...]]
[--setclock TYPE LEVEL] [--setsclk LEVEL [LEVEL ...]]
[--setmclk LEVEL [LEVEL ...]] [--setpcie LEVEL [LEVEL ...]] [--setslevel SCLKLEVEL SCLK SVOLT]
[--setmlevel MCLKLEVEL MCLK MVOLT] [--setvc POINT SCLK SVOLT] [--setsrange SCLKMIN SCLKMAX]
[--setextremum min|max sclk|mclk CLK] [--setmrange MCLKMIN MCLKMAX] [--setfan LEVEL]
@@ -185,8 +185,7 @@ Reset options:
state
--resetxgmierr Reset XGMI error count
--resetperfdeterminism Disable performance determinism
--resetcomputepartition Resets to boot compute partition state
--resetmemorypartition Resets to boot memory partition state
Auto-response options:
--autorespond RESPONSE Response to automatically provide for all prompts
@@ -200,8 +199,8 @@ Output options:
```
## Detailed Option Descriptions
`--setextremum <min/max> <sclk or mclk> <value in MHz to set to>`
Provided ASIC support, users can now set a maximum or minimum sclk or mclk value through our Python CLI tool (`rocm-smi --setextremum max sclk 1500`). See example below.
`--setextremum <min/max> <sclk or mclk> <value in MHz to set to>`
Provided ASIC support, users can now set a maximum or minimum sclk or mclk value through our Python CLI tool (`rocm-smi --setextremum max sclk 1500`). See example below.
```shell
$ sudo /opt/rocm/bin/rocm-smi --setextremum max sclk 2100
@@ -1157,72 +1157,6 @@ def resetPerfDeterminism(deviceList):
printLogSpacer()
def resetComputePartition(deviceList):
    """ Reset the compute partition of each device to its boot state

    For every device the partition is read before the reset and, when the
    reset succeeds, read again so both values can be reported.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    # Status codes that get a dedicated, user-friendly message.
    knownFailures = (
        (rsmi_status_t.RSMI_STATUS_PERMISSION, 'Permission denied'),
        (rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED, 'Not supported on the given system'),
        (rsmi_status_t.RSMI_STATUS_BUSY, 'Device is currently busy, try again later'),
    )

    printLogSpacer(" Reset compute partition to its boot state ")
    for device in deviceList:
        partitionBeforeReset = getComputePartition(device)
        ret = rocmsmi.rsmi_dev_compute_partition_reset(device)

        # Silent check first: failures are reported by the branches below.
        if rsmi_ret_ok(ret, device, 'reset_compute_partition', silent=True):
            partitionAfterReset = getComputePartition(device)
            successMsg = ("Successfully reset compute partition (" +
                          partitionBeforeReset + ") to boot state (" +
                          partitionAfterReset + ")")
            printLog(device, successMsg, None)
            continue

        for status, message in knownFailures:
            if ret == status:
                printLog(device, message, None)
                break
        else:
            # Unrecognised status: let rsmi_ret_ok report it (non-silent),
            # then log the generic failure.
            rsmi_ret_ok(ret, device, 'reset_compute_partition')
            printErrLog(device, 'Failed to reset the compute partition to boot state')
    printLogSpacer()
def resetMemoryPartition(deviceList):
    """ Reset current memory partition to its boot state

    While the reset call is in flight, a progress bar is displayed from a
    separate process. Once the call returns, the progress bar process is
    stopped and the outcome is logged per device.

    :param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(" Reset memory partition to its boot state ")
    for device in deviceList:
        originalPartition = getMemoryPartition(device)
        # NOTE(review): 13 is forwarded to showProgressbar unchanged; presumably
        # the expected duration of the progress bar - confirm against its definition.
        t1 = multiprocessing.Process(target=showProgressbar,
                                     args=("Resetting memory partition",13,))
        t1.start()
        start = time.time()
        try:
            ret = rocmsmi.rsmi_dev_memory_partition_reset(device)
            duration = time.time() - start
        finally:
            # Always stop the progress bar process, even if the reset call
            # raises, so no child process is left running.
            if t1.is_alive():
                t1.terminate()
            t1.join()

        # For longer runs, add an extra line before the output.
        # This is to prevent overriding the progress bar.
        addExtraLine = not (duration < float(0.1))

        if rsmi_ret_ok(ret, device, 'reset_memory_partition', silent=True):
            resetBootState = getMemoryPartition(device)
            printLog(device, "Successfully reset memory partition (" +
                     originalPartition + ") to boot state (" +
                     resetBootState + ")", None, addExtraLine)
        elif ret == rsmi_status_t.RSMI_STATUS_PERMISSION:
            printLog(device, 'Permission denied', None, addExtraLine)
        elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
            printLog(device, 'Not supported on the given system', None, addExtraLine)
        elif ret == rsmi_status_t.RSMI_STATUS_BUSY:
            # Pass addExtraLine like the sibling branches so this message
            # does not overwrite the progress bar after a long-running call.
            printLog(device, 'Device is currently busy, try again later',
                     None, addExtraLine)
        else:
            # Unrecognised status: let rsmi_ret_ok report it (non-silent),
            # then log the generic failure.
            rsmi_ret_ok(ret, device, 'reset_memory_partition')
            printErrLog(device, 'Failed to reset memory partition to boot state')
    printLogSpacer()
def setClockRange(deviceList, clkType, minvalue, maxvalue, autoRespond):
""" Set the range for the specified clktype in the PowerPlay table for a list of devices.
@@ -4239,8 +4173,6 @@ if __name__ == '__main__':
action='store_true')
groupActionReset.add_argument('--resetxgmierr', help='Reset XGMI error count', action='store_true')
groupActionReset.add_argument('--resetperfdeterminism', help='Disable performance determinism', action='store_true')
groupActionReset.add_argument('--resetcomputepartition', help='Resets to boot compute partition state', action='store_true')
groupActionReset.add_argument('--resetmemorypartition', help='Resets to boot memory partition state', action='store_true')
groupAction.add_argument('--setclock',
help='Set Clock Frequency Level(s) for specified clock (requires manual Perf level)',
metavar=('TYPE','LEVEL'), nargs=2)
@@ -4328,7 +4260,7 @@ if __name__ == '__main__':
or args.setpoweroverdrive or args.resetpoweroverdrive or args.rasenable or args.rasdisable or \
args.rasinject or args.gpureset or args.setperfdeterminism or args.setslevel or args.setmlevel or \
args.setvc or args.setsrange or args.setextremum or args.setmrange or args.setclock or \
args.setcomputepartition or args.setmemorypartition or args.resetcomputepartition or args.resetmemorypartition:
args.setcomputepartition or args.setmemorypartition:
relaunchAsSudo()
# If there is one or more device specified, use that for all commands, otherwise use a
@@ -4578,10 +4510,6 @@ if __name__ == '__main__':
resetXgmiErr(deviceList)
if args.resetperfdeterminism:
resetPerfDeterminism(deviceList)
if args.resetcomputepartition:
resetComputePartition(deviceList)
if args.resetmemorypartition:
resetMemoryPartition(deviceList)
if args.rasenable:
setRas(deviceList, 'enable', args.rasenable[0], args.rasenable[1])
if args.rasdisable:
@@ -652,11 +652,6 @@ static rsmi_status_t test_set_compute_partitioning(uint32_t dv_ind) {
std::cout << "\n" << "\n";
}
std::cout << "About to initate compute partition reset..." << "\n";
ret = rsmi_dev_compute_partition_reset(dv_ind);
CHK_RSMI_NOT_SUPPORTED_RET(ret)
std::cout << "Done resetting compute partition." << "\n";
std::string myComputePartition = originalComputePartition;
if (myComputePartition.empty() == false) {
std::cout << "Resetting back to original compute partition to "
@@ -709,11 +704,6 @@ static rsmi_status_t test_set_memory_partition(uint32_t dv_ind) {
<< "." << "\n\n\n";
}
std::cout << "About to initate memory partition reset...\n";
ret = rsmi_dev_memory_partition_reset(dv_ind);
CHK_RSMI_NOT_SUPPORTED_RET(ret)
std::cout << "Done resetting memory partition.\n";
std::string myMemPart = originalMemoryPartition;
if (myMemPart.empty() == false) {
std::cout << "Resetting memory partition to " << originalMemoryPartition
+1 -79
View File
@@ -758,7 +758,7 @@ rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) {
* Add domain to full pci_id:
* BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((PARTITION_ID & 0xF) << 28) |
* ((BUS & 0xFF) << 8) | ((DEVICE & 0x1F) <<3 ) | (FUNCTION & 0x7)
*
*
* bits [63:32] = domain
* bits [31:28] or bits [2:0] = partition id
* bits [27:16] = reserved
@@ -5099,84 +5099,6 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition,
CATCH
}
/**
 * @brief Sets a device's compute partition back to the value read as its
 * boot partition state.
 *
 * @details Reads the boot compute partition string for the device. Unless that
 * string is "UNKNOWN", it is converted through
 * mapStringToRSMIComputePartitionTypes and forwarded to
 * rsmi_dev_compute_partition_set(). If the boot state is "UNKNOWN", no set is
 * attempted and the initial ::RSMI_STATUS_NOT_SUPPORTED is returned.
 *
 * @param[in] dv_ind a device index
 *
 * @return ::RSMI_STATUS_NOT_SUPPORTED when the boot state is "UNKNOWN";
 * otherwise the status returned by rsmi_dev_compute_partition_set().
 *
 * NOTE(review): TRY/CATCH, REQUIRE_ROOT_ACCESS, DEVICE_MUTEX and
 * GET_DEV_FROM_INDX are macros defined elsewhere; they presumably can return
 * additional status codes (e.g. permission/busy) - confirm against their
 * definitions.
 */
rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind) {
TRY
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind;
LOG_TRACE(ss);
REQUIRE_ROOT_ACCESS
DEVICE_MUTEX
// NOTE(review): `dev` used below is not declared in this function; presumably
// it is introduced by GET_DEV_FROM_INDX - confirm.
GET_DEV_FROM_INDX
// Default result when no reset can be attempted.
rsmi_status_t ret = RSMI_STATUS_NOT_SUPPORTED;
// Only use 1st index, rest are there in-case of future issues
// NOTE: Partitions sets cause rocm-smi indexes to fluctuate
// since the nodes are grouped in respect to primary node - why we only use
// 1st node/device id to reset
std::string bootState =
dev->readBootPartitionState<rsmi_compute_partition_type_t>(0);
// Initiate reset
// If bootState is UNKNOWN, we cannot reset - return RSMI_STATUS_NOT_SUPPORTED
// Likely due to device not supporting it
if (bootState != "UNKNOWN") {
// NOTE(review): .at() presumably throws for a string that is not a key of
// the map; that would have to be handled by the TRY/CATCH macros - confirm.
rsmi_compute_partition_type_t compute_partition =
mapStringToRSMIComputePartitionTypes.at(bootState);
// NOTE(review): this call is made after DEVICE_MUTEX; verify that
// rsmi_dev_compute_partition_set() can safely be entered from here.
ret = rsmi_dev_compute_partition_set(dv_ind, compute_partition);
}
// Trace the outcome (boot state string and resulting status) before returning.
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Success - if original boot state was not unknown or valid setting"
<< " | Device #: " << dv_ind
<< " | Type: "
<< devInfoTypesStrings.at(amd::smi::kDevComputePartition)
<< " | Data: " << bootState
<< " | Returning = "
<< getRSMIStatusString(ret) << " |";
LOG_TRACE(ss);
return ret;
CATCH
}
/**
 * @brief Sets a device's memory partition back to the value read as its
 * boot partition state.
 *
 * @details Reads the boot memory partition string for the device. Unless that
 * string is "UNKNOWN", it is converted through mapStringToMemoryPartitionTypes
 * and forwarded to rsmi_dev_memory_partition_set(). If the boot state is
 * "UNKNOWN", no set is attempted and the initial ::RSMI_STATUS_NOT_SUPPORTED
 * is returned.
 *
 * @param[in] dv_ind a device index
 *
 * @return ::RSMI_STATUS_NOT_SUPPORTED when the boot state is "UNKNOWN";
 * otherwise the status returned by rsmi_dev_memory_partition_set().
 *
 * NOTE(review): TRY/CATCH, REQUIRE_ROOT_ACCESS, DEVICE_MUTEX and
 * GET_DEV_FROM_INDX are macros defined elsewhere; they presumably can return
 * additional status codes (e.g. permission/busy) - confirm against their
 * definitions.
 */
rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind) {
TRY
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << "| ======= start =======, " << dv_ind;
LOG_TRACE(ss);
REQUIRE_ROOT_ACCESS
DEVICE_MUTEX
// NOTE(review): `dev` used below is not declared in this function; presumably
// it is introduced by GET_DEV_FROM_INDX - confirm.
GET_DEV_FROM_INDX
// Default result when no reset can be attempted.
rsmi_status_t ret = RSMI_STATUS_NOT_SUPPORTED;
// Only use 1st index, rest are there in-case of future issues
// NOTE: Partitions sets cause rocm-smi indexes to fluctuate.
// Since the nodes are grouped in respect to primary node - why we only use
// 1st node/device id to reset
std::string bootState =
dev->readBootPartitionState<rsmi_memory_partition_type_t>(0);
// Initiate reset
// If bootState is UNKNOWN, we cannot reset - return RSMI_STATUS_NOT_SUPPORTED
// Likely due to device not supporting it
if (bootState != "UNKNOWN") {
// NOTE(review): .at() presumably throws for a string that is not a key of
// the map; that would have to be handled by the TRY/CATCH macros - confirm.
rsmi_memory_partition_type_t memory_partition =
mapStringToMemoryPartitionTypes.at(bootState);
// NOTE(review): this call is made after DEVICE_MUTEX; verify that
// rsmi_dev_memory_partition_set() can safely be entered from here.
ret = rsmi_dev_memory_partition_set(dv_ind, memory_partition);
}
// Trace the outcome (boot state string and resulting status) before returning.
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Success - if original boot state was not unknown or valid setting"
<< " | Device #: " << dv_ind
<< " | Type: "
<< devInfoTypesStrings.at(amd::smi::kDevMemoryPartition)
<< " | Data: " << bootState
<< " | Returning = "
<< getRSMIStatusString(ret) << " |";
LOG_TRACE(ss);
return ret;
CATCH
}
rsmi_status_t
rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) {
TRY
@@ -428,7 +428,7 @@ void TestComputePartitionReadWrite::Run(void) {
* [0:SPX, 1:CPX, 2:CPX, 3:CPX, 4:CPX, 5:CPX, 6:SPX, 7:SPX] <- set 1 to CPX
* [0:SPX, 1:SPX, 2:SPX, 3:SPX] <- reset(1)
* ...
*
*
*/
std::string final_partition_state = "UNKNOWN";
@@ -564,29 +564,29 @@ void TestComputePartitionReadWrite::Run(void) {
<< "========" << std::endl;
}
std::string oldPartition = current_char_computePartition;
bool wasResetSuccess = false;
ret = rsmi_dev_compute_partition_reset(dv_ind);
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "rsmi_dev_compute_partition_reset(" << dv_ind << "): "
<< amd::smi::getRSMIStatusString(ret, false) << "\n";
rsmi_compute_partition_type_t updatePartition =
static_cast<rsmi_compute_partition_type_t>(
mapStringToRSMIComputePartitionTypes.at(
std::string(orig_char_computePartition)));
ret = rsmi_dev_compute_partition_set(dv_ind, updatePartition);
ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition, 255);
if (strcmp(oldPartition.c_str(), current_char_computePartition) !=
0) {
devicePartitionUpdated = true;
final_partition_state = current_char_computePartition;
} else {
devicePartitionUpdated = false;
}
ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) ||
(ret == RSMI_STATUS_NOT_SUPPORTED) ||
(ret == RSMI_STATUS_BUSY));
if (ret == RSMI_STATUS_SUCCESS) {
wasResetSuccess = true;
}
ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition,
255);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "\t**" << "Current compute partition: "
<< current_char_computePartition << "\n"
<< "\t**" << "Old Partition partition (before setting to original): "
<< oldPartition << "\n"
<< "\t**" << "Original compute partition: "
<< orig_char_computePartition << "\n"
<< "\t**" << "Reset Successful: "
<< (wasResetSuccess ? "TRUE" : "FALSE") << "\n"
<< "\t**" << "Partitions Updated: "
<< (devicePartitionUpdated ? "TRUE" : "FALSE") << "\n";
}
@@ -598,7 +598,7 @@ void TestComputePartitionReadWrite::Run(void) {
checkPartitionIdChanges(dv_ind, std::string(current_char_computePartition),
isVerbose, false);
}
if (wasResetSuccess && devicePartitionUpdated) {
if (devicePartitionUpdated) {
ASSERT_STRNE(oldPartition.c_str(), current_char_computePartition);
IF_VERB(STANDARD) {
std::cout << "\t**"
@@ -111,6 +111,8 @@ mapStringToRSMIMemoryPartitionTypes {
};
void TestMemoryPartitionReadWrite::Run(void) {
GTEST_SKIP_("Temporarily disabled"); // Skipped due to SWDEV-491215 -
// will be re-enabled in rocm 6.4
rsmi_status_t ret, err;
char orig_memory_partition[255];
char current_memory_partition[255];
@@ -302,13 +304,7 @@ void TestMemoryPartitionReadWrite::Run(void) {
<< "SETTING ========" << std::endl;
}
std::string oldMode = current_memory_partition;
bool wasResetSuccess = false;
ret = rsmi_dev_memory_partition_reset(dv_ind);
ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) ||
(ret == RSMI_STATUS_NOT_SUPPORTED));
if (ret == RSMI_STATUS_SUCCESS) {
wasResetSuccess = true;
}
ret = rsmi_dev_memory_partition_get(dv_ind, current_memory_partition, 255);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
@@ -316,7 +312,7 @@ void TestMemoryPartitionReadWrite::Run(void) {
<< "Current memory partition: " << current_memory_partition
<< std::endl;
}
if (wasResetSuccess && wasSetSuccess) {
if (wasSetSuccess) {
ASSERT_STRNE(oldMode.c_str(), current_memory_partition);
IF_VERB(STANDARD) {
std::cout << "\t**"