[SWDEV-335697] Add RSMI_STATUS_SETTING_UNAVAILABLE for dynamic partition

Updates:
    * Added RSMI_STATUS_SETTING_UNAVAILABLE for
      rsmi_dev_compute_partition_set - gives users
      better error output when attempting to set
      compute partition to values not listed in
      available_compute_partition SYSFS
    * Updated python --setcomputepartition to
      provide better output when receiving
      RSMI_STATUS_SETTING_UNAVAILABLE
    * Updated all test & example files to check for
      RSMI_STATUS_SETTING_UNAVAILABLE when doing
      rsmi_dev_compute_partition_set

Change-Id: Ida5d54880d9b9b6e4a0468cdb962fdc0c18d6257
Signed-off-by: Charis Poag <Charis.Poag@amd.com>


[ROCm/amdsmi commit: 0d3558945b]
Этот коммит содержится в:
Charis Poag
2023-02-20 15:16:06 -06:00
родитель c4d64a56d8
Коммит ff26973e15
11 изменённых файлов: 119 добавлений и 30 удалений
+4
Просмотреть файл
@@ -126,6 +126,8 @@ typedef enum {
//!< being used
RSMI_STATUS_REFCOUNT_OVERFLOW, //!< An internal reference counter
//!< exceeded INT32_MAX
RSMI_STATUS_SETTING_UNAVAILABLE, //!< Requested setting is unavailable
//!< for the current device
RSMI_STATUS_AMDGPU_RESTART_ERR, //!< Could not successfully restart
//!< the amdgpu driver
@@ -3572,6 +3574,8 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition,
* @retval ::RSMI_STATUS_SUCCESS call was successful
* @retval ::RSMI_STATUS_PERMISSION function requires root access
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
* @retval ::RSMI_STATUS_SETTING_UNAVAILABLE the provided setting is
* unavailable for current device
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function
*
+1
Просмотреть файл
@@ -162,6 +162,7 @@ enum DevInfoTypes {
kDevNumaNode,
kDevGpuMetrics,
kDevGpuReset,
kDevAvailableComputePartition,
kDevComputePartition,
kDevMemoryPartition
};
+3
Просмотреть файл
@@ -1421,6 +1421,9 @@ def setComputePartition(deviceList, computePartitionType):
None)
elif ret == rsmi_status_t.RSMI_STATUS_PERMISSION:
printLog(device, 'Permission denied', None)
elif ret == rsmi_status_t.RSMI_STATUS_SETTING_UNAVAILABLE:
printLog(device, 'Requested setting (%s) is unavailable for current device'
%computePartitionType, None)
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
printLog(device, 'Not supported on the given system', None)
else:
+3 -1
Просмотреть файл
@@ -69,7 +69,8 @@ class rsmi_status_t(c_int):
RSMI_STATUS_UNEXPECTED_DATA = 0xF
RSMI_STATUS_BUSY = 0x10
RSMI_STATUS_REFCOUNT_OVERFLOW = 0x11
RSMI_STATUS_AMDGPU_RESTART_ERR = 0x12
RSMI_STATUS_SETTING_UNAVAILABLE = 0x12
RSMI_STATUS_AMDGPU_RESTART_ERR = 0x13
RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF
@@ -93,6 +94,7 @@ rsmi_status_verbose_err_out = {
rsmi_status_t.RSMI_STATUS_UNEXPECTED_DATA: 'Unexpected data received',
rsmi_status_t.RSMI_STATUS_BUSY: 'Busy - resources are preventing call the ability to execute',
rsmi_status_t.RSMI_STATUS_REFCOUNT_OVERFLOW: 'Data overflow - data exceeded INT32_MAX',
rsmi_status_t.RSMI_STATUS_SETTING_UNAVAILABLE: 'Requested setting is unavailable for current device',
rsmi_status_t.RSMI_STATUS_AMDGPU_RESTART_ERR: 'Could not successfully restart the amdgpu driver',
rsmi_status_t.RSMI_STATUS_UNKNOWN_ERROR: 'Unknown error occured'
}
Двоичный файл не отображается.
+15 -4
Просмотреть файл
@@ -114,6 +114,18 @@
} \
}
// Checks the status returned by an RSMI call that is allowed to report either
// "not supported" or "setting unavailable" without failing the example:
//   - RSMI_STATUS_NOT_SUPPORTED       -> prints a notice and falls through.
//   - RSMI_STATUS_SETTING_UNAVAILABLE -> prints a [WARN] line and falls through.
//   - any other value                 -> forwarded to CHK_RSMI_RET, which is
//                                        defined outside this excerpt.
// RET is expanded more than once, so pass a plain variable rather than an
// expression with side effects. The macro expands to a brace-enclosed block.
#define CHK_RSMI_NOT_SUPPORTED_OR_SETTING_UNAVAILABLE_RET(RET) {\
if ((RET) == RSMI_STATUS_NOT_SUPPORTED) { \
std::cout << "This function is not supported in the current environment."\
<< std::endl; \
} else if ((RET) == RSMI_STATUS_SETTING_UNAVAILABLE) { \
std::cout << "[WARN] RSMI_STATUS_SETTING_UNAVAILABLE retrieved." \
<< std::endl; \
} else { \
CHK_RSMI_RET(RET) \
} \
}
#define CHK_NOT_SUPPORTED_OR_UNEXPECTED_DATA_OR_INSUFFICIENT_SIZE_RET(RET) { \
if ((RET) == RSMI_STATUS_NOT_SUPPORTED) { \
std::cout << "This function is not supported in the current environment." \
@@ -570,10 +582,9 @@ static rsmi_status_t test_set_compute_partitioning(uint32_t dv_ind) {
<< compute_partition_string(newPartition) << "..."
<< std::endl;
ret = rsmi_dev_compute_partition_set(dv_ind, newPartition);
CHK_RSMI_NOT_SUPPORTED_RET(ret)
CHK_RSMI_NOT_SUPPORTED_OR_SETTING_UNAVAILABLE_RET(ret)
std::cout << "Done setting compute partition to "
<< compute_partition_string(newPartition)
<< "." << std::endl;
<< compute_partition_string(newPartition) << "." << std::endl;
std::cout << std::endl << std::endl;
}
@@ -589,7 +600,7 @@ static rsmi_status_t test_set_compute_partitioning(uint32_t dv_ind) {
rsmi_compute_partition_type origComputePartitionType
= mapStringToRSMIComputePartitionTypes[originalComputePartition];
ret = rsmi_dev_compute_partition_set(dv_ind, origComputePartitionType);
CHK_RSMI_NOT_SUPPORTED_RET(ret)
CHK_RSMI_NOT_SUPPORTED_OR_SETTING_UNAVAILABLE_RET(ret)
std::cout << "Done" << std::endl;
}
return RSMI_STATUS_SUCCESS;
+38 -6
Просмотреть файл
@@ -2865,6 +2865,11 @@ rsmi_status_string(rsmi_status_t status, const char **status_string) {
"counter exceeded INT32_MAX";
break;
case RSMI_STATUS_SETTING_UNAVAILABLE:
*status_string = "RSMI_STATUS_SETTING_UNAVAILABLE: Requested setting is "
"unavailable for the current device";
break;
case RSMI_STATUS_AMDGPU_RESTART_ERR:
*status_string = "RSMI_STATUS_AMDGPU_RESTART_ERR: Could not successfully "
"restart the amdgpu driver";
@@ -3751,17 +3756,16 @@ static rsmi_status_t
get_compute_partition(uint32_t dv_ind, std::string &compute_partition) {
TRY
CHK_SUPPORT_NAME_ONLY(compute_partition.c_str())
std::string val_str;
std::string compute_partition_str;
DEVICE_MUTEX
rsmi_status_t ret = get_dev_value_str(amd::smi::kDevComputePartition,
dv_ind, &val_str);
dv_ind, &compute_partition_str);
if (ret != RSMI_STATUS_SUCCESS) {
return ret;
}
switch (mapStringToRSMIComputePartitionTypes[val_str]) {
switch (mapStringToRSMIComputePartitionTypes[compute_partition_str]) {
case RSMI_COMPUTE_PARTITION_INVALID:
// Retrieved an unknown compute partition
return RSMI_STATUS_UNEXPECTED_DATA;
@@ -3779,7 +3783,7 @@ get_compute_partition(uint32_t dv_ind, std::string &compute_partition) {
// Retrieved an unknown compute partition
return RSMI_STATUS_UNEXPECTED_DATA;
}
compute_partition = val_str;
compute_partition = compute_partition_str;
return RSMI_STATUS_SUCCESS;
CATCH
}
@@ -3809,13 +3813,33 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition,
CATCH
}
// Reports whether the requested compute partition string appears in the
// device's kDevAvailableComputePartition attribute.
//
// dv_ind                 index of the device to query.
// new_compute_partition  partition name the caller wants to switch to.
//
// Returns RSMI_STATUS_SUCCESS when the attribute text contains the requested
// name, RSMI_STATUS_SETTING_UNAVAILABLE when it does not, or the status from
// get_dev_value_line unchanged if the attribute could not be read.
static rsmi_status_t
is_available_compute_partition(uint32_t dv_ind,
                               std::string new_compute_partition) {
  TRY
  DEVICE_MUTEX
  std::string available_partitions;
  rsmi_status_t read_status = get_dev_value_line(
      amd::smi::kDevAvailableComputePartition, dv_ind, &available_partitions);
  if (read_status != RSMI_STATUS_SUCCESS) {
    // Pass the read failure straight back to the caller.
    return read_status;
  }

  if (amd::smi::containsString(available_partitions,
                               new_compute_partition)) {
    return RSMI_STATUS_SUCCESS;
  }
  return RSMI_STATUS_SETTING_UNAVAILABLE;
  CATCH
}
rsmi_status_t
rsmi_dev_compute_partition_set(uint32_t dv_ind,
rsmi_compute_partition_type_t compute_partition) {
TRY
REQUIRE_ROOT_ACCESS
DEVICE_MUTEX
std::string newComputePartitionStr
= mapRSMIToStringComputePartitionTypes[compute_partition];
std::string currentComputePartition;
@@ -3838,6 +3862,14 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
return RSMI_STATUS_INVALID_ARGS;
}
// Confirm what we are trying to set is available, otherwise provide
// RSMI_STATUS_SETTING_UNAVAILABLE
rsmi_status_t available_ret =
is_available_compute_partition(dv_ind, newComputePartitionStr);
if (available_ret != RSMI_STATUS_SUCCESS) {
return available_ret;
}
// do nothing if compute_partition is the current compute partition
rsmi_status_t ret_get = get_compute_partition(dv_ind, currentComputePartition);
// we can try to set, even if we get unexpected data
+4
Просмотреть файл
@@ -122,6 +122,8 @@ static const char *kDevXGMIErrorFName = "xgmi_error";
static const char *kDevSerialNumberFName = "serial_number";
static const char *kDevNumaNodeFName = "numa_node";
static const char *kDevGpuMetricsFName = "gpu_metrics";
static const char *kDevAvailableComputePartitionFName =
"available_compute_partition";
static const char *kDevComputePartitionFName = "current_compute_partition";
static const char *kDevMemoryPartitionFName = "current_memory_partition";
@@ -293,6 +295,7 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
{kDevNumaNode, kDevNumaNodeFName},
{kDevGpuMetrics, kDevGpuMetricsFName},
{kDevGpuReset, kDevGpuResetFName},
{kDevAvailableComputePartition, kDevAvailableComputePartitionFName},
{kDevComputePartition, kDevComputePartitionFName},
{kDevMemoryPartition, kDevMemoryPartitionFName},
};
@@ -930,6 +933,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) {
case kDevVBiosVer:
case kDevPCIEThruPut:
case kDevSerialNumber:
case kDevAvailableComputePartition:
case kDevComputePartition:
case kDevMemoryPartition:
return readDevInfoStr(type, val);
+4 -1
Просмотреть файл
@@ -75,7 +75,8 @@ static const char *kDeviceNamePrefix = "card";
static const char *kAMDMonitorTypes[] = {"radeon", "amdgpu", ""};
static const std::string amdSMI = "amd::smi::";
const std::map<amd::smi::DevInfoTypes, std::string> amd::smi::RocmSMI::devInfoTypesStrings = {
const std::map<amd::smi::DevInfoTypes, std::string>
amd::smi::RocmSMI::devInfoTypesStrings = {
{amd::smi::kDevPerfLevel, amdSMI + "kDevPerfLevel"},
{amd::smi::kDevOverDriveLevel, amdSMI + "kDevOverDriveLevel"},
{amd::smi::kDevMemOverDriveLevel, amdSMI + "kDevMemOverDriveLevel"},
@@ -142,6 +143,8 @@ const std::map<amd::smi::DevInfoTypes, std::string> amd::smi::RocmSMI::devInfoTy
{amd::smi::kDevNumaNode, amdSMI + "kDevNumaNode"},
{amd::smi::kDevGpuMetrics, amdSMI + "kDevGpuMetrics"},
{amd::smi::kDevGpuReset, amdSMI + "kDevGpuReset"},
{amd::smi::kDevAvailableComputePartition, amdSMI +
"kDevAvailableComputePartition"},
{amd::smi::kDevComputePartition, amdSMI + "kDevComputePartition"},
{amd::smi::kDevMemoryPartition, amdSMI + "kDevMemoryPartition"}
};
+1 -1
Просмотреть файл
@@ -390,7 +390,7 @@ std::string readTemporaryFile(std::string path) {
return fileContent;
}
// Used to debug application temporary files (idenified by kTmpFilePrefix)
// Used to debug application temporary files (identified by kTmpFilePrefix)
// and their content
void displayAppTmpFilesContent() {
std::vector<std::string> tmpFiles = getListOfAppTmpFiles();
+46 -17
Просмотреть файл
@@ -174,8 +174,8 @@ void TestComputePartitionReadWrite::Run(void) {
IF_VERB(STANDARD) {
if (err == RSMI_STATUS_INSUFFICIENT_SIZE) {
std::cout << "\t**"
<< "Confirmed RSMI_STATUS_INSUFFICIENT_SIZE was returned "
<< "and size matches length requested." << std::endl;
<< "Confirmed RSMI_STATUS_INSUFFICIENT_SIZE was returned"
<< "\n\t and size matches length requested." << std::endl;
}
}
@@ -275,19 +275,48 @@ void TestComputePartitionReadWrite::Run(void) {
<< computePartitionString(new_computePartition) << std::endl;
}
ret = rsmi_dev_compute_partition_set(dv_ind, new_computePartition);
CHK_ERR_ASRT(ret)
ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition,
255);
CHK_ERR_ASRT(ret)
bool isSettingUnavailable = false;
ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) ||
(ret == RSMI_STATUS_SETTING_UNAVAILABLE));
if (ret == RSMI_STATUS_SETTING_UNAVAILABLE) {
isSettingUnavailable = true;
}
rsmi_status_t retGet =
rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition,
255);
CHK_ERR_ASRT(retGet)
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Current compute partition: "
<< current_char_computePartition
<< std::endl;
}
EXPECT_EQ(RSMI_STATUS_SUCCESS, ret);
EXPECT_STREQ(computePartitionString(new_computePartition).c_str(),
current_char_computePartition);
if (isSettingUnavailable) {
ASSERT_EQ(RSMI_STATUS_SETTING_UNAVAILABLE, ret);
ASSERT_STRNE(computePartitionString(new_computePartition).c_str(),
current_char_computePartition);
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Confirmed after receiving "
<< "RSMI_STATUS_SETTING_UNAVAILABLE,\n\t current compute "
<< "partition (" << current_char_computePartition
<< ") did not update to ("
<< computePartitionString(new_computePartition) << ")"
<< std::endl;
}
} else {
ASSERT_EQ(RSMI_STATUS_SUCCESS, ret);
ASSERT_STREQ(computePartitionString(new_computePartition).c_str(),
current_char_computePartition);
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Confirmed current compute partition ("
<< current_char_computePartition << ") matches"
<< "\n\t requested compute partition ("
<< computePartitionString(new_computePartition) << ")"
<< std::endl;
}
}
}
/* TEST RETURN TO BOOT COMPUTE PARTITION SETTING */
@@ -309,15 +338,15 @@ void TestComputePartitionReadWrite::Run(void) {
255);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Current compute partition: " << current_char_computePartition << std::endl;
std::cout << "\t**" << "Current compute partition: "
<< current_char_computePartition << std::endl;
}
if (wasResetSuccess) {
ASSERT_STRNE(oldPartition.c_str(), current_char_computePartition);
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Confirmed prior partition (" << oldPartition << ") is not "
<< "equal to current partition ("
<< "equal to current\n\t partition ("
<< current_char_computePartition << ")" << std::endl;
}
} else {
@@ -325,16 +354,16 @@ void TestComputePartitionReadWrite::Run(void) {
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Confirmed prior partition (" << oldPartition << ") is equal"
<< " to current partition ("
<< " to current\n\t partition ("
<< current_char_computePartition << ")" << std::endl;
}
}
/* TEST RETURN TO ORIGINAL COMPUTE PARTITIONING SETTING */
/* TEST RETURN TO ORIGINAL COMPUTE PARTITION SETTING */
IF_VERB(STANDARD) {
std::cout << std::endl;
std::cout << "\t**"
<< "=========== TEST RETURN TO ORIGINAL COMPUTE PARTITIONING "
<< "=========== TEST RETURN TO ORIGINAL COMPUTE PARTITION "
<< "SETTING ========" << std::endl;
}
new_computePartition
@@ -351,8 +380,8 @@ void TestComputePartitionReadWrite::Run(void) {
IF_VERB(STANDARD) {
std::cout << "\t**" << "Attempted to set compute partition: "
<< computePartitionString(new_computePartition) << std::endl
<< "\t**"
<< "Current compute partition: " << current_char_computePartition
<< "\t**" << "Current compute partition: "
<< current_char_computePartition
<< std::endl;
}
EXPECT_EQ(RSMI_STATUS_SUCCESS, ret);