diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index 3a80112264..d690050783 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -126,6 +126,8 @@ typedef enum { //!< being used RSMI_STATUS_REFCOUNT_OVERFLOW, //!< An internal reference counter //!< exceeded INT32_MAX + RSMI_STATUS_SETTING_UNAVAILABLE, //!< Requested setting is unavailable + //!< for the current device RSMI_STATUS_AMDGPU_RESTART_ERR, //!< Could not successfully restart //!< the amdgpu driver @@ -3572,6 +3574,8 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition, * @retval ::RSMI_STATUS_SUCCESS call was successful * @retval ::RSMI_STATUS_PERMISSION function requires root access * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::RSMI_STATUS_SETTING_UNAVAILABLE the provided setting is + * unavailable for current device * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not * support this function * diff --git a/include/rocm_smi/rocm_smi_device.h b/include/rocm_smi/rocm_smi_device.h index e3c94164ac..c975baae55 100755 --- a/include/rocm_smi/rocm_smi_device.h +++ b/include/rocm_smi/rocm_smi_device.h @@ -162,6 +162,7 @@ enum DevInfoTypes { kDevNumaNode, kDevGpuMetrics, kDevGpuReset, + kDevAvailableComputePartition, kDevComputePartition, kDevMemoryPartition }; diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 743ed16107..f99b7f3261 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -1421,6 +1421,9 @@ def setComputePartition(deviceList, computePartitionType): None) elif ret == rsmi_status_t.RSMI_STATUS_PERMISSION: printLog(device, 'Permission denied', None) + elif ret == rsmi_status_t.RSMI_STATUS_SETTING_UNAVAILABLE: + printLog(device, 'Requested setting (%s) is unavailable for current device' + %computePartitionType, None) elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: printLog(device, 'Not supported on the given system', None) else: diff --git a/python_smi_tools/rsmiBindings.py b/python_smi_tools/rsmiBindings.py index 8f70b63d90..d3a81ee411 100644 --- a/python_smi_tools/rsmiBindings.py +++ b/python_smi_tools/rsmiBindings.py @@ -69,7 +69,8 @@ class rsmi_status_t(c_int): RSMI_STATUS_UNEXPECTED_DATA = 0xF RSMI_STATUS_BUSY = 0x10 RSMI_STATUS_REFCOUNT_OVERFLOW = 0x11 - RSMI_STATUS_AMDGPU_RESTART_ERR = 0x12 + RSMI_STATUS_SETTING_UNAVAILABLE = 0x12 + RSMI_STATUS_AMDGPU_RESTART_ERR = 0x13 RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF @@ -93,6 +94,7 @@ rsmi_status_verbose_err_out = { rsmi_status_t.RSMI_STATUS_UNEXPECTED_DATA: 'Unexpected data received', rsmi_status_t.RSMI_STATUS_BUSY: 'Busy - resources are preventing call the ability to execute', rsmi_status_t.RSMI_STATUS_REFCOUNT_OVERFLOW: 'Data overflow - data exceeded INT32_MAX', + rsmi_status_t.RSMI_STATUS_SETTING_UNAVAILABLE: 'Requested setting is unavailable for current device', rsmi_status_t.RSMI_STATUS_AMDGPU_RESTART_ERR: 'Could not successfully restart the amdgpu driver', rsmi_status_t.RSMI_STATUS_UNKNOWN_ERROR: 'Unknown error occured' } diff --git a/rocm_smi/docs/ROCm_SMI_Manual.pdf b/rocm_smi/docs/ROCm_SMI_Manual.pdf index 523a91b2ba..3a3ec7aa6f 100644 Binary files a/rocm_smi/docs/ROCm_SMI_Manual.pdf and b/rocm_smi/docs/ROCm_SMI_Manual.pdf differ diff --git a/rocm_smi/example/rocm_smi_example.cc b/rocm_smi/example/rocm_smi_example.cc index a9fcd6c801..3390fb573e 100755 --- a/rocm_smi/example/rocm_smi_example.cc +++ b/rocm_smi/example/rocm_smi_example.cc @@ -114,6 +114,18 @@ } \ } +#define CHK_RSMI_NOT_SUPPORTED_OR_SETTING_UNAVAILABLE_RET(RET) {\ + if ((RET) == RSMI_STATUS_NOT_SUPPORTED) { \ + std::cout << "This function is not supported in the current environment."\ + << std::endl; \ + } else if ((RET) == RSMI_STATUS_SETTING_UNAVAILABLE) { \ + std::cout << "[WARN] RSMI_STATUS_SETTING_UNAVAILABLE retrieved." \ + << std::endl; \ + } else { \ + CHK_RSMI_RET(RET) \ + } \ +} + #define CHK_NOT_SUPPORTED_OR_UNEXPECTED_DATA_OR_INSUFFICIENT_SIZE_RET(RET) { \ if ((RET) == RSMI_STATUS_NOT_SUPPORTED) { \ std::cout << "This function is not supported in the current environment." \ @@ -570,10 +582,9 @@ static rsmi_status_t test_set_compute_partitioning(uint32_t dv_ind) { << compute_partition_string(newPartition) << "..." << std::endl; ret = rsmi_dev_compute_partition_set(dv_ind, newPartition); - CHK_RSMI_NOT_SUPPORTED_RET(ret) + CHK_RSMI_NOT_SUPPORTED_OR_SETTING_UNAVAILABLE_RET(ret) std::cout << "Done setting compute partition to " - << compute_partition_string(newPartition) - << "." << std::endl; + << compute_partition_string(newPartition) << "." << std::endl; std::cout << std::endl << std::endl; } @@ -589,7 +600,7 @@ static rsmi_status_t test_set_compute_partitioning(uint32_t dv_ind) { rsmi_compute_partition_type origComputePartitionType = mapStringToRSMIComputePartitionTypes[originalComputePartition]; ret = rsmi_dev_compute_partition_set(dv_ind, origComputePartitionType); - CHK_RSMI_NOT_SUPPORTED_RET(ret) + CHK_RSMI_NOT_SUPPORTED_OR_SETTING_UNAVAILABLE_RET(ret) std::cout << "Done" << std::endl; } return RSMI_STATUS_SUCCESS; diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 5bf50f2b87..19e4724fe5 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -2865,6 +2865,11 @@ rsmi_status_string(rsmi_status_t status, const char **status_string) { "counter exceeded INT32_MAX"; break; + case RSMI_STATUS_SETTING_UNAVAILABLE: + *status_string = "RSMI_STATUS_SETTING_UNAVAILABLE: Requested setting is " + "unavailable for the current device"; + break; + case RSMI_STATUS_AMDGPU_RESTART_ERR: *status_string = "RSMI_STATUS_AMDGPU_RESTART_ERR: Could not successfully " "restart the amdgpu driver"; @@ -3751,17 +3756,16 @@ static rsmi_status_t get_compute_partition(uint32_t dv_ind, std::string &compute_partition) { TRY CHK_SUPPORT_NAME_ONLY(compute_partition.c_str()) - std::string val_str; + std::string compute_partition_str; DEVICE_MUTEX rsmi_status_t ret = get_dev_value_str(amd::smi::kDevComputePartition, - dv_ind, &val_str); - + dv_ind, &compute_partition_str); if (ret != RSMI_STATUS_SUCCESS) { return ret; } - switch (mapStringToRSMIComputePartitionTypes[val_str]) { + switch (mapStringToRSMIComputePartitionTypes[compute_partition_str]) { case RSMI_COMPUTE_PARTITION_INVALID: // Retrieved an unknown compute partition return RSMI_STATUS_UNEXPECTED_DATA; @@ -3779,7 +3783,7 @@ get_compute_partition(uint32_t dv_ind, std::string &compute_partition) { // Retrieved an unknown compute partition return RSMI_STATUS_UNEXPECTED_DATA; } - compute_partition = val_str; + compute_partition = compute_partition_str; return RSMI_STATUS_SUCCESS; CATCH } @@ -3809,13 +3813,33 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition, CATCH } +static rsmi_status_t +is_available_compute_partition(uint32_t dv_ind, + std::string new_compute_partition) { + TRY + DEVICE_MUTEX + std::string availableComputePartitions; + rsmi_status_t ret = + get_dev_value_line(amd::smi::kDevAvailableComputePartition, + dv_ind, &availableComputePartitions); + if (ret != RSMI_STATUS_SUCCESS) { + return ret; + } + + bool isComputePartitionAvailable = + amd::smi::containsString(availableComputePartitions, + new_compute_partition); + return (isComputePartitionAvailable) ? RSMI_STATUS_SUCCESS : + RSMI_STATUS_SETTING_UNAVAILABLE; + CATCH +} + rsmi_status_t rsmi_dev_compute_partition_set(uint32_t dv_ind, rsmi_compute_partition_type_t compute_partition) { TRY REQUIRE_ROOT_ACCESS DEVICE_MUTEX - std::string newComputePartitionStr = mapRSMIToStringComputePartitionTypes[compute_partition]; std::string currentComputePartition; @@ -3838,6 +3862,14 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, return RSMI_STATUS_INVALID_ARGS; } + // Confirm what we are trying to set is available, otherwise provide + // RSMI_STATUS_SETTING_UNAVAILABLE + rsmi_status_t available_ret = + is_available_compute_partition(dv_ind, newComputePartitionStr); + if (available_ret != RSMI_STATUS_SUCCESS) { + return available_ret; + } + // do nothing if compute_partition is the current compute partition rsmi_status_t ret_get = get_compute_partition(dv_ind, currentComputePartition); // we can try to set, even if we get unexpected data diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc index e8e89aa9cb..175fba12d3 100755 --- a/src/rocm_smi_device.cc +++ b/src/rocm_smi_device.cc @@ -122,6 +122,8 @@ static const char *kDevXGMIErrorFName = "xgmi_error"; static const char *kDevSerialNumberFName = "serial_number"; static const char *kDevNumaNodeFName = "numa_node"; static const char *kDevGpuMetricsFName = "gpu_metrics"; +static const char *kDevAvailableComputePartitionFName = + "available_compute_partition"; static const char *kDevComputePartitionFName = "current_compute_partition"; static const char *kDevMemoryPartitionFName = "current_memory_partition"; @@ -293,6 +295,7 @@ static const std::map kDevAttribNameMap = { {kDevNumaNode, kDevNumaNodeFName}, {kDevGpuMetrics, kDevGpuMetricsFName}, {kDevGpuReset, kDevGpuResetFName}, + {kDevAvailableComputePartition, kDevAvailableComputePartitionFName}, {kDevComputePartition, kDevComputePartitionFName}, {kDevMemoryPartition, kDevMemoryPartitionFName}, }; @@ -930,6 +933,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) { case kDevVBiosVer: case kDevPCIEThruPut: case kDevSerialNumber: + case kDevAvailableComputePartition: case kDevComputePartition: case kDevMemoryPartition: return readDevInfoStr(type, val); diff --git a/src/rocm_smi_main.cc b/src/rocm_smi_main.cc index 187b25241e..16cd336e2a 100755 --- a/src/rocm_smi_main.cc +++ b/src/rocm_smi_main.cc @@ -75,7 +75,8 @@ static const char *kDeviceNamePrefix = "card"; static const char *kAMDMonitorTypes[] = {"radeon", "amdgpu", ""}; static const std::string amdSMI = "amd::smi::"; -const std::map amd::smi::RocmSMI::devInfoTypesStrings = { +const std::map +amd::smi::RocmSMI::devInfoTypesStrings = { {amd::smi::kDevPerfLevel, amdSMI + "kDevPerfLevel"}, {amd::smi::kDevOverDriveLevel, amdSMI + "kDevOverDriveLevel"}, {amd::smi::kDevMemOverDriveLevel, amdSMI + "kDevMemOverDriveLevel"}, @@ -142,6 +143,8 @@ const std::map amd::smi::RocmSMI::devInfoTy {amd::smi::kDevNumaNode, amdSMI + "kDevNumaNode"}, {amd::smi::kDevGpuMetrics, amdSMI + "kDevGpuMetrics"}, {amd::smi::kDevGpuReset, amdSMI + "kDevGpuReset"}, + {amd::smi::kDevAvailableComputePartition, amdSMI + + "kDevAvailableComputePartition"}, {amd::smi::kDevComputePartition, amdSMI + "kDevComputePartition"}, {amd::smi::kDevMemoryPartition, amdSMI + "kDevMemoryPartition"} }; diff --git a/src/rocm_smi_utils.cc b/src/rocm_smi_utils.cc index 3a2443ddd4..1d22d5d5c7 100755 --- a/src/rocm_smi_utils.cc +++ b/src/rocm_smi_utils.cc @@ -390,7 +390,7 @@ std::string readTemporaryFile(std::string path) { return fileContent; } -// Used to debug application temporary files (idenified by kTmpFilePrefix) +// Used to debug application temporary files (identified by kTmpFilePrefix) // and their content void displayAppTmpFilesContent() { std::vector tmpFiles = getListOfAppTmpFiles(); diff --git a/tests/rocm_smi_test/functional/computepartition_read_write.cc b/tests/rocm_smi_test/functional/computepartition_read_write.cc index 660866df68..1263669cb8 100755 --- a/tests/rocm_smi_test/functional/computepartition_read_write.cc +++ b/tests/rocm_smi_test/functional/computepartition_read_write.cc @@ -174,8 +174,8 @@ void TestComputePartitionReadWrite::Run(void) { IF_VERB(STANDARD) { if (err == RSMI_STATUS_INSUFFICIENT_SIZE) { std::cout << "\t**" - << "Confirmed RSMI_STATUS_INSUFFICIENT_SIZE was returned " - << "and size matches length requested." << std::endl; + << "Confirmed RSMI_STATUS_INSUFFICIENT_SIZE was returned" + << "\n\t and size matches length requested." << std::endl; } } @@ -275,19 +275,48 @@ void TestComputePartitionReadWrite::Run(void) { << computePartitionString(new_computePartition) << std::endl; } ret = rsmi_dev_compute_partition_set(dv_ind, new_computePartition); - CHK_ERR_ASRT(ret) - ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition, - 255); - CHK_ERR_ASRT(ret) + bool isSettingUnavailable = false; + ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) || + (ret == RSMI_STATUS_SETTING_UNAVAILABLE)); + if (ret == RSMI_STATUS_SETTING_UNAVAILABLE) { + isSettingUnavailable = true; + } + rsmi_status_t retGet = + rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition, + 255); + CHK_ERR_ASRT(retGet) IF_VERB(STANDARD) { std::cout << "\t**" << "Current compute partition: " << current_char_computePartition << std::endl; } - EXPECT_EQ(RSMI_STATUS_SUCCESS, ret); - EXPECT_STREQ(computePartitionString(new_computePartition).c_str(), - current_char_computePartition); + if (isSettingUnavailable) { + ASSERT_EQ(RSMI_STATUS_SETTING_UNAVAILABLE, ret); + ASSERT_STRNE(computePartitionString(new_computePartition).c_str(), + current_char_computePartition); + IF_VERB(STANDARD) { + std::cout << "\t**" + << "Confirmed after receiving " + << "RSMI_STATUS_SETTING_UNAVAILABLE,\n\t current compute " + << "partition (" << current_char_computePartition + << ") did not update to (" + << computePartitionString(new_computePartition) << ")" + << std::endl; + } + } else { + ASSERT_EQ(RSMI_STATUS_SUCCESS, ret); + ASSERT_STREQ(computePartitionString(new_computePartition).c_str(), + current_char_computePartition); + IF_VERB(STANDARD) { + std::cout << "\t**" + << "Confirmed current compute partition (" + << current_char_computePartition << ") matches" + << "\n\t requested compute partition (" + << computePartitionString(new_computePartition) << ")" + << std::endl; + } + } } /* TEST RETURN TO BOOT COMPUTE PARTITION SETTING */ @@ -309,15 +338,15 @@ void TestComputePartitionReadWrite::Run(void) { 255); CHK_ERR_ASRT(ret) IF_VERB(STANDARD) { - std::cout << "\t**" - << "Current compute partition: " << current_char_computePartition << std::endl; + std::cout << "\t**" << "Current compute partition: " + << current_char_computePartition << std::endl; } if (wasResetSuccess) { ASSERT_STRNE(oldPartition.c_str(), current_char_computePartition); IF_VERB(STANDARD) { std::cout << "\t**" << "Confirmed prior partition (" << oldPartition << ") is not " - << "equal to current partition (" + << "equal to current\n\t partition (" << current_char_computePartition << ")" << std::endl; } } else { @@ -325,16 +354,16 @@ void TestComputePartitionReadWrite::Run(void) { IF_VERB(STANDARD) { std::cout << "\t**" << "Confirmed prior partition (" << oldPartition << ") is equal" - << " to current partition (" + << " to current\n\t partition (" << current_char_computePartition << ")" << std::endl; } } - /* TEST RETURN TO ORIGINAL COMPUTE PARTITIONING SETTING */ + /* TEST RETURN TO ORIGINAL COMPUTE PARTITION SETTING */ IF_VERB(STANDARD) { std::cout << std::endl; std::cout << "\t**" - << "=========== TEST RETURN TO ORIGINAL COMPUTE PARTITIONING " + << "=========== TEST RETURN TO ORIGINAL COMPUTE PARTITION " << "SETTING ========" << std::endl; } new_computePartition @@ -351,8 +380,8 @@ void TestComputePartitionReadWrite::Run(void) { IF_VERB(STANDARD) { std::cout << "\t**" << "Attempted to set compute partition: " << computePartitionString(new_computePartition) << std::endl - << "\t**" - << "Current compute partition: " << current_char_computePartition + << "\t**" << "Current compute partition: " + << current_char_computePartition << std::endl; } EXPECT_EQ(RSMI_STATUS_SUCCESS, ret);