From 78a0812f7fa789cb77f8483bb98bedd2b9aaa949 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Wed, 5 Apr 2023 12:44:29 -0500 Subject: [PATCH] [SWDEV-391036 + SWDEV-392933] Fixes for VoltRead and ComputePart. Updates: * VoltRead - now properly returns RSMI_STATUS_NOT_SUPPORTED when the device does not have voltage hwmon files * ComputePart - test failure was likely caused by a conflict with EvtNotif (exact cause unknown). The test passes when it is moved ahead of the event notifier test. Both API calls may have a system resource issue, TBD. * rocm_smi_example - now indicates when an API call returns RSMI_STATUS_NOT_SUPPORTED or RSMI_STATUS_NOT_YET_IMPLEMENTED. This allows the example to fully complete on systems which may not provide support for all API calls. Change-Id: I520b8584e078d412414e8e5797c664220a7e823a Signed-off-by: Charis Poag --- rocm_smi/example/rocm_smi_example.cc | 142 ++++++++++++++------ src/rocm_smi.cc | 9 +- tests/rocm_smi_test/functional/volt_read.cc | 11 +- tests/rocm_smi_test/main.cc | 8 +- 4 files changed, 120 insertions(+), 50 deletions(-) diff --git a/rocm_smi/example/rocm_smi_example.cc b/rocm_smi/example/rocm_smi_example.cc index 3390fb573e..bb456f7a0e 100755 --- a/rocm_smi/example/rocm_smi_example.cc +++ b/rocm_smi/example/rocm_smi_example.cc @@ -93,9 +93,25 @@ } \ } +#define CHK_FILE_PERMISSIONS_AND_NOT_SUPPORTED_OR_UNIMPLEMENTED(RET) { \ + if ((RET) == RSMI_STATUS_PERMISSION) { \ + if (isFileWritable(RET)) { \ + CHK_RSMI_RET(RET) \ + } \ + } else if ((RET) == RSMI_STATUS_NOT_SUPPORTED) { \ + std::cout << "Not Supported." \ + << std::endl; \ + } else if ((RET) == RSMI_STATUS_NOT_YET_IMPLEMENTED) { \ + std::cout << "Not Yet Implemented." \ + << std::endl; \ + } else { \ + CHK_RSMI_RET(RET) \ + } \ +} + #define CHK_RSMI_NOT_SUPPORTED_RET(RET) { \ if ((RET) == RSMI_STATUS_NOT_SUPPORTED) { \ - std::cout << "This function is not supported in the current environment." \ + std::cout << "Not Supported." 
\ << std::endl; \ } else { \ CHK_RSMI_RET(RET) \ @@ -104,7 +120,7 @@ #define CHK_RSMI_NOT_SUPPORTED_OR_UNEXPECTED_DATA_RET(RET) { \ if ((RET) == RSMI_STATUS_NOT_SUPPORTED) { \ - std::cout << "This function is not supported in the current environment." \ + std::cout << "Not Supported." \ << std::endl; \ } else if ((RET) == RSMI_STATUS_UNEXPECTED_DATA) { \ std::cout << "[ERROR] RSMI_STATUS_UNEXPECTED_DATA retrieved." \ @@ -116,7 +132,7 @@ #define CHK_RSMI_NOT_SUPPORTED_OR_SETTING_UNAVAILABLE_RET(RET) {\ if ((RET) == RSMI_STATUS_NOT_SUPPORTED) { \ - std::cout << "This function is not supported in the current environment."\ + std::cout << "Not Supported."\ << std::endl; \ } else if ((RET) == RSMI_STATUS_SETTING_UNAVAILABLE) { \ std::cout << "[WARN] RSMI_STATUS_SETTING_UNAVAILABLE retrieved." \ @@ -128,7 +144,7 @@ #define CHK_NOT_SUPPORTED_OR_UNEXPECTED_DATA_OR_INSUFFICIENT_SIZE_RET(RET) { \ if ((RET) == RSMI_STATUS_NOT_SUPPORTED) { \ - std::cout << "This function is not supported in the current environment." \ + std::cout << "Not Supported." \ << std::endl; \ } else if ((RET) == RSMI_STATUS_UNEXPECTED_DATA) { \ std::cout << "[WARN] RSMI_STATUS_UNEXPECTED_DATA retrieved." \ @@ -272,7 +288,13 @@ static rsmi_status_t test_power_profile(uint32_t dv_ind) { print_test_header("Power Profile", dv_ind); + std::cout << "The available power profiles are: "; ret = rsmi_dev_power_profile_presets_get(dv_ind, 0, &status); + CHK_RSMI_NOT_SUPPORTED_RET(ret) + if (ret != RSMI_STATUS_SUCCESS) { + std::cout << "***Skipping Power Profile test." 
<< std::endl; + return RSMI_STATUS_SUCCESS; + } CHK_RSMI_RET(ret) std::cout << "The available power profiles are:" << std::endl; @@ -393,13 +415,13 @@ static rsmi_status_t test_set_overdrive(uint32_t dv_ind) { CHK_RSMI_RET(ret) ret = rsmi_dev_overdrive_level_get(dv_ind, &val); CHK_RSMI_RET(ret) - std::cout << "\t**New OverDrive Level:" << val << std::endl; + std::cout << "\t**New OverDrive Level:" << std::dec << val << std::endl; std::cout << "Reset Overdrive level to 0%..." << std::endl; ret = rsmi_dev_overdrive_level_set_v1(dv_ind, 0); CHK_RSMI_RET(ret) ret = rsmi_dev_overdrive_level_get(dv_ind, &val); CHK_RSMI_RET(ret) - std::cout << "\t**New OverDrive Level:" << val << std::endl; + std::cout << "\t**New OverDrive Level:" << std::dec << val << std::endl; return ret; } @@ -412,9 +434,15 @@ static rsmi_status_t test_set_fan_speed(uint32_t dv_ind) { print_test_header("Fan Speed Control", dv_ind); + std::cout << "Original fan speed: "; ret = rsmi_dev_fan_speed_get(dv_ind, 0, &orig_speed); - CHK_RSMI_RET(ret) - std::cout << "Original fan speed: " << orig_speed << std::endl; + if (ret == RSMI_STATUS_SUCCESS) { + std::cout << orig_speed << std::endl; + } else { + CHK_RSMI_NOT_SUPPORTED_RET(ret) + std::cout << "***Skipping Fan Speed Control test." << std::endl; + return RSMI_STATUS_SUCCESS; + } if (orig_speed == 0) { std::cout << "***System fan speed value is 0. Skip fan test." << std::endl; @@ -474,6 +502,11 @@ static rsmi_status_t test_set_perf_level(uint32_t dv_ind) { std::cout << "Set Performance Level to " << (uint32_t)pfl << " ..." << std::endl; ret = rsmi_dev_perf_level_set_v1(dv_ind, pfl); + if (ret != RSMI_STATUS_SUCCESS) { + CHK_RSMI_NOT_SUPPORTED_RET(ret) + std::cout << "***Skipping Performance Level Control test." 
<< std::endl; + return RSMI_STATUS_SUCCESS; + } CHK_RSMI_RET(ret) ret = rsmi_dev_perf_level_get(dv_ind, &pfl); CHK_RSMI_RET(ret) @@ -505,7 +538,7 @@ static rsmi_status_t test_set_freq(uint32_t dv_ind) { rsmi_clk = (rsmi_clk_type)clk; ret = rsmi_dev_gpu_clk_freq_get(dv_ind, rsmi_clk, &f); - CHK_AND_PRINT_RSMI_ERR_RET(ret) + CHK_FILE_PERMISSIONS_AND_NOT_SUPPORTED_OR_UNIMPLEMENTED(ret) std::cout << "Initial frequency for clock" << rsmi_clk << " is " << f.current << std::endl; @@ -524,15 +557,15 @@ static rsmi_status_t test_set_freq(uint32_t dv_ind) { " to 0b" << freq_bm_str << " ..." << std::endl; ret = rsmi_dev_gpu_clk_freq_set(dv_ind, rsmi_clk, freq_bitmask); - CHK_FILE_PERMISSIONS(ret) + CHK_FILE_PERMISSIONS_AND_NOT_SUPPORTED_OR_UNIMPLEMENTED(ret) ret = rsmi_dev_gpu_clk_freq_get(dv_ind, rsmi_clk, &f); - CHK_FILE_PERMISSIONS(ret) + CHK_FILE_PERMISSIONS_AND_NOT_SUPPORTED_OR_UNIMPLEMENTED(ret) std::cout << "Frequency is now index " << f.current << std::endl; std::cout << "Resetting mask to all frequencies." << std::endl; ret = rsmi_dev_gpu_clk_freq_set(dv_ind, rsmi_clk, 0xFFFFFFFF); - CHK_FILE_PERMISSIONS(ret) + CHK_FILE_PERMISSIONS_AND_NOT_SUPPORTED_OR_UNIMPLEMENTED(ret) ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_AUTO); CHK_FILE_PERMISSIONS(ret) @@ -684,30 +717,38 @@ int main() { for (uint32_t i = 0; i < num_monitor_devs; ++i) { ret = rsmi_dev_id_get(i, &val_ui16); CHK_RSMI_RET_I(ret) - std::cout << "\t**Device ID: 0x" << std::hex << val_ui64 << std::endl; + std::cout << "\t**Device ID: 0x" << std::hex << val_ui16 << std::endl; char current_compute_partition[256]; current_compute_partition[0] = '\0'; ret = rsmi_dev_compute_partition_get(i, current_compute_partition, 256); - CHK_RSMI_NOT_SUPPORTED_OR_UNEXPECTED_DATA_RET(ret) std::cout << "\t**Current Compute Partition: " << (((current_compute_partition == nullptr) || ((current_compute_partition != nullptr) && (current_compute_partition[0] == '\0'))) - ? 
"UNKNOWN" : current_compute_partition) - << std::endl; + ? "UNKNOWN" : current_compute_partition); + if (ret != RSMI_STATUS_SUCCESS) { + std::cout << ", RSMI_STATUS = "; + } else { + std::cout << std::endl; + } + CHK_RSMI_NOT_SUPPORTED_OR_UNEXPECTED_DATA_RET(ret) uint32_t len = 5; char nps_mode[len]; nps_mode[0] = '\0'; ret = rsmi_dev_nps_mode_get(i, nps_mode, len); - CHK_NOT_SUPPORTED_OR_UNEXPECTED_DATA_OR_INSUFFICIENT_SIZE_RET(ret) std::cout << "\t**NPS Mode: " << (((nps_mode == nullptr) || ((nps_mode != nullptr) && (nps_mode[0] == '\0'))) - ? "UNKNOWN" : nps_mode) - << std::endl; + ? "UNKNOWN" : nps_mode); + if (ret != RSMI_STATUS_SUCCESS) { + std::cout << ", RSMI_STATUS = "; + } else { + std::cout << std::endl; + } + CHK_NOT_SUPPORTED_OR_UNEXPECTED_DATA_OR_INSUFFICIENT_SIZE_RET(ret) ret = rsmi_dev_gpu_metrics_info_get(i, &p); CHK_AND_PRINT_RSMI_ERR_RET(ret) @@ -733,47 +774,66 @@ int main() { std::cout << f.num_supported << std::endl; print_frequencies(&f); + std::cout << "\t**Monitor name: "; char name[128]; ret = rsmi_dev_name_get(i, name, 128); CHK_AND_PRINT_RSMI_ERR_RET(ret) - std::cout << "\t**Monitor name: " << name << std::endl; + std::cout << name << std::endl; + std::cout << "\t**Temperature: "; ret = rsmi_dev_temp_metric_get(i, 0, RSMI_TEMP_CURRENT, &val_i64); - CHK_AND_PRINT_RSMI_ERR_RET(ret) - std::cout << "\t**Temperature: " << val_i64/1000 << "C" << std::endl; + if (ret == RSMI_STATUS_SUCCESS) { + std::cout << val_i64/1000 << "C" << std::endl; + } + CHK_RSMI_NOT_SUPPORTED_RET(ret) + std::cout << "\t**Voltage: "; ret = rsmi_dev_volt_metric_get(i, RSMI_VOLT_TYPE_VDDGFX, RSMI_VOLT_CURRENT, &val_i64); - CHK_AND_PRINT_RSMI_ERR_RET(ret) - std::cout << "\t**Voltage: " << val_i64 << "mV" << std::endl; + if (ret == RSMI_STATUS_SUCCESS) { + std::cout << val_i64 << "mV" << std::endl; + } + CHK_RSMI_NOT_SUPPORTED_RET(ret) - ret = rsmi_dev_fan_speed_get(i, 0, &val_i64); - CHK_AND_PRINT_RSMI_ERR_RET(ret) - ret = rsmi_dev_fan_speed_max_get(i, 0, &val_ui64); - 
CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "\t**Current Fan Speed: "; - std::cout << val_i64/static_cast(val_ui64)*100; - std::cout << "% ("<< val_i64 << "/" << val_ui64 << ")" << std::endl; + ret = rsmi_dev_fan_speed_get(i, 0, &val_i64); + if (ret == RSMI_STATUS_SUCCESS) { + ret = rsmi_dev_fan_speed_max_get(i, 0, &val_ui64); + CHK_AND_PRINT_RSMI_ERR_RET(ret) + std::cout << (static_cast(val_i64)/val_ui64) * 100; + std::cout << "% (" << std::dec << val_i64 << "/" + << std::dec << val_ui64 << ")" << std::endl; + } + CHK_RSMI_NOT_SUPPORTED_RET(ret) + std::cout << "\t**Current fan RPMs: "; ret = rsmi_dev_fan_rpms_get(i, 0, &val_i64); - CHK_AND_PRINT_RSMI_ERR_RET(ret) - std::cout << "\t**Current fan RPMs: " << val_i64 << std::endl; + if (ret == RSMI_STATUS_SUCCESS) { + std::cout << std::dec << val_i64 << std::endl; + } + CHK_RSMI_NOT_SUPPORTED_RET(ret) + std::cout << "\t**Current Power Cap: "; ret = rsmi_dev_power_cap_get(i, 0, &val_ui64); - CHK_AND_PRINT_RSMI_ERR_RET(ret) - std::cout << "\t**Current Power Cap: " << val_ui64 << "uW" <(val_ui64)/1000 << " W" << std::endl; ret = rsmi_dev_power_ave_get(i, 0, &val_ui64); - CHK_AND_PRINT_RSMI_ERR_RET(ret) + if (ret == RSMI_STATUS_SUCCESS) { + std::cout << static_cast(val_ui64)/1000 << " W" << std::endl; + } + CHK_RSMI_NOT_SUPPORTED_RET(ret) std::cout << "\t=======" << std::endl; } diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 8c3b710ee7..c699701ddf 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -2351,8 +2351,13 @@ rsmi_dev_volt_metric_get(uint32_t dv_ind, rsmi_voltage_type_t sensor_type, // getVoltSensorIndex will throw an out of range exception if sensor_type is // not found - uint32_t sensor_index = - m->getVoltSensorIndex(sensor_type); + uint32_t sensor_index; + try { + sensor_index = + m->getVoltSensorIndex(sensor_type); + } catch (...) 
{ + return RSMI_STATUS_NOT_SUPPORTED; + } CHK_API_SUPPORT_ONLY(voltage, metric, sensor_index) ret = get_dev_mon_value(mon_type, dv_ind, sensor_index, voltage); diff --git a/tests/rocm_smi_test/functional/volt_read.cc b/tests/rocm_smi_test/functional/volt_read.cc index a8be24e03f..fec4975647 100644 --- a/tests/rocm_smi_test/functional/volt_read.cc +++ b/tests/rocm_smi_test/functional/volt_read.cc @@ -100,6 +100,11 @@ void TestVoltRead::Run(void) { rsmi_voltage_type_t type = RSMI_VOLT_TYPE_VDDGFX; for (uint32_t i = 0; i < num_monitor_devs(); ++i) { + IF_VERB(STANDARD) { + if (i != 0) { + std::cout << "\n" << std::endl; + } + } PrintDeviceHeader(i); auto print_volt_metric = [&](rsmi_voltage_metric_t met, @@ -111,12 +116,12 @@ void TestVoltRead::Run(void) { IF_VERB(STANDARD) { std::cout << "\t**" << label << ": " << "Not supported on this machine" << std::endl; + } // Verify api support checking functionality is working err = rsmi_dev_volt_metric_get(i, type, met, nullptr); ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED); return; - } } else { CHK_ERR_ASRT(err) } @@ -144,8 +149,8 @@ void TestVoltRead::Run(void) { print_volt_metric(RSMI_VOLT_MIN_CRIT, "Voltage critical min value"); print_volt_metric(RSMI_VOLT_AVERAGE, "Voltage critical max value"); - print_volt_metric(RSMI_VOLT_LOWEST, "Historical minimum temperature"); - print_volt_metric(RSMI_VOLT_HIGHEST, "Historical maximum temperature"); + print_volt_metric(RSMI_VOLT_LOWEST, "Historical minimum voltage"); + print_volt_metric(RSMI_VOLT_HIGHEST, "Historical maximum voltage"); } } } diff --git a/tests/rocm_smi_test/main.cc b/tests/rocm_smi_test/main.cc index e761ac4fd0..bde4539115 100755 --- a/tests/rocm_smi_test/main.cc +++ b/tests/rocm_smi_test/main.cc @@ -270,10 +270,6 @@ TEST(rsmitstReadOnly, TestMutualExclusion) { tst.Run(); RunCustomTestEpilog(&tst); } -TEST(rsmitstReadWrite, TestEvtNotifReadWrite) { - TestEvtNotifReadWrite tst; - RunGenericTest(&tst); -} TEST(rsmitstReadWrite, TestComputePartitionReadWrite) { 
TestComputePartitionReadWrite tst; RunGenericTest(&tst); @@ -282,6 +278,10 @@ TEST(rsmitstReadWrite, TestNPSModeReadWrite) { TestNPSModeReadWrite tst; RunGenericTest(&tst); } +TEST(rsmitstReadWrite, TestEvtNotifReadWrite) { + TestEvtNotifReadWrite tst; + RunGenericTest(&tst); +} TEST(rsmitstReadOnly, Test) { TestConcurrentInit tst; SetFlags(&tst);