From 6a356e7bb13ffa9a85ed8f5839e3147ed57c9ca5 Mon Sep 17 00:00:00 2001 From: "Pryor, Adam" Date: Mon, 21 Jul 2025 15:05:49 -0500 Subject: [PATCH 1/4] [SWDEV-541958] Fix config (#217) * [SWDEV-541958] Fix config Change-Id: I6703821747ade5adb993ab7f386f3658db8a3357 * fixes Change-Id: I0a1c7d96452d9b2ccb6401b77d73398a67518e91 --- include/rdc_lib/impl/RdcConfigSettingsImpl.h | 1 + rdc_libs/rdc/src/RdcConfigSettingsImpl.cc | 96 +++++++++++++------- 2 files changed, 62 insertions(+), 35 deletions(-) diff --git a/include/rdc_lib/impl/RdcConfigSettingsImpl.h b/include/rdc_lib/impl/RdcConfigSettingsImpl.h index a58017edb4..f492bdf77b 100644 --- a/include/rdc_lib/impl/RdcConfigSettingsImpl.h +++ b/include/rdc_lib/impl/RdcConfigSettingsImpl.h @@ -64,6 +64,7 @@ class RdcConfigSettingsImpl : public RdcConfigSettings { void monitorSettings(); uint64_t wattsToMicrowatts(uint64_t watts) const; uint64_t microwattsToWatts(int microwatts) const; + uint64_t mHzToHz(uint64_t mhz) const; rdc_status_t get_group_info(rdc_gpu_group_t group_id, rdc_group_info_t* rdc_group_info); }; diff --git a/rdc_libs/rdc/src/RdcConfigSettingsImpl.cc b/rdc_libs/rdc/src/RdcConfigSettingsImpl.cc index a126884ea7..ba763bd485 100644 --- a/rdc_libs/rdc/src/RdcConfigSettingsImpl.cc +++ b/rdc_libs/rdc/src/RdcConfigSettingsImpl.cc @@ -44,7 +44,7 @@ void RdcConfigSettingsImpl::monitorSettings() { rdc_status_t rdc_status; rdc_group_info_t rdc_group_info = {}; amdsmi_power_cap_info_t cap_info = {}; - amdsmi_frequencies_t freqs = {}; + amdsmi_clk_info_t info = {}; uint64_t cached_value; while (true) { @@ -102,18 +102,18 @@ void RdcConfigSettingsImpl::monitorSettings() { } // Mem clock - status = amdsmi_get_clk_freq(processor_handle, AMDSMI_CLK_TYPE_MEM, &freqs); + status = amdsmi_get_clock_info(processor_handle, AMDSMI_CLK_TYPE_MEM, &info); if (status != AMDSMI_STATUS_SUCCESS) { - RDC_LOG( - RDC_ERROR, - "RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_clk_freq failed: " << status); + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_clk_freq for mem failed: " + << status); continue; } auto mem_clk_it = cached_settings.find(RDC_CFG_MEMORY_CLOCK_LIMIT); if (mem_clk_it != cached_settings.end()) { cached_value = mem_clk_it->second.target_value; - if (freqs.frequency[freqs.current] == cached_value) { + if (info.max_clk != cached_value) { status = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_MEM, CLK_LIMIT_MAX, cached_value); if (status != AMDSMI_STATUS_SUCCESS) { @@ -127,18 +127,18 @@ void RdcConfigSettingsImpl::monitorSettings() { } // GFX clock - status = amdsmi_get_clk_freq(processor_handle, AMDSMI_CLK_TYPE_GFX, &freqs); + status = amdsmi_get_clock_info(processor_handle, AMDSMI_CLK_TYPE_GFX, &info); if (status != AMDSMI_STATUS_SUCCESS) { - RDC_LOG( - RDC_ERROR, - "RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_clk_freq failed: " << status); + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_clk_freq for gfx failed: " + << status); continue; } auto gfx_clk_it = cached_settings.find(RDC_CFG_GFX_CLOCK_LIMIT); if (gfx_clk_it != cached_settings.end()) { cached_value = gfx_clk_it->second.target_value; - if (freqs.frequency[freqs.current] == cached_value) { + if (info.max_clk != cached_value) { status = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_GFX, CLK_LIMIT_MAX, cached_value); if (status != AMDSMI_STATUS_SUCCESS) { @@ -165,6 +165,8 @@ uint64_t RdcConfigSettingsImpl::microwattsToWatts(int microwatts) const { return microwatts / 1'000'000; } +uint64_t RdcConfigSettingsImpl::mHzToHz(uint64_t mhz) const { return mhz * 1000000ULL; } + rdc_status_t RdcConfigSettingsImpl::get_group_info(rdc_gpu_group_t group_id, rdc_group_info_t* rdc_group_info) { rdc_status_t status = group_settings_->rdc_group_gpu_get_info(group_id, rdc_group_info); @@ -312,19 +314,31 @@ rdc_status_t RdcConfigSettingsImpl::rdc_config_clear(rdc_gpu_group_t group_id) { // Reset GFX clock limit if it was set if (group_iter->second.find(RDC_CFG_GFX_CLOCK_LIMIT) != group_iter->second.end()) { amdsmi_frequencies_t freqs = {}; + amdsmi_clk_info_t info = {}; + amd_ret = amdsmi_get_clk_freq(processor_handle, AMDSMI_CLK_TYPE_GFX, &freqs); - if (amd_ret == AMDSMI_STATUS_SUCCESS) { - uint64_t curr = freqs.frequency[freqs.current]; - uint64_t maxf = freqs.frequency[freqs.num_supported - 1]; - if (curr != maxf) { - amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_GFX, CLK_LIMIT_MAX, - AMDSMI_DEV_PERF_LEVEL_AUTO); - if (amd_ret != AMDSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, - "RdcConfigSettingsImpl::rdc_config_clear: Failed to reset GFX clock limit : " - << amd_ret); - break; - } + if (amd_ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::rdc_config_clear: Failed to get GFX freq: " << amd_ret); + break; + } + + amd_ret = amdsmi_get_clock_info(processor_handle, AMDSMI_CLK_TYPE_GFX, &info); + if (amd_ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::rdc_config_clear: Failed to get GFX info: " << amd_ret); + break; + } + + uint64_t curr = mHzToHz(info.max_clk); + uint64_t maxf = freqs.frequency[freqs.num_supported - 1]; + if (curr != maxf) { + amd_ret = amdsmi_set_gpu_perf_level(processor_handle, AMDSMI_DEV_PERF_LEVEL_AUTO); + if (amd_ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::rdc_config_clear: Failed to reset GFX clock limit : " + << amd_ret); + break; } } } @@ -332,19 +346,31 @@ rdc_status_t RdcConfigSettingsImpl::rdc_config_clear(rdc_gpu_group_t group_id) { // Reset memory clock limit if it was set if (group_iter->second.find(RDC_CFG_MEMORY_CLOCK_LIMIT) != group_iter->second.end()) { amdsmi_frequencies_t freqs = {}; + amdsmi_clk_info_t info = {}; + amd_ret = amdsmi_get_clk_freq(processor_handle, AMDSMI_CLK_TYPE_MEM, &freqs); - if (amd_ret == AMDSMI_STATUS_SUCCESS) { - uint64_t curr = freqs.frequency[freqs.current]; - uint64_t maxf = freqs.frequency[freqs.num_supported - 1]; - if (curr != maxf) { - amd_ret = - amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_MEM, CLK_LIMIT_MAX, 0); - if (amd_ret != AMDSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, - "RdcConfigSettingsImpl::rdc_config_clear: Failed to reset memory clock limit:" - << amd_ret); - break; - } + if (amd_ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::rdc_config_clear: Failed to get MEM freq: " << amd_ret); + break; + } + + amd_ret = amdsmi_get_clock_info(processor_handle, AMDSMI_CLK_TYPE_MEM, &info); + if (amd_ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::rdc_config_clear: Failed to get MEM info: " << amd_ret); + break; + } + + uint64_t curr = mHzToHz(info.max_clk); + uint64_t maxf = freqs.frequency[freqs.num_supported - 1]; + if (curr != maxf) { + amd_ret = amdsmi_set_gpu_perf_level(processor_handle, AMDSMI_DEV_PERF_LEVEL_AUTO); + if (amd_ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::rdc_config_clear: Failed to reset memory clock limit:" + << amd_ret); + break; } } } From 059451d48f8c80cdc5e94959b76904f5473be8c9 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Tue, 22 Jul 2025 14:41:37 -0500 Subject: [PATCH 2/4] Profiler - Remove UUID metric Signed-off-by: Galantsev, Dmitrii --- common/rdc_field.data | 1 - include/rdc/rdc.h | 1 - rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc | 25 +------------------- 3 files changed, 1 insertion(+), 26 deletions(-) diff --git a/common/rdc_field.data b/common/rdc_field.data index 6e447974ea..8c8603d16b 100644 --- a/common/rdc_field.data +++ b/common/rdc_field.data @@ -195,7 +195,6 @@ FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "", "CPF_CPF_TCIU_I FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_STALL, "", "CPF_CPF_TCIU_STALL", false) // Misc FLD_DESC_ENT(RDC_FI_PROF_SIMD_UTILIZATION, "Fraction of time the SIMDs are being utilized", "SIMD_UTILIZATION", false) -FLD_DESC_ENT(RDC_FI_PROF_UUID, "UUID from rocprofiler", "PROF_UUID", true) FLD_DESC_ENT(RDC_FI_PROF_KFD_ID, "GPU_ID from rocprofiler, same as KFD_ID", "PROF_KFD_ID", true) // Events diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index 51569189ce..60383f3cc3 100644 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ -345,7 +345,6 @@ typedef enum { RDC_FI_PROF_CPF_CPF_TCIU_IDLE, RDC_FI_PROF_CPF_CPF_TCIU_STALL, RDC_FI_PROF_SIMD_UTILIZATION, - RDC_FI_PROF_UUID, RDC_FI_PROF_KFD_ID, /** diff --git a/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc b/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc index 658b26b4d4..1576f0f164 100644 --- a/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc +++ b/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc @@ -110,7 +110,6 @@ static const std::map temp_field_map_k = { {RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "CPF_CPF_TCIU_IDLE"}, {RDC_FI_PROF_CPF_CPF_TCIU_STALL, "CPF_CPF_TCIU_STALL"}, {RDC_FI_PROF_SIMD_UTILIZATION, "SIMD_UTILIZATION"}, - {RDC_FI_PROF_UUID, "SQ_WAVES"}, // dummy value, {RDC_FI_PROF_KFD_ID, "SQ_WAVES"}, // dummy value, }; @@ -164,21 +163,6 @@ const std::vector RdcRocpBase::get_field_ids() { return field_ids; } -rocprofiler_uuid_t asic_serial_to_uuid(const char* asic_serial) { - rocprofiler_uuid_t uuid = {0}; - // have to cast to stoull as a workaround for amdsmi ignoring leading zeroes - uuid.value = std::stoull(asic_serial, nullptr, 16); - return uuid; -} - -std::string uuid_to_string(const uint64_t uuid) { - std::ostringstream oss; - oss << "0x" << std::hex << std::setw(16) << std::setfill('0') << uuid; - return oss.str(); -} - -std::string uuid_to_string(const rocprofiler_uuid_t& uuid) { return uuid_to_string(uuid.value); } - rdc_status_t RdcRocpBase::map_entity_to_profiler() { // std::map entity_to_index_map; // kfd_id_t is only used inside this function @@ -419,15 +403,8 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value // FLOPS/clock/CU data->dbl = divided_dbl / (256.0F / static_cast(agents[agent_index].simd_per_cu)); break; - case RDC_FI_PROF_UUID: { - // do not care what RDC_FI_PROF_UUID is mapped to. read value from agents - *type = STRING; - std::string uuid_str = uuid_to_string(agents[agent_index].uuid); - strncpy_with_null(data->str, uuid_str.c_str(), uuid_str.length()); - break; - } case RDC_FI_PROF_KFD_ID: { - // do not care what RDC_FI_PROF_UUID is mapped to. read value from agents + // do not care what it is mapped to. read value from agents *type = INTEGER; data->l_int = agents[agent_index].gpu_id; break; From 3f2f92a37a43013c777ed1ddb6bf71e0f58cedda Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Tue, 22 Jul 2025 15:54:01 -0500 Subject: [PATCH 3/4] RVS - Fix iet_stress by disabling logging Signed-off-by: Galantsev, Dmitrii --- cmake_modules/Findrvs.cmake | 2 ++ .../rdc_modules/rdc_rvs/RdcDiagnosticLib.cc | 19 ++++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/cmake_modules/Findrvs.cmake b/cmake_modules/Findrvs.cmake index 981e659a36..d2cbc7419c 100644 --- a/cmake_modules/Findrvs.cmake +++ b/cmake_modules/Findrvs.cmake @@ -51,6 +51,7 @@ if(${NAME}_FOUND AND NOT TARGET ${NAME}::${NAME}) find_package(hipblaslt REQUIRED) find_package(hsakmt REQUIRED) find_package(hip REQUIRED) + find_package(hiprand REQUIRED) find_package(hsa-runtime64 REQUIRED) find_package(amd_smi REQUIRED) target_link_libraries( @@ -62,6 +63,7 @@ if(${NAME}_FOUND AND NOT TARGET ${NAME}::${NAME}) roc::hipblaslt hsakmt::hsakmt hip::amdhip64 + hip::hiprand hsa-runtime64::hsa-runtime64 amd_smi ) diff --git a/rdc_libs/rdc_modules/rdc_rvs/RdcDiagnosticLib.cc b/rdc_libs/rdc_modules/rdc_rvs/RdcDiagnosticLib.cc index 3cdd37b7e7..908ff80aa5 100644 --- a/rdc_libs/rdc_modules/rdc_rvs/RdcDiagnosticLib.cc +++ b/rdc_libs/rdc_modules/rdc_rvs/RdcDiagnosticLib.cc @@ -119,11 +119,9 @@ rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case, case RDC_DIAG_RVS_GST_TEST: case RDC_DIAG_RVS_MEMBW_TEST: case RDC_DIAG_RVS_H2DD2H_TEST: - case RDC_DIAG_RVS_IET_TEST: case RDC_DIAG_RVS_GST_LONG_TEST: case RDC_DIAG_RVS_MEMBW_LONG_TEST: - case RDC_DIAG_RVS_H2DD2H_LONG_TEST: - case RDC_DIAG_RVS_IET_LONG_TEST: { + case RDC_DIAG_RVS_H2DD2H_LONG_TEST: { const std::string test_name = "Finished running " + test_to_name.at(test_case); if (test_to_conf.find(test_case) == test_to_conf.end()) { RDC_LOG(RDC_ERROR, "cannot find test " << test_to_name.at(test_case)); @@ -135,6 +133,21 @@ rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case, rvs_status = rvs_p->run_rvs_app(predefined_config.c_str(), predefined_config.length() + 1, callback); break; + } + // IET tests don't work with callback, give it nullptr + case RDC_DIAG_RVS_IET_TEST: + case RDC_DIAG_RVS_IET_LONG_TEST: { + const std::string test_name = "Finished running " + test_to_name.at(test_case); + if (test_to_conf.find(test_case) == test_to_conf.end()) { + RDC_LOG(RDC_ERROR, "cannot find test " << test_to_name.at(test_case)); + return RDC_ST_NOT_FOUND; + } + const std::string predefined_config = test_to_conf.at(test_case); + // +1 to copy null + strncpy_with_null(result->info, test_name.c_str(), test_name.length() + 1); + rvs_status = + rvs_p->run_rvs_app(predefined_config.c_str(), predefined_config.length() + 1, nullptr); + break; } case RDC_DIAG_RVS_CUSTOM: RDC_LOG(RDC_ERROR, "custom config cannot be bundled with other tests!"); From 8f3a2326136caefa935876155eddb61177ad362c Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Wed, 23 Jul 2025 22:49:19 -0500 Subject: [PATCH 4/4] Profiler - Update counter definitions to match changed api Signed-off-by: Galantsev, Dmitrii --- include/rdc_modules/rdc_rocp/RdcRocpCounterSampler.h | 2 +- rdc_libs/rdc_modules/rdc_rocp/RdcRocpCounterSampler.cc | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/rdc_modules/rdc_rocp/RdcRocpCounterSampler.h b/include/rdc_modules/rdc_rocp/RdcRocpCounterSampler.h index 5838a96a52..8d87a7c637 100644 --- a/include/rdc_modules/rdc_rocp/RdcRocpCounterSampler.h +++ b/include/rdc_modules/rdc_rocp/RdcRocpCounterSampler.h @@ -79,7 +79,7 @@ class CounterSampler { size_t get_counter_size(rocprofiler_counter_id_t counter); // Get the dimensions of a counter - std::vector get_counter_dimensions( + std::vector get_counter_dimensions( rocprofiler_counter_id_t counter); static std::vector> samplers_; diff --git a/rdc_libs/rdc_modules/rdc_rocp/RdcRocpCounterSampler.cc b/rdc_libs/rdc_modules/rdc_rocp/RdcRocpCounterSampler.cc index cbf31e10a2..9597dbb87f 100644 --- a/rdc_libs/rdc_modules/rdc_rocp/RdcRocpCounterSampler.cc +++ b/rdc_libs/rdc_modules/rdc_rocp/RdcRocpCounterSampler.cc @@ -204,7 +204,7 @@ size_t CounterSampler::get_counter_size(rocprofiler_counter_id_t counter) { rocprofiler_counter_info_v1_t info; rocprofiler_query_counter_info(counter, ROCPROFILER_COUNTER_INFO_VERSION_1, static_cast(&info)); - return info.instance_ids_count; + return info.dimensions_instances_count; } std::unordered_map CounterSampler::get_supported_counters( @@ -241,7 +241,7 @@ std::unordered_map CounterSampler::get_su return out; } -std::vector CounterSampler::get_counter_dimensions( +std::vector CounterSampler::get_counter_dimensions( rocprofiler_counter_id_t counter) { rocprofiler_counter_info_v1_t info; RocprofilerCall( @@ -250,8 +250,8 @@ std::vector CounterSampler::get_counter_dim static_cast(&info)); }, "Could not query info for counter", __FILE__, __LINE__); - return std::vector{info.dimensions, - info.dimensions + info.dimensions_count}; + return std::vector{ + *info.dimensions, *info.dimensions + info.dimensions_count}; } int tool_init(rocprofiler_client_finalize_t, void*) {