From 8d48048bd31358360361fb2680b7deafa30e187f Mon Sep 17 00:00:00 2001 From: anujshuk-amd Date: Mon, 28 Apr 2025 18:52:20 +0530 Subject: [PATCH] Fix ROCPROFSYS_AMD_SMI_METRICS parsing (#178) Fixes a bug where all the `ROCPROFSYSE_AMD_SMI_METRICS` values were being recorded by default. Fixes bug with the 'all' and 'none' values giving an exception when specified for `ROCPROFSYSE_AMD_SMI_METRICS`. --------- Co-authored-by: David Galiffi --- CHANGELOG.md | 2 + source/lib/core/config.cpp | 3 +- source/lib/rocprof-sys/library/amd_smi.cpp | 47 ++++++++++++---------- 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b579d06682..47c154c8bf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ Full documentation for ROCm Systems Profiler is available at [https://rocm.docs. ### Resolved issues +- Fixed application hang when enabling the RCCL backend +- Fixed GPU metric collection settings with ROCPROFSYS_AMD_SMI_METRICS. - Fixed an issue where enabling the RCCL backend caused the application to stop responding. ## ROCm Systems Profiler 1.0.0 for ROCm 6.4.1 diff --git a/source/lib/core/config.cpp b/source/lib/core/config.cpp index e01ee1e798..9964045916 100644 --- a/source/lib/core/config.cpp +++ b/source/lib/core/config.cpp @@ -623,7 +623,8 @@ configure_settings(bool _init) ROCPROFSYS_CONFIG_SETTING(std::string, "ROCPROFSYS_AMD_SMI_METRICS", "amd-smi metrics to collect: busy, temp, power, " - "vcn_activity, jpeg_activity, mem_usage", + "vcn_activity, jpeg_activity, mem_usage." + "An empty value implies 'all' and 'none' suppresses all.", "busy, temp, power, mem_usage", "backend", "amd_smi", "rocm", "process_sampling", "advanced"); diff --git a/source/lib/rocprof-sys/library/amd_smi.cpp b/source/lib/rocprof-sys/library/amd_smi.cpp index 6e2eb152c1..4ceaca043a 100644 --- a/source/lib/rocprof-sys/library/amd_smi.cpp +++ b/source/lib/rocprof-sys/library/amd_smi.cpp @@ -541,34 +541,37 @@ setup() { for(auto itr : _devices) { - uint16_t dev_id = 0; - ROCPROFSYS_AMD_SMI_CALL( - amdsmi_get_gpu_id(gpu::get_handle_from_id(itr), &dev_id)); - // dev_id holds the device ID of device i, upon a successful call - - if(_metrics && !_metrics->empty()) + // Enable selected metrics only + if((_metrics && !_metrics->empty()) && (*_metrics != "all")) { using key_pair_t = std::pair; const auto supported = std::unordered_map{ - key_pair_t{ "busy", get_settings(dev_id).busy }, - key_pair_t{ "temp", get_settings(dev_id).temp }, - key_pair_t{ "power", get_settings(dev_id).power }, - key_pair_t{ "mem_usage", get_settings(dev_id).mem_usage }, - key_pair_t{ "vcn_activity", get_settings(dev_id).vcn_activity }, - key_pair_t{ "jpeg_activity", get_settings(dev_id).jpeg_activity }, + key_pair_t{ "busy", get_settings(itr).busy }, + key_pair_t{ "temp", get_settings(itr).temp }, + key_pair_t{ "power", get_settings(itr).power }, + key_pair_t{ "mem_usage", get_settings(itr).mem_usage }, + key_pair_t{ "vcn_activity", get_settings(itr).vcn_activity }, + key_pair_t{ "jpeg_activity", get_settings(itr).jpeg_activity }, }; - get_settings(dev_id) = { false, false, false, false, false, false }; - for(const auto& metric : tim::delimit(*_metrics, ",;:\t\n ")) - { - auto iitr = supported.find(metric); - if(iitr == supported.end()) - ROCPROFSYS_FAIL_F("unsupported amd-smi metric: %s\n", - metric.c_str()); + // Initialize all metrics to false + for(auto& it : supported) + it.second = false; - ROCPROFSYS_VERBOSE_F(1, "Enabling amd-smi metric '%s'\n", - metric.c_str()); - iitr->second = true; + // Parse list of metrics enabled by the user + if(*_metrics != "none") + { + for(const auto& metric : tim::delimit(*_metrics, ",;:\t\n ")) + { + auto iitr = supported.find(metric); + if(iitr == supported.end()) + ROCPROFSYS_FAIL_F("unsupported amd-smi metric: %s\n", + metric.c_str()); + ROCPROFSYS_VERBOSE_F( + 1, "Enabling amd-smi metric '%s' on device [%u]\n", + metric.c_str(), itr); + iitr->second = true; + } } } }