Files
rocm-systems/rdc_libs/rdc_modules/rdc_rocp/RdcTelemetryLib.cc
T
Galantsev, Dmitrii 91be467cad Profiler - Fix eval fields
The 'value' pointer was being written to a lot and then used for reading
within the same function. This likely caused issues all over RDC when
reading the metrics.

This commit changes it so *value is written to only once.

Change-Id: I83c158c1e46c6ce46ff87d8a2e769f26ffa8c0da
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
2025-04-09 20:06:21 -05:00

167 строки
5.5 KiB
C++

/*
Copyright (c) 2022 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <math.h>
#include <sys/time.h>
#include <algorithm>
#include <cstring>
#include <map>
#include <memory>
#include <stdexcept>
#include <vector>
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/RdcTelemetryLibInterface.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_modules/rdc_rocp/RdcRocpBase.h"
std::unique_ptr<amd::rdc::RdcRocpBase> rocp_p;
bool is_rocp_disabled() {
const char* value = std::getenv("RDC_DISABLE_ROCP");
if (value == nullptr) return false;
std::string value_str = value;
std::transform(value_str.begin(), value_str.end(), value_str.begin(),
[](unsigned char c) { return std::tolower(c); });
const std::vector<const char*> positive_list = {"yes", "true", "1", "on", "y", "t"};
return std::any_of(positive_list.begin(), positive_list.end(),
[&value_str](const char* val) { return value_str == val; });
}
rdc_status_t rdc_module_init(uint64_t /*flags*/) {
if (is_rocp_disabled()) {
// rocprofiler does NOT work in gtest.
// GTest starts up multiple instances of the progam under test,
// however HSA does not allow for multiple instances. Since hsa_init() is called at the very
// top of RdcRocpBase constructor - easiest to disable it alltogether when RDC_DISABLE_ROCP is
// set.
//
// We cannot rely on GTEST_DECLARE_bool_ variable because RDC is compiled
// before the tests are. Utilize an env var instead.
return RDC_ST_DISABLED_MODULE;
}
rocp_p = std::make_unique<amd::rdc::RdcRocpBase>();
return RDC_ST_OK;
}
rdc_status_t rdc_module_destroy() {
rocp_p.reset();
return RDC_ST_OK;
}
// get supported field ids
rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], uint32_t* field_count) {
// extract all keys from counter_map
if (rocp_p == nullptr) {
return RDC_ST_FAIL_LOAD_MODULE;
}
std::vector<rdc_field_t> fields = rocp_p->get_field_ids();
std::vector<uint32_t> counter_keys(fields.begin(), fields.end());
*field_count = counter_keys.size();
// copy from vector into array
std::copy(counter_keys.begin(), counter_keys.end(), field_ids);
return RDC_ST_OK;
}
// Fetch
rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, const uint32_t fields_count,
rdc_field_value_f callback, void* user_data) {
if (rocp_p == nullptr) {
return RDC_ST_FAIL_LOAD_MODULE;
}
// Bulk fetch fields
std::vector<rdc_gpu_field_value_t> bulk_results;
struct timeval tv{};
gettimeofday(&tv, nullptr);
const uint64_t curTime = static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
// Fetch it one by one for left fields
const int BULK_FIELDS_MAX = 16;
rdc_gpu_field_value_t values[BULK_FIELDS_MAX];
uint32_t bulk_count = 0;
rdc_status_t status = RDC_ST_UNKNOWN_ERROR;
double data = NAN;
for (uint32_t i = 0; i < fields_count; i++) {
if (bulk_count >= BULK_FIELDS_MAX) {
status = callback(values, bulk_count, user_data);
// When the callback returns errors, stop processing and return.
if (status != RDC_ST_OK) {
return status;
}
bulk_count = 0;
}
status = rocp_p->rocp_lookup(fields[i], &data);
// get value
values[bulk_count].gpu_index = fields[i].gpu_index;
values[bulk_count].field_value.type = DOUBLE;
values[bulk_count].field_value.status = status;
values[bulk_count].field_value.ts = curTime;
values[bulk_count].field_value.value.dbl = data;
values[bulk_count].field_value.field_id = fields[i].field_id;
bulk_count++;
}
if (bulk_count != 0) {
status = callback(values, bulk_count, user_data);
if (status != RDC_ST_OK) {
return status;
}
bulk_count = 0;
}
return status;
}
rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) {
rdc_status_t status = RDC_ST_OK;
for (uint32_t i = 0; i < fields_count; i++) {
RDC_LOG(RDC_DEBUG, "WATCH: " << fields[i].field_id);
const rdc_status_t temp_status = RDC_ST_OK;
if (temp_status != RDC_ST_OK) {
status = temp_status;
}
}
return status;
}
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, uint32_t fields_count) {
rdc_status_t status = RDC_ST_OK;
for (uint32_t i = 0; i < fields_count; i++) {
RDC_LOG(RDC_DEBUG, "UNWATCH: " << fields[i].field_id);
const rdc_status_t temp_status = RDC_ST_OK;
// return last non-ok status
if (temp_status != RDC_ST_OK) {
status = temp_status;
}
}
return status;
}