298 строки
11 KiB
C++
298 строки
11 KiB
C++
/*
|
|
Copyright (c) 2022 - present Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#include <dlfcn.h>
|
|
#include <math.h>
|
|
#include <sys/time.h>
|
|
|
|
#include <algorithm>
|
|
#include <cstring>
|
|
#include <filesystem>
|
|
#include <map>
|
|
#include <memory>
|
|
#include <stdexcept>
|
|
#include <vector>
|
|
|
|
#include "rdc/rdc.h"
|
|
#include "rdc_lib/RdcLogger.h"
|
|
#include "rdc_lib/RdcTelemetryLibInterface.h"
|
|
#include "rdc_lib/rdc_common.h"
|
|
#include "rdc_modules/rdc_rocp/RdcRocpBase.h"
|
|
|
|
std::unique_ptr<amd::rdc::RdcRocpBase> rocp_p;
|
|
|
|
bool is_rocp_disabled() {
|
|
const char* value = std::getenv("RDC_DISABLE_ROCP");
|
|
if (value == nullptr) return false;
|
|
|
|
std::string value_str = value;
|
|
std::transform(value_str.begin(), value_str.end(), value_str.begin(),
|
|
[](unsigned char c) { return std::tolower(c); });
|
|
|
|
const std::vector<const char*> positive_list = {"yes", "true", "1", "on", "y", "t"};
|
|
|
|
return std::any_of(positive_list.begin(), positive_list.end(),
|
|
[&value_str](const char* val) { return value_str == val; });
|
|
}
|
|
|
|
bool is_rocprofiler_metrics_path_set() {
|
|
const char* rocprofiler_metrics_path = getenv("ROCPROFILER_METRICS_PATH");
|
|
if (rocprofiler_metrics_path == nullptr) {
|
|
return false;
|
|
} else {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
rdc_status_t rdc_module_init(uint64_t /*flags*/) {
|
|
if (is_rocp_disabled()) {
|
|
// rocprofiler does NOT work in gtest.
|
|
// GTest starts up multiple instances of the progam under test,
|
|
// however HSA does not allow for multiple instances. Since hsa_init() is called at the very
|
|
// top of RdcRocpBase constructor - easiest to disable it alltogether when RDC_DISABLE_ROCP is
|
|
// set.
|
|
//
|
|
// We cannot rely on GTEST_DECLARE_bool_ variable because RDC is compiled
|
|
// before the tests are. Utilize an env var instead.
|
|
return RDC_ST_DISABLED_MODULE;
|
|
}
|
|
|
|
if (is_rocprofiler_metrics_path_set() == false) {
|
|
RDC_LOG(RDC_DEBUG, "Trying to find ROCPROFILER_METRICS_PATH");
|
|
|
|
std::filesystem::path profiler_metrics_path;
|
|
bool found_path = false;
|
|
|
|
// Strategy 1: Find profiler_metrics_path relative to rocprofiler-sdk library
|
|
Dl_info rocprofiler_dl_info = {};
|
|
void* rocprofiler_symbol = dlsym(RTLD_DEFAULT, "rocprofiler_is_initialized");
|
|
if (rocprofiler_symbol != nullptr && dladdr(rocprofiler_symbol, &rocprofiler_dl_info) != 0) {
|
|
// NOTE: filesystem::canonical() returns absolute path, we need this for parent_path to work
|
|
// properly.
|
|
//
|
|
// bad example without canonical path:
|
|
// rocprofiler is found in /opt/rocm/lib/rdc/../librocprofiler-sdk.so.
|
|
// location.parent_path().parent_path() will return /opt/rocm/lib/rdc.
|
|
// we're trying to find /opt/rocm!
|
|
// with canonical path:
|
|
// rocprofiler is found in /opt/rocm/lib/rocprofiler-sdk.so
|
|
// location.parent_path().parent_path() will return /opt/rocm/.
|
|
// from here we can easily find /opt/rocm/share/rocprofiler-sdk
|
|
try {
|
|
const std::filesystem::path rocprofiler_lib_path(
|
|
std::filesystem::canonical(rocprofiler_dl_info.dli_fname));
|
|
RDC_LOG(RDC_DEBUG, "Found rocprofiler library at "
|
|
<< rocprofiler_lib_path << " : "
|
|
<< rocprofiler_lib_path.parent_path().parent_path());
|
|
|
|
profiler_metrics_path =
|
|
rocprofiler_lib_path.parent_path().parent_path() / "share/rocprofiler-sdk";
|
|
if (std::filesystem::exists(profiler_metrics_path)) {
|
|
found_path = true;
|
|
RDC_LOG(RDC_DEBUG, "Found rocprofiler metrics via rocprofiler library location at "
|
|
<< profiler_metrics_path);
|
|
}
|
|
} catch (const std::filesystem::filesystem_error& e) {
|
|
RDC_LOG(RDC_ERROR, "Failed to resolve rocprofiler library path: " << e.what());
|
|
}
|
|
}
|
|
|
|
// Strategy 2: Find profiler_metrics_path relative to rdc
|
|
if (!found_path) {
|
|
RDC_LOG(RDC_DEBUG, "Rocprofiler library detection failed, trying fallback paths");
|
|
Dl_info rdc_dl_info = {};
|
|
if (dladdr(reinterpret_cast<const void*>(rdc_module_init), &rdc_dl_info) != 0) {
|
|
try {
|
|
const std::filesystem::path rdc_path(std::filesystem::canonical(rdc_dl_info.dli_fname));
|
|
RDC_LOG(RDC_DEBUG, "Found rdc library at " << rdc_path);
|
|
profiler_metrics_path =
|
|
rdc_path.parent_path().parent_path().parent_path() / "share/rocprofiler-sdk";
|
|
if (std::filesystem::exists(profiler_metrics_path)) {
|
|
found_path = true;
|
|
RDC_LOG(RDC_DEBUG,
|
|
"Found rocprofiler metrics via fallback path at " << profiler_metrics_path);
|
|
}
|
|
} catch (const std::filesystem::filesystem_error& e) {
|
|
RDC_LOG(RDC_ERROR, "Failed to resolve rdc library path: " << e.what());
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!found_path) {
|
|
RDC_LOG(RDC_ERROR,
|
|
"rocprofiler metrics are not available. Could not locate rocprofiler-sdk share "
|
|
"directory.");
|
|
RDC_LOG(RDC_ERROR,
|
|
"Make sure rocprofiler-sdk is properly installed or set ROCPROFILER_METRICS_PATH "
|
|
"manually.");
|
|
return RDC_ST_FAIL_LOAD_MODULE;
|
|
}
|
|
|
|
RDC_LOG(RDC_DEBUG, "rocprofiler metrics are available at " << profiler_metrics_path);
|
|
setenv("ROCPROFILER_METRICS_PATH", profiler_metrics_path.c_str(), 0);
|
|
} else {
|
|
RDC_LOG(RDC_DEBUG, "Using ROCPROFILER_METRICS_PATH from environment variable");
|
|
}
|
|
|
|
rocp_p = std::make_unique<amd::rdc::RdcRocpBase>();
|
|
return RDC_ST_OK;
|
|
}
|
|
rdc_status_t rdc_module_destroy() {
|
|
rocp_p.reset();
|
|
return RDC_ST_OK;
|
|
}
|
|
|
|
// get supported field ids
|
|
rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], uint32_t* field_count) {
|
|
// extract all keys from counter_map
|
|
if (rocp_p == nullptr) {
|
|
return RDC_ST_FAIL_LOAD_MODULE;
|
|
}
|
|
std::vector<rdc_field_t> fields = rocp_p->get_field_ids();
|
|
std::vector<uint32_t> counter_keys(fields.begin(), fields.end());
|
|
|
|
*field_count = counter_keys.size();
|
|
// copy from vector into array
|
|
std::copy(counter_keys.begin(), counter_keys.end(), field_ids);
|
|
|
|
return RDC_ST_OK;
|
|
}
|
|
|
|
// Fetch
|
|
rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, const uint32_t fields_count,
|
|
rdc_field_value_f callback, void* user_data) {
|
|
if (rocp_p == nullptr) {
|
|
return RDC_ST_FAIL_LOAD_MODULE;
|
|
}
|
|
|
|
struct timeval tv{};
|
|
gettimeofday(&tv, nullptr);
|
|
const uint64_t curTime = static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
|
|
|
|
// Group fields by GPU index for bulk sampling
|
|
std::map<uint32_t, std::vector<uint32_t>> gpu_to_field_indices;
|
|
for (uint32_t i = 0; i < fields_count; i++) {
|
|
gpu_to_field_indices[fields[i].gpu_index].push_back(i);
|
|
}
|
|
|
|
// Process each GPU group
|
|
const int BULK_FIELDS_MAX = 16;
|
|
rdc_gpu_field_value_t values[BULK_FIELDS_MAX];
|
|
uint32_t bulk_count = 0;
|
|
rdc_status_t status = RDC_ST_OK;
|
|
|
|
for (const auto& [gpu_index, field_indices] : gpu_to_field_indices) {
|
|
// Collect fields for this GPU
|
|
std::vector<rdc_gpu_field_t> gpu_fields;
|
|
gpu_fields.reserve(field_indices.size());
|
|
for (uint32_t idx : field_indices) {
|
|
gpu_fields.push_back(fields[idx]);
|
|
}
|
|
|
|
// Bulk lookup for this GPU
|
|
// Note: rocp_lookup_bulk only handles rocprofiler-sdk metrics.
|
|
// Non-rocprofiler fields are handled within
|
|
// the bulk lookup via special case logic in apply_field_transformation().
|
|
// Fields without rocprofiler metric mappings will return RDC_ST_BAD_PARAMETER.
|
|
std::vector<rdc_field_value_data> bulk_data;
|
|
std::vector<rdc_field_type_t> bulk_types;
|
|
std::vector<rdc_status_t> bulk_statuses;
|
|
|
|
status = rocp_p->rocp_lookup_bulk(gpu_fields, bulk_data, bulk_types, bulk_statuses);
|
|
if (status != RDC_ST_OK) {
|
|
RDC_LOG(RDC_ERROR, "Error in bulk lookup for GPU " << gpu_index);
|
|
// Continue with next GPU even if this one failed
|
|
continue;
|
|
}
|
|
|
|
// Distribute results to callback buffer
|
|
for (size_t j = 0; j < gpu_fields.size(); j++) {
|
|
if (bulk_count >= BULK_FIELDS_MAX) {
|
|
status = callback(values, bulk_count, user_data);
|
|
if (status != RDC_ST_OK) {
|
|
return status;
|
|
}
|
|
bulk_count = 0;
|
|
}
|
|
|
|
const uint32_t original_idx = field_indices[j];
|
|
values[bulk_count].gpu_index = fields[original_idx].gpu_index;
|
|
values[bulk_count].field_value.status = bulk_statuses[j];
|
|
values[bulk_count].field_value.ts = curTime;
|
|
values[bulk_count].field_value.type = bulk_types[j];
|
|
values[bulk_count].field_value.field_id = fields[original_idx].field_id;
|
|
|
|
switch (bulk_types[j]) {
|
|
case DOUBLE:
|
|
values[bulk_count].field_value.value.dbl = bulk_data[j].dbl;
|
|
break;
|
|
case INTEGER:
|
|
values[bulk_count].field_value.value.l_int = bulk_data[j].l_int;
|
|
break;
|
|
case STRING:
|
|
case BLOB:
|
|
strncpy_with_null(values[bulk_count].field_value.value.str, bulk_data[j].str,
|
|
RDC_MAX_STR_LENGTH);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
bulk_count++;
|
|
}
|
|
}
|
|
|
|
// Flush remaining values
|
|
if (bulk_count != 0) {
|
|
status = callback(values, bulk_count, user_data);
|
|
if (status != RDC_ST_OK) {
|
|
return status;
|
|
}
|
|
}
|
|
|
|
return status;
|
|
}
|
|
|
|
rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) {
|
|
rdc_status_t status = RDC_ST_OK;
|
|
for (uint32_t i = 0; i < fields_count; i++) {
|
|
RDC_LOG(RDC_DEBUG, "WATCH: " << fields[i].field_id);
|
|
const rdc_status_t temp_status = RDC_ST_OK;
|
|
if (temp_status != RDC_ST_OK) {
|
|
status = temp_status;
|
|
}
|
|
}
|
|
return status;
|
|
}
|
|
|
|
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, uint32_t fields_count) {
|
|
rdc_status_t status = RDC_ST_OK;
|
|
for (uint32_t i = 0; i < fields_count; i++) {
|
|
RDC_LOG(RDC_DEBUG, "UNWATCH: " << fields[i].field_id);
|
|
const rdc_status_t temp_status = RDC_ST_OK;
|
|
// return last non-ok status
|
|
if (temp_status != RDC_ST_OK) {
|
|
status = temp_status;
|
|
}
|
|
}
|
|
return status;
|
|
}
|