[SWDEV-546543] Fix segfault in gpu_metrics
Signed-off-by: adapryor <Adam.pryor@amd.com>
[ROCm/amdsmi commit: d25c01e802]
This commit is contained in:
committed by
Pham, Gabriel
parent
a68cd9612a
commit
17f9feb94e
@@ -1068,7 +1068,9 @@ class GpuMetricsBase_t {
|
||||
virtual AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() = 0;
|
||||
virtual void set_device_id(uint32_t device_id) { m_device_id = device_id; }
|
||||
virtual void set_partition_id(uint32_t partition_id) { m_partition_id = partition_id; }
|
||||
static std::mutex s_base_tbl_mu;
|
||||
virtual AMDGpuDynamicMetricsTbl_t get_metrics_dynamic_tbl() {
|
||||
std::lock_guard<std::mutex> lk(s_base_tbl_mu);
|
||||
return m_base_metrics_dynamic_tbl;
|
||||
}
|
||||
|
||||
|
||||
@@ -1110,6 +1110,7 @@ namespace {
|
||||
GpuMetricsCache g_gpu_metrics_cache;
|
||||
// Keep 1 cache map, with an entry for each gpu
|
||||
std::unordered_map<std::string, GpuMetricsCache> g_gpu_metrics_cache_map;
|
||||
std::mutex g_gpu_metrics_cache_map_mu;
|
||||
static const std::chrono::milliseconds kGpuMetricsCacheDuration(
|
||||
read_env_ms("AMDSMI_GPU_METRICS_CACHE_MS", 1)
|
||||
);
|
||||
@@ -1132,20 +1133,25 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
|
||||
// is the issue, so should remain.
|
||||
const std::string key = path_ + "/device/" + kDevAttribNameMap.at(type)
|
||||
+ "#" + std::to_string(b_size);
|
||||
auto& cache = g_gpu_metrics_cache_map[key];
|
||||
|
||||
GpuMetricsCache* cache_ptr = nullptr;
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lk(g_gpu_metrics_cache_map_mu);
|
||||
cache_ptr = &g_gpu_metrics_cache_map[key]; // safe now
|
||||
}
|
||||
|
||||
// Only cache for kDevGpuMetrics
|
||||
if (type == DevInfoTypes::kDevGpuMetrics) {
|
||||
std::lock_guard<std::mutex> lock(cache.mtx);
|
||||
std::lock_guard<std::mutex> lock(cache_ptr->mtx);
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
auto last_read_delta = std::chrono::duration_cast<std::chrono::milliseconds>(now - cache.last_read);
|
||||
auto last_read_delta = std::chrono::duration_cast<std::chrono::milliseconds>(now - cache_ptr->last_read);
|
||||
|
||||
if (!cache.data.empty() &&
|
||||
if (!cache_ptr->data.empty() &&
|
||||
kGpuMetricsCacheDuration > std::chrono::milliseconds::zero() &&
|
||||
last_read_delta < kGpuMetricsCacheDuration &&
|
||||
cache.data.size() == b_size) {
|
||||
cache_ptr->data.size() == b_size) {
|
||||
|
||||
std::memcpy(p_binary_data, cache.data.data(), b_size);
|
||||
std::memcpy(p_binary_data, cache_ptr->data.data(), b_size);
|
||||
|
||||
if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) {
|
||||
ss << "Returned cached DevInfoBinary for DevInfoType ("
|
||||
@@ -1206,12 +1212,12 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
|
||||
if (type == DevInfoTypes::kDevGpuMetrics &&
|
||||
kGpuMetricsCacheDuration > std::chrono::milliseconds::zero()) {
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
auto& cache = g_gpu_metrics_cache_map[key];
|
||||
std::lock_guard<std::mutex> lock(cache.mtx);
|
||||
cache.data.assign(
|
||||
reinterpret_cast<uint8_t*>(p_binary_data),
|
||||
reinterpret_cast<uint8_t*>(p_binary_data) + b_size);
|
||||
cache.last_read = now;
|
||||
|
||||
std::lock_guard<std::mutex> lock(cache_ptr->mtx);
|
||||
cache_ptr->data.assign(
|
||||
reinterpret_cast<uint8_t*>(p_binary_data),
|
||||
reinterpret_cast<uint8_t*>(p_binary_data) + b_size);
|
||||
cache_ptr->last_read = now;
|
||||
|
||||
if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) {
|
||||
ss << "Successfully Cached GPU Metrics binaryData = " << p_binary_data
|
||||
|
||||
@@ -62,6 +62,8 @@ using namespace amd::smi;
|
||||
namespace amd::smi
|
||||
{
|
||||
|
||||
std::mutex GpuMetricsBase_t::s_base_tbl_mu;
|
||||
|
||||
constexpr uint16_t join_metrics_version(uint8_t format_rev, uint8_t content_rev)
|
||||
{
|
||||
return static_cast<uint16_t>((format_rev << 8 | content_rev));
|
||||
@@ -509,11 +511,9 @@ rsmi_status_t GpuMetricsBase_v18_t::populate_metrics_dynamic_tbl() {
|
||||
<< stringfy_metric_header_version(disjoin_metrics_version(gpu_metrics_version)) << " |";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
// firmware_timestamp is at 10ns resolution
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= Changes ======= "
|
||||
<< " | {m_firmware_timestamp} from: " << m_gpu_metrics_tbl.m_firmware_timestamp
|
||||
<< " to: " << (m_gpu_metrics_tbl.m_firmware_timestamp * 10);
|
||||
m_gpu_metrics_tbl.m_firmware_timestamp = (m_gpu_metrics_tbl.m_firmware_timestamp * 10);
|
||||
// firmware_timestamp is at 10ns resolution; leave as-is.
|
||||
ss << __PRETTY_FUNCTION__ << " | firmware_timestamp (10ns) = "
|
||||
<< m_gpu_metrics_tbl.m_firmware_timestamp;
|
||||
LOG_DEBUG(ss);
|
||||
};
|
||||
|
||||
@@ -744,10 +744,12 @@ rsmi_status_t GpuMetricsBase_v18_t::populate_metrics_dynamic_tbl() {
|
||||
<< " | Returning = " << getRSMIStatusString(status_code) << " |";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
// Copy to base class
|
||||
std::copy(m_metrics_dynamic_tbl.begin(), m_metrics_dynamic_tbl.end(),
|
||||
std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
|
||||
GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(s_base_tbl_mu);
|
||||
// Copy to base class
|
||||
this->m_base_metrics_dynamic_tbl = m_metrics_dynamic_tbl;
|
||||
}
|
||||
|
||||
return status_code;
|
||||
}
|
||||
|
||||
Reference in new issue
Block a user