b53c99669c
This reverts commit 7b00d3a89b.
The workaround is no longer needed - root cause fixed in:
- rocm-smi-lib (PR #2531): Made devInfoTypesStrings file-local static
- amdsmi (PR #2575): Added visibility("hidden") attribute
420 líneas
14 KiB
C++
420 líneas
14 KiB
C++
// MIT License
|
|
//
|
|
// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved.
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in all
|
|
// copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
// SOFTWARE.
|
|
|
|
#include "agent.hpp"
|
|
#include "agent_info.hpp"
|
|
#define ROCPROFILER_SDK_CEREAL_NAMESPACE_BEGIN \
|
|
namespace tim \
|
|
{ \
|
|
namespace cereal \
|
|
{
|
|
#define ROCPROFILER_SDK_CEREAL_NAMESPACE_END \
|
|
} \
|
|
} // namespace ::tim::cereal
|
|
|
|
#include "common/defines.h"
|
|
|
|
#if !defined(ROCPROFSYS_USE_ROCM)
|
|
# define ROCPROFSYS_USE_ROCM 0
|
|
#endif
|
|
|
|
#include "defines.hpp"
|
|
#include "gpu.hpp"
|
|
|
|
#include <timemory/manager.hpp>
|
|
|
|
#include <string>
|
|
|
|
#include "core/agent_manager.hpp"
|
|
|
|
#if ROCPROFSYS_USE_ROCM > 0
|
|
# include <amd_smi/amdsmi.h>
|
|
# include <rocprofiler-sdk/agent.h>
|
|
# include <rocprofiler-sdk/cxx/serialization.hpp>
|
|
# include <rocprofiler-sdk/fwd.h>
|
|
#endif
|
|
|
|
#include "logger/debug.hpp"
|
|
|
|
namespace rocprofsys
|
|
{
|
|
namespace gpu
|
|
{
|
|
namespace
|
|
{
|
|
#if ROCPROFSYS_USE_ROCM > 0
|
|
# define ROCPROFSYS_AMD_SMI_CALL(ERROR_CODE) \
|
|
::rocprofsys::gpu::check_amdsmi_error(ERROR_CODE, __FILE__, __LINE__)
|
|
|
|
void
|
|
check_amdsmi_error(amdsmi_status_t _code, const char* _file, int _line)
|
|
{
|
|
if(_code == AMDSMI_STATUS_SUCCESS) return;
|
|
const char* _msg = nullptr;
|
|
auto _err = amdsmi_status_code_to_string(_code, &_msg);
|
|
if(_err != AMDSMI_STATUS_SUCCESS)
|
|
{
|
|
throw std::runtime_error(fmt::format(
|
|
"amdsmi_status_code_to_string failed. No error message available. "
|
|
"Error code {} originated at {}:{}",
|
|
static_cast<int>(_code), _file, _line));
|
|
}
|
|
throw std::runtime_error(fmt::format("[{}:{}] Error code {} :: {}", _file, _line,
|
|
static_cast<int>(_code), _msg));
|
|
}
|
|
|
|
// Flag handed to std::call_once so the AMD SMI set-up routine runs a single time.
std::once_flag amdsmi_once;

// Returns a mutable reference to the function-local flag that records whether
// AMD SMI has been initialized. The flag starts out as false.
bool&
_amdsmi_is_initialized()
{
    static bool _state{ false };
    return _state;
}
|
|
|
|
bool
|
|
amdsmi_init()
|
|
{
|
|
auto _amdsmi_init = []() {
|
|
try
|
|
{
|
|
// Currently, only AMDSMI_INIT_AMD_GPUS is supported
|
|
ROCPROFSYS_AMD_SMI_CALL(::amdsmi_init(AMDSMI_INIT_AMD_GPUS));
|
|
get_processor_handles();
|
|
_amdsmi_is_initialized() = true; // Mark as initialized
|
|
} catch(std::exception& _e)
|
|
{
|
|
LOG_ERROR("Exception thrown initializing amd-smi: {}", _e.what());
|
|
_amdsmi_is_initialized() = false; // Mark as not initialized
|
|
return false;
|
|
}
|
|
return true;
|
|
}();
|
|
|
|
return _amdsmi_init;
|
|
}
|
|
#endif // ROCPROFSYS_USE_ROCM > 0
|
|
|
|
// Queries rocprofiler for the available agents, registers each of them with the
// agent manager, and returns the number of GPU agents the manager reports.
//
// Returns 0 when the build has no ROCm support. A failing query (non-success
// status or a std::exception) is logged; the count is still read from the agent
// manager afterwards.
size_t
query_rocm_agents()
{
    size_t _dev_cnt = 0;
#if ROCPROFSYS_USE_ROCM > 0
    auto iterator = []([[maybe_unused]] rocprofiler_agent_version_t version,
                       const void** agents, size_t num_agents,
                       [[maybe_unused]] void* user_data) -> rocprofiler_status_t {
        // Constructing std::string from a null pointer is undefined behaviour, so map
        // a missing C string to an empty string instead.
        auto _to_string = [](const char* _str) {
            return (_str != nullptr) ? std::string{ _str } : std::string{};
        };

        auto& _agent_manager = get_agent_manager_instance();
        for(size_t i = 0; i < num_agents; ++i)
        {
            const auto* _agent = static_cast<const rocprofiler_agent_v0_t*>(agents[i]);

            agent cur_agent;
            // Every agent that is not a GPU is recorded as a CPU agent.
            cur_agent.type =
                (_agent->type == ROCPROFILER_AGENT_TYPE_GPU ? agent_type::GPU
                                                            : agent_type::CPU);
            cur_agent.handle               = _agent->id.handle;
            cur_agent.device_id            = _agent->device_id;
            cur_agent.node_id              = _agent->node_id;
            cur_agent.logical_node_id      = _agent->logical_node_id;
            cur_agent.logical_node_type_id = _agent->logical_node_type_id;
            cur_agent.name                 = _to_string(_agent->name);
            cur_agent.model_name           = _to_string(_agent->model_name);
            cur_agent.vendor_name          = _to_string(_agent->vendor_name);
            cur_agent.product_name         = _to_string(_agent->product_name);

            cur_agent.agent_info = agent_info::to_json_string(*_agent);

            _agent_manager.insert_agent(cur_agent);
        }
        return ROCPROFILER_STATUS_SUCCESS;
    };

    try
    {
        const auto _status = rocprofiler_query_available_agents(
            ROCPROFILER_AGENT_INFO_VERSION_0, iterator, sizeof(rocprofiler_agent_v0_t),
            nullptr);
        // Do not drop a failing status silently: without it an empty agent list is
        // indistinguishable from a failed query.
        if(_status != ROCPROFILER_STATUS_SUCCESS)
        {
            LOG_ERROR("rocprofiler_query_available_agents failed with status {}",
                      static_cast<int>(_status));
        }
    } catch(const std::exception& _e)
    {
        LOG_ERROR("Exception thrown getting the rocm agents: {}. _dev_cnt={}", _e.what(),
                  _dev_cnt);
    }
    _dev_cnt = get_agent_manager_instance().get_gpu_agents_count();
#endif
    return _dev_cnt;
}
|
|
} // namespace
|
|
|
|
// Returns the number of GPU agents. With ROCm support the agents are queried on
// the first call and the result is cached for every later call; without ROCm
// support the result is always 0.
int
device_count()
{
#if ROCPROFSYS_USE_ROCM > 0
    static const int _cached_count = static_cast<int>(query_rocm_agents());
    return _cached_count;
#else
    return 0;
#endif
}
|
|
|
|
// Runs the AMD SMI set-up at most once and reports whether AMD SMI is currently
// flagged as initialized. Always false when built without ROCm support.
bool
initialize_amdsmi()
{
    bool _ready = false;
#if ROCPROFSYS_USE_ROCM > 0
    // std::call_once guarantees amdsmi_init is executed by a single caller only.
    std::call_once(amdsmi_once, amdsmi_init);
    _ready = _amdsmi_is_initialized();
#endif
    return _ready;
}
|
|
|
|
template <typename ArchiveT>
|
|
void
|
|
add_device_metadata(ArchiveT& ar)
|
|
{
|
|
namespace cereal = tim::cereal;
|
|
using cereal::make_nvp;
|
|
|
|
#if ROCPROFSYS_USE_ROCM > 0
|
|
using agent_vec_t = std::vector<rocprofiler_agent_v0_t>;
|
|
|
|
auto iterator_cb = []([[maybe_unused]] rocprofiler_agent_version_t version,
|
|
const void** agents, size_t num_agents,
|
|
[[maybe_unused]] void* user_data) -> rocprofiler_status_t {
|
|
auto* agents_vec = static_cast<agent_vec_t*>(user_data);
|
|
for(size_t i = 0; i < num_agents; ++i)
|
|
{
|
|
const auto* _agent = static_cast<const rocprofiler_agent_v0_t*>(agents[i]);
|
|
if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU)
|
|
{
|
|
agents_vec->push_back(*_agent);
|
|
}
|
|
}
|
|
return ROCPROFILER_STATUS_SUCCESS;
|
|
};
|
|
|
|
auto _agents_vec = agent_vec_t{};
|
|
try
|
|
{
|
|
rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, iterator_cb,
|
|
sizeof(rocprofiler_agent_v0_t), &_agents_vec);
|
|
} catch(std::exception& _e)
|
|
{
|
|
LOG_ERROR("Exception thrown getting the rocm agents: {}", _e.what());
|
|
}
|
|
|
|
ar(make_nvp("rocm_agents", _agents_vec));
|
|
#else
|
|
(void) ar;
|
|
#endif
|
|
}
|
|
|
|
void
|
|
add_device_metadata()
|
|
{
|
|
if(device_count() == 0) return;
|
|
|
|
ROCPROFSYS_METADATA([](auto& ar) {
|
|
try
|
|
{
|
|
add_device_metadata(ar);
|
|
} catch(std::runtime_error& _e)
|
|
{
|
|
LOG_ERROR("Exception thrown adding device metadata: {}", _e.what());
|
|
}
|
|
});
|
|
}
|
|
|
|
#if ROCPROFSYS_USE_ROCM > 0
|
|
/*
|
|
* Required amdsmi methods to get processors and handles
|
|
*/
|
|
|
|
uint32_t processors::total_processor_count = 0;
|
|
std::vector<amdsmi_processor_handle> processors::processors_list = {};
|
|
std::vector<bool> processors::vcn_device_level_only = {};
|
|
std::vector<bool> processors::jpeg_device_level_only = {};
|
|
std::vector<bool> processors::vcn_busy_supported = {};
|
|
std::vector<bool> processors::jpeg_busy_supported = {};
|
|
std::vector<bool> processors::xgmi_supported = {};
|
|
std::vector<bool> processors::pcie_supported = {};
|
|
|
|
void
|
|
get_processor_handles()
|
|
{
|
|
uint32_t socket_count;
|
|
uint32_t processor_count;
|
|
processors::processors_list.clear();
|
|
|
|
// Passing nullptr will return us the number of sockets available for read in this
|
|
// system
|
|
auto ret = amdsmi_get_socket_handles(&socket_count, nullptr);
|
|
if(ret != AMDSMI_STATUS_SUCCESS)
|
|
{
|
|
return;
|
|
}
|
|
std::vector<amdsmi_socket_handle> sockets(socket_count);
|
|
ret = amdsmi_get_socket_handles(&socket_count, sockets.data());
|
|
for(auto& socket : sockets)
|
|
{
|
|
// Passing nullptr will return us the number of processors available for read for
|
|
// this socket
|
|
ret = amdsmi_get_processor_handles(socket, &processor_count, nullptr);
|
|
if(ret != AMDSMI_STATUS_SUCCESS)
|
|
{
|
|
return;
|
|
}
|
|
std::vector<amdsmi_processor_handle> all_processors(processor_count);
|
|
ret =
|
|
amdsmi_get_processor_handles(socket, &processor_count, all_processors.data());
|
|
if(ret != AMDSMI_STATUS_SUCCESS)
|
|
{
|
|
return;
|
|
}
|
|
|
|
for(auto& processor : all_processors)
|
|
{
|
|
processor_type_t processor_type = {};
|
|
ret = amdsmi_get_processor_type(processor, &processor_type);
|
|
if(processor_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU)
|
|
{
|
|
throw std::runtime_error("Not AMD_GPU device type!");
|
|
}
|
|
processors::processors_list.push_back(processor);
|
|
|
|
amdsmi_gpu_metrics_t gpu_metrics;
|
|
bool vcn_supported = false, jpeg_supported = false;
|
|
bool v_busy_supported = false, j_busy_supported = false;
|
|
bool xgmi_supported = false, pcie_supported = false;
|
|
// AMD SMI will not report VCN_activity and JPEG_activity, if VCN_busy or
|
|
// JPEG_busy fields are available.
|
|
if(amdsmi_get_gpu_metrics_info(processor, &gpu_metrics) ==
|
|
AMDSMI_STATUS_SUCCESS)
|
|
{
|
|
// Helper lambda to check if any value in the array is valid (not
|
|
// UINT16_MAX)
|
|
auto has_valid_u16 = [](const auto& arr) {
|
|
return std::any_of(std::begin(arr), std::end(arr),
|
|
[](auto val) { return val != UINT16_MAX; });
|
|
};
|
|
|
|
// Helper lambda to check if any value in the array is valid (not
|
|
// UINT64_MAX)
|
|
auto has_valid_u64 = [](const auto& arr) {
|
|
return std::any_of(std::begin(arr), std::end(arr),
|
|
[](auto val) { return val != UINT64_MAX; });
|
|
};
|
|
|
|
vcn_supported = has_valid_u16(gpu_metrics.vcn_activity);
|
|
jpeg_supported = has_valid_u16(gpu_metrics.jpeg_activity);
|
|
|
|
// Check if VCN and JPEG busy metrics are available
|
|
for(const auto& xcp : gpu_metrics.xcp_stats)
|
|
{
|
|
if(!v_busy_supported && has_valid_u16(xcp.vcn_busy))
|
|
v_busy_supported = true;
|
|
if(!j_busy_supported && has_valid_u16(xcp.jpeg_busy))
|
|
j_busy_supported = true;
|
|
if(v_busy_supported && j_busy_supported) break;
|
|
}
|
|
|
|
// Check if XGMI metrics are supported (any value not at max)
|
|
xgmi_supported = (gpu_metrics.xgmi_link_width != UINT16_MAX) ||
|
|
(gpu_metrics.xgmi_link_speed != UINT16_MAX) ||
|
|
has_valid_u64(gpu_metrics.xgmi_read_data_acc) ||
|
|
has_valid_u64(gpu_metrics.xgmi_write_data_acc);
|
|
|
|
// Check if PCIe metrics are supported (any value not at max)
|
|
pcie_supported = (gpu_metrics.pcie_link_width != UINT16_MAX) ||
|
|
(gpu_metrics.pcie_link_speed != UINT16_MAX) ||
|
|
(gpu_metrics.pcie_bandwidth_acc != UINT64_MAX) ||
|
|
(gpu_metrics.pcie_bandwidth_inst != UINT64_MAX);
|
|
}
|
|
processors::vcn_device_level_only.push_back(vcn_supported);
|
|
processors::jpeg_device_level_only.push_back(jpeg_supported);
|
|
processors::vcn_busy_supported.push_back(v_busy_supported);
|
|
processors::jpeg_busy_supported.push_back(j_busy_supported);
|
|
processors::xgmi_supported.push_back(xgmi_supported);
|
|
processors::pcie_supported.push_back(pcie_supported);
|
|
}
|
|
}
|
|
processors::total_processor_count = processors::processors_list.size();
|
|
}
|
|
|
|
bool
|
|
vcn_is_device_level_only(uint32_t dev_id)
|
|
{
|
|
if(dev_id >= processors::vcn_device_level_only.size()) return false;
|
|
return processors::vcn_device_level_only[dev_id];
|
|
}
|
|
|
|
bool
|
|
jpeg_is_device_level_only(uint32_t dev_id)
|
|
{
|
|
if(dev_id >= processors::jpeg_device_level_only.size()) return false;
|
|
return processors::jpeg_device_level_only[dev_id];
|
|
}
|
|
|
|
bool
|
|
is_vcn_busy_supported(uint32_t dev_id)
|
|
{
|
|
if(dev_id >= processors::vcn_busy_supported.size()) return false;
|
|
return processors::vcn_busy_supported[dev_id];
|
|
}
|
|
|
|
bool
|
|
is_jpeg_busy_supported(uint32_t dev_id)
|
|
{
|
|
if(dev_id >= processors::jpeg_busy_supported.size()) return false;
|
|
return processors::jpeg_busy_supported[dev_id];
|
|
}
|
|
|
|
bool
|
|
is_xgmi_supported(uint32_t dev_id)
|
|
{
|
|
if(dev_id >= processors::xgmi_supported.size()) return false;
|
|
return processors::xgmi_supported[dev_id];
|
|
}
|
|
|
|
bool
|
|
is_pcie_supported(uint32_t dev_id)
|
|
{
|
|
if(dev_id >= processors::pcie_supported.size()) return false;
|
|
return processors::pcie_supported[dev_id];
|
|
}
|
|
|
|
uint32_t
|
|
get_processor_count()
|
|
{
|
|
return processors::total_processor_count;
|
|
}
|
|
|
|
// Returns the cached amdsmi processor handle stored at index `dev_id`.
//
// Returns nullptr when `dev_id` is outside the cached range. Indexing the vector
// unchecked would be undefined behaviour, and the sibling capability accessors
// already guard against out-of-range ids in the same way.
amdsmi_processor_handle
get_handle_from_id(uint32_t dev_id)
{
    const auto& _handles = processors::processors_list;
    if(dev_id >= _handles.size()) return nullptr;
    return _handles[dev_id];
}
|
|
#endif
|
|
|
|
} // namespace gpu
|
|
} // namespace rocprofsys
|