Files
rocm-systems/source/lib/rocprofiler/hsa/agent.cpp
T
Jonathan R. Madsen 6fb9000fa1 Agent Implementation (#78)
* Agent Implementation

* Remove unused Findrocprofiler

* Update lib/rocprofiler/hsa/agent.{hpp,cpp}

- default AgentInfo ctor
- getNumaNode() const
- noexcept move ctors
- default initializers for member variables
- fixed clang-tidy recommendations
  - preallocate
  - static in anon namespace
- AgentInfo::setName uses strncpy and ensures that it is terminated

* Update lib/rocprofiler/rocprofiler.cpp (agent.cpp and pc_sampling.cpp)

- move public PC sampling function implementations to pc_sampling.cpp
- move public agent function implementation to agent.cpp
2023-09-22 11:51:21 -07:00

500 lines
16 KiB
C++

// Copyright (c) 2018-2023 Advanced Micro Devices, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include "agent.hpp"
#include <glog/logging.h>
#include <filesystem>
#include <fstream>
#include "lib/common/utility.hpp"
namespace fs = std::filesystem;
namespace rocprofiler
{
namespace hsa
{
namespace
{
// Scans the KFD topology exported under /sys/class/kfd/kfd/topology/nodes and
// returns a map from a GPU node id to the id of a CPU node that the GPU node
// has an io_link to.
//
// Nodes are classified by position: the lowest node id whose `gpu_id` file
// holds a positive value marks the start of the GPU nodes, and every node id
// below it is treated as a CPU node. For each node at or above that id, every
// `io_links/<n>/properties` file is scanned for a `node_to <id>` line; the
// link is recorded when <id> lies in the CPU range. If a GPU node has several
// such links, the one visited last wins (directory order is unspecified).
//
// Returns an empty map when the topology directory cannot be opened or when no
// GPU node is present. Entries whose directory name is not a number, and nodes
// without a readable io_links directory, are skipped.
std::unordered_map<long long, long long>
get_gpu_nodes_near_cpu()
{
    namespace stdfs = std::filesystem;

    std::unordered_map<long long, long long> gpu_numa_nodes_near_cpu;
    const stdfs::path                        topology_root{"/sys/class/kfd/kfd/topology/nodes"};

    // The node id is the name of the node's directory.
    const auto parse_node_id = [](const stdfs::path& node_dir, long long& node_id) {
        std::stringstream name_stream(node_dir.filename().string());
        return static_cast<bool>(name_stream >> node_id);
    };

    // Pass 1: find the lowest node id that belongs to a GPU.
    bool            found_gpu_node       = false;
    long long       gpu_numa_nodes_start = 0;
    std::error_code root_ec;
    for(const auto& entry : stdfs::directory_iterator(topology_root, root_ec))
    {
        long long node_id = 0;
        if(!parse_node_id(entry.path(), node_id)) continue;

        // A missing or empty gpu_id file makes the extraction fail, which is
        // treated the same as "not a GPU".
        std::ifstream gpu_id_file(entry.path() / "gpu_id");
        long long     gpu_id = 0;
        if((gpu_id_file >> gpu_id) && gpu_id > 0 &&
           (!found_gpu_node || node_id < gpu_numa_nodes_start))
        {
            gpu_numa_nodes_start = node_id;
            found_gpu_node       = true;
        }
    }
    if(root_ec || !found_gpu_node) return gpu_numa_nodes_near_cpu;

    // Pass 2: for every GPU node, look for an io_link that targets a CPU node.
    for(const auto& entry : stdfs::directory_iterator(topology_root, root_ec))
    {
        long long node_id = 0;
        if(!parse_node_id(entry.path(), node_id) || node_id < gpu_numa_nodes_start) continue;

        std::error_code links_ec;
        for(const auto& link_entry :
            stdfs::directory_iterator(entry.path() / "io_links", links_ec))
        {
            std::ifstream properties_file(link_entry.path() / "properties");
            std::string   line;
            // Loop on getline's result so the final line is not handled twice.
            while(std::getline(properties_file, line))
            {
                std::stringstream fields(line);
                std::string       key;
                long long         near_cpu_node_id = 0;
                if((fields >> key) && key == "node_to" && (fields >> near_cpu_node_id) &&
                   near_cpu_node_id < gpu_numa_nodes_start)
                {
                    gpu_numa_nodes_near_cpu[node_id] = near_cpu_node_id;
                }
            }
        }
    }
    return gpu_numa_nodes_near_cpu;
}
// Shared implementation of the memory-pool iteration callbacks below.
//
// `data` must point to a std::pair<const AmdExtTable*, hsa_amd_memory_pool_t*>:
// the API table used to query the pool, and the location that receives the
// selected pool.
//
// A pool is selected when its segment is HSA_AMD_SEGMENT_GLOBAL and its
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT flag matches `kern_arg`
// (set when kern_arg is true, clear when it is false).
//
// Returns:
//   HSA_STATUS_INFO_BREAK             - `pool` was selected and stored.
//   HSA_STATUS_SUCCESS                - `pool` does not match; nothing stored.
//   HSA_STATUS_ERROR_INVALID_ARGUMENT - `data` is null.
// A failing pool-info query is reported through LOG(FATAL) rather than being
// returned to the caller.
hsa_status_t
FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg)
{
    if(!data) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

    auto [api_ptr, pool_ptr] =
        *static_cast<std::pair<const AmdExtTable*, hsa_amd_memory_pool_t*>*>(data);

    // Compare against HSA_STATUS_SUCCESS: the query can fail with codes other
    // than the generic HSA_STATUS_ERROR, and any failure leaves the output
    // value meaningless.
    hsa_amd_segment_t segment{};
    LOG_IF(FATAL,
           api_ptr->hsa_amd_memory_pool_get_info_fn(
               pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment) != HSA_STATUS_SUCCESS)
        << "Could not get pool segment";
    if(HSA_AMD_SEGMENT_GLOBAL != segment) return HSA_STATUS_SUCCESS;

    uint32_t flag = 0;
    LOG_IF(FATAL,
           api_ptr->hsa_amd_memory_pool_get_info_fn(
               pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag) != HSA_STATUS_SUCCESS)
        << "Could not get flag value";

    const bool is_kernarg_pool = (flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) != 0;
    if(is_kernarg_pool != kern_arg) return HSA_STATUS_SUCCESS;

    *pool_ptr = pool;
    return HSA_STATUS_INFO_BREAK;
}
// Callback for hsa_amd_agent_iterate_memory_pools(): selects a pool in the
// HSA_AMD_SEGMENT_GLOBAL segment whose
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT flag is NOT set.
// `data` is forwarded unchanged to FindGlobalPool().
hsa_status_t
FindStandardPool(hsa_amd_memory_pool_t pool, void* data)
{
    constexpr bool require_kernarg = false;
    return FindGlobalPool(pool, data, require_kernarg);
}
// Callback for hsa_amd_agent_iterate_memory_pools(): selects a pool in the
// HSA_AMD_SEGMENT_GLOBAL segment whose
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT flag IS set.
// `data` is forwarded unchanged to FindGlobalPool().
hsa_status_t
FindKernArgPool(hsa_amd_memory_pool_t pool, void* data)
{
    constexpr bool require_kernarg = true;
    return FindGlobalPool(pool, data, require_kernarg);
}
// Fills cpu_agent.cpu_pool and cpu_agent.kernarg_pool by iterating over the
// memory pools of a non-GPU agent, first with FindStandardPool and then with
// FindKernArgPool. Aborts via LOG(FATAL) when an iteration ends with a status
// other than HSA_STATUS_SUCCESS or HSA_STATUS_INFO_BREAK.
void
init_cpu_pool(const AmdExtTable& api, AgentInfo& cpu_agent)
{
    CHECK(!cpu_agent.isGpu());

    // An iteration is acceptable when it ran to completion or was stopped
    // early by the callback.
    const auto iteration_failed = [](const auto result) {
        return result != HSA_STATUS_SUCCESS && result != HSA_STATUS_INFO_BREAK;
    };

    // The callbacks receive the API table and the slot to write the pool into.
    auto search = std::make_pair(&api, &cpu_agent.cpu_pool);

    const auto standard_result = api.hsa_amd_agent_iterate_memory_pools_fn(
        cpu_agent.getAgent(), FindStandardPool, &search);
    LOG_IF(FATAL, iteration_failed(standard_result))
        << "Error: Command Buffer Pool is not initialized";

    // Re-target the same parameter pair at the kernarg slot.
    search.second = &cpu_agent.kernarg_pool;

    const auto kernarg_result = api.hsa_amd_agent_iterate_memory_pools_fn(
        cpu_agent.getAgent(), FindKernArgPool, &search);
    LOG_IF(FATAL, iteration_failed(kernarg_result))
        << "Error: Output Buffer Pool is not initialized";
}
void
init_gpu_pool(const AmdExtTable& api, AgentInfo& agent_info)
{
CHECK(agent_info.isGpu());
auto params = std::make_pair(&api, &agent_info.gpu_pool);
auto status =
api.hsa_amd_agent_iterate_memory_pools_fn(agent_info.getAgent(), FindStandardPool, &params);
LOG_IF(FATAL, status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
<< "Error: GPU Pool is not initialized";
}
} // namespace
const std::vector<AgentInfo>&
all_agents()
{
static std::shared_ptr<const std::vector<AgentInfo>> agents = AgentInfo::getAgents(
{.hsa_iterate_agents_fn = hsa_iterate_agents, .hsa_agent_get_info_fn = hsa_agent_get_info},
{.hsa_amd_memory_pool_get_info_fn = hsa_amd_memory_pool_get_info,
.hsa_amd_agent_iterate_memory_pools_fn = hsa_amd_agent_iterate_memory_pools,
.hsa_amd_memory_pool_allocate_fn = hsa_amd_memory_pool_allocate,
.hsa_amd_memory_pool_free_fn = hsa_amd_memory_pool_free,
.hsa_amd_agents_allow_access_fn = hsa_amd_agents_allow_access});
return *agents;
}
std::shared_ptr<const std::vector<AgentInfo>>
AgentInfo::getAgents(const CoreApiTable& api, const AmdExtTable& ext_api)
{
std::vector<hsa_agent_t> agents;
std::shared_ptr<std::vector<AgentInfo>> agent_info_ptr =
std::make_shared<std::vector<AgentInfo>>();
auto& agent_info = *agent_info_ptr;
api.hsa_iterate_agents_fn(
[](hsa_agent_t agent, void* data) {
CHECK_NOTNULL(static_cast<std::vector<hsa_agent_t>*>(data))->emplace_back(agent);
return HSA_STATUS_SUCCESS;
},
&agents);
auto near_gpu_map = get_gpu_nodes_near_cpu();
std::unordered_map<int64_t, AgentInfo*> cpu_id_to_agent;
// Reserve is required to prevent reallocation (which breaks cpu_id_to_agent)
agent_info.reserve(agents.size());
for(auto& agent : agents)
{
auto& new_agent = agent_info.emplace_back(agent, api);
if(!new_agent.isGpu())
{
uint32_t cpu_numa_node_id;
LOG_IF(FATAL,
api.hsa_agent_get_info_fn(agent, HSA_AGENT_INFO_NODE, &cpu_numa_node_id) !=
HSA_STATUS_SUCCESS)
<< "Could not fetch numa info";
new_agent.setNumaNode(cpu_numa_node_id);
cpu_id_to_agent[cpu_numa_node_id] = &new_agent;
init_cpu_pool(ext_api, new_agent);
}
else if(new_agent.isGpu())
{
uint32_t node_id;
LOG_IF(FATAL,
api.hsa_agent_get_info_fn(
agent,
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_DRIVER_NODE_ID),
&node_id) != HSA_STATUS_SUCCESS)
<< "Could not fetch driver node id";
new_agent.setIndex(node_id);
LOG_IF(FATAL,
api.hsa_agent_get_info_fn(agent,
static_cast<hsa_agent_info_t>(HSA_AGENT_INFO_NODE),
&node_id) != HSA_STATUS_SUCCESS)
<< "Could not fetch driver node id";
new_agent.setNumaNode(node_id);
init_gpu_pool(ext_api, new_agent);
}
}
// Sperate for loop to allow cpu_id_to_agent to populate (in case CPUs are not always the first
// NUMA nodes)
for(auto& agent : agent_info)
{
if(agent.isGpu())
{
auto* near_gpu = common::get_val(near_gpu_map, agent.getNumaNode());
LOG_IF(FATAL, !near_gpu) << fmt::format("No CPU Agent near GPU Agent: {} {}", agent);
auto* id_to_agent = common::get_val(cpu_id_to_agent, *near_gpu);
LOG_IF(FATAL, !id_to_agent) << fmt::format("Cannot convert id to agent: {}", *near_gpu);
agent.setNearCpuAgent((*id_to_agent)->getAgent());
agent.cpu_pool = (*id_to_agent)->cpu_pool;
agent.kernarg_pool = (*id_to_agent)->kernarg_pool;
}
}
return agent_info_ptr;
}
// Constructs the description of `agent` by querying it through `table`.
//
// Every agent gets its device type, name and gfxip label (the name without
// its last two characters, truncated to fit gfxip_). GPU agents additionally
// get their wavefront/queue limits, compute-unit layout and PCI location;
// non-GPU agents return before those queries.
//
// Aborts via LOG(FATAL) when the device type, the shader-array / waves-per-CU
// counts, or the PCI information cannot be read. The remaining queries are
// best effort: on failure the corresponding member keeps its previous value.
AgentInfo::AgentInfo(const hsa_agent_t agent, const ::CoreApiTable& table)
: handle_(agent.handle)
, agent_(agent)
{
    if(table.hsa_agent_get_info_fn(agent, HSA_AGENT_INFO_DEVICE, &type_) != HSA_STATUS_SUCCESS)
    {
        LOG(FATAL) << "hsa_agent_get_info failed";
    }

    table.hsa_agent_get_info_fn(agent, HSA_AGENT_INFO_NAME, name_);

    // Guard the subtraction: for names shorter than two characters an unsigned
    // `len - 2` would wrap around and copy bytes past the end of the string.
    const size_t name_len        = strlen(name_);
    const size_t gfxip_label_len = std::min(name_len > 2 ? name_len - 2 : size_t{0},
                                            sizeof(gfxip_) - 1);
    memcpy(gfxip_, name_, gfxip_label_len);
    gfxip_[gfxip_label_len] = '\0';

    if(type_ != HSA_DEVICE_TYPE_GPU)
    {
        return;
    }

    table.hsa_agent_get_info_fn(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &max_wave_size_);
    table.hsa_agent_get_info_fn(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &max_queue_size_);
    table.hsa_agent_get_info_fn(
        agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), &cu_num_);
    table.hsa_agent_get_info_fn(
        agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), &simds_per_cu_);
    table.hsa_agent_get_info_fn(
        agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &se_num_);

    if(table.hsa_agent_get_info_fn(
           agent,
           static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE),
           &shader_arrays_per_se_) != HSA_STATUS_SUCCESS ||
       table.hsa_agent_get_info_fn(
           agent,
           static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU),
           &waves_per_cu_) != HSA_STATUS_SUCCESS)
    {
        LOG(FATAL) << "hsa_agent_get_info for gfxip hardware configuration failed";
    }

    // The divisors come from queries whose status is not checked above, so a
    // zero is possible; report 0 instead of dividing by zero.
    const auto shader_array_count = se_num_ * shader_arrays_per_se_;
    compute_units_per_sh_         = (shader_array_count != 0) ? cu_num_ / shader_array_count : 0;
    wave_slots_per_simd_          = (simds_per_cu_ != 0) ? waves_per_cu_ / simds_per_cu_ : 0;

    if(table.hsa_agent_get_info_fn(agent,
                                   static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_DOMAIN),
                                   &pci_domain_) != HSA_STATUS_SUCCESS ||
       table.hsa_agent_get_info_fn(agent,
                                   static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_BDFID),
                                   &pci_location_id_) != HSA_STATUS_SUCCESS)
    {
        LOG(FATAL) << "hsa_agent_get_info for PCI info failed";
    }
}
// Returns the agent index held in index_ (assigned through setIndex()).
uint64_t
AgentInfo::getIndex() const
{
return index_;
}
// Returns the device type held in type_; the constructor reads it with
// HSA_AGENT_INFO_DEVICE.
hsa_device_type_t
AgentInfo::getType() const
{
return type_;
}
// Returns the value held in handle_; the constructor initializes it from the
// `handle` field of the hsa_agent_t it was given.
uint64_t
AgentInfo::getHandle() const
{
return handle_;
}
// Returns a view of the agent name stored in the name_ character array.
// The view refers to this object's storage and does not copy the characters.
const std::string_view
AgentInfo::getName() const
{
return name_;
}
// Returns a copy of the gfxip label stored in gfxip_; the constructor derives
// it from the leading characters of the agent name.
std::string
AgentInfo::getGfxip() const
{
return std::string(gfxip_);
}
// Returns max_wave_size_; for GPU agents the constructor reads it with
// HSA_AGENT_INFO_WAVEFRONT_SIZE.
uint32_t
AgentInfo::getMaxWaveSize() const
{
return max_wave_size_;
}
// Returns max_queue_size_; for GPU agents the constructor reads it with
// HSA_AGENT_INFO_QUEUE_MAX_SIZE.
uint32_t
AgentInfo::getMaxQueueSize() const
{
return max_queue_size_;
}
// Returns cu_num_; for GPU agents the constructor reads it with
// HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT.
uint32_t
AgentInfo::getCUCount() const
{
return cu_num_;
}
// Returns simds_per_cu_; for GPU agents the constructor reads it with
// HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU.
uint32_t
AgentInfo::getSimdCountPerCU() const
{
return simds_per_cu_;
}
// Returns se_num_; for GPU agents the constructor reads it with
// HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES.
uint32_t
AgentInfo::getShaderEngineCount() const
{
return se_num_;
}
// Returns shader_arrays_per_se_; for GPU agents the constructor reads it with
// HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE.
uint32_t
AgentInfo::getShaderArraysPerSE() const
{
return shader_arrays_per_se_;
}
// Returns waves_per_cu_; for GPU agents the constructor reads it with
// HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU.
uint32_t
AgentInfo::getMaxWavesPerCU() const
{
return waves_per_cu_;
}
// Returns compute_units_per_sh_, which the constructor computes for GPU agents
// as cu_num_ / (se_num_ * shader_arrays_per_se_).
uint32_t
AgentInfo::getCUCountPerSH() const
{
return compute_units_per_sh_;
}
// Returns wave_slots_per_simd_, which the constructor computes for GPU agents
// as waves_per_cu_ / simds_per_cu_.
uint32_t
AgentInfo::getWaveSlotsPerSimd() const
{
return wave_slots_per_simd_;
}
// Returns pci_domain_; for GPU agents the constructor reads it with
// HSA_AMD_AGENT_INFO_DOMAIN.
uint32_t
AgentInfo::getPCIDomain() const
{
return pci_domain_;
}
// Returns pci_location_id_; for GPU agents the constructor reads it with
// HSA_AMD_AGENT_INFO_BDFID.
uint32_t
AgentInfo::getPCILocationID() const
{
return pci_location_id_;
}
// Returns xcc_num_.
// NOTE(review): nothing visible in this file assigns xcc_num_; presumably it
// keeps the default from its declaration - confirm against agent.hpp.
uint32_t
AgentInfo::getXccCount() const
{
return xcc_num_;
}
// Stores `index` in index_. getAgents() calls this for GPU agents with the
// value read from HSA_AMD_AGENT_INFO_DRIVER_NODE_ID.
void
AgentInfo::setIndex(uint64_t index)
{
index_ = index;
}
// Overwrites the device type held in type_ with `type`.
void
AgentInfo::setType(hsa_device_type_t type)
{
type_ = type;
}
// Overwrites handle_ with `handle`. Only handle_ is changed; the agent_ member
// set by the constructor is left as it was.
void
AgentInfo::setHandle(uint64_t handle)
{
handle_ = handle;
}
// Copies `name` into the fixed-size name_ buffer, truncating it to at most
// sizeof(name_) - 1 characters. The stored name is always null-terminated.
void
AgentInfo::setName(const std::string& name)
{
    constexpr auto name_len = sizeof(name_) / sizeof(char);
    //
    // char* strncpy(char* destination, const char* source, size_t num)
    //
    // If the end of the source string (which is signaled by a null-character) is found before num
    // characters have been copied, destination is padded with zeros until a total of num characters
    // have been written to it.
    //
    // Copy name_len - 1 characters so that every byte before the terminator is rewritten; copying
    // fewer would leave a stale byte from the previous name in the second-to-last slot.
    strncpy(name_, name.c_str(), name_len - 1);
    // ensure always terminated (strncpy does not terminate when the source is too long)
    name_[name_len - 1] = '\0';
}
// Stores `numa_node` in numa_node_. getAgents() calls this with the value read
// from HSA_AGENT_INFO_NODE.
void
AgentInfo::setNumaNode(uint32_t numa_node)
{
numa_node_ = numa_node;
}
// Returns the node id held in numa_node_ (assigned through setNumaNode()).
uint32_t
AgentInfo::getNumaNode() const
{
return numa_node_;
}
// Stores `near_cpu_agent` in near_cpu_agent_. getAgents() calls this for GPU
// agents with the CPU agent it resolved as nearby.
void
AgentInfo::setNearCpuAgent(hsa_agent_t near_cpu_agent)
{
near_cpu_agent_ = near_cpu_agent;
}
// Returns the agent held in near_cpu_agent_ (assigned through
// setNearCpuAgent()). Unlike the other getters in this file, this one is not
// const-qualified.
hsa_agent_t
AgentInfo::getNearCpuAgent()
{
return near_cpu_agent_;
}
} // namespace hsa
} // namespace rocprofiler