Merge "Remove now unused hsa_rsrc_factory" into amd-staging

[ROCm/roctracer commit: fe0adfd37b]
This commit is contained in:
Laurent Morichetti
2022-05-10 14:54:01 -04:00
zatwierdzone przez Gerrit Code Review
5 zmienionych plików z 11 dodań i 1337 usunięć
@@ -40,7 +40,6 @@ execute_process ( COMMAND sh -xc "ln -s ${ROOT_DIR}/../rocprofiler/src/core/acti
# Library target name: reuses the project-wide TARGET_NAME.
set ( TARGET_LIB ${TARGET_NAME} )
# Source files compiled into the library target.
set ( LIB_SRC
${LIB_DIR}/core/roctracer.cpp
${LIB_DIR}/util/hsa_rsrc_factory.cpp
)
# Create the library; LIBRARY_TYPE is forwarded to add_library as the library kind.
add_library ( ${TARGET_LIB} ${LIBRARY_TYPE} ${LIB_SRC} )
# Include paths used only when compiling this target (PRIVATE, not propagated to consumers).
target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${ROCM_INC_PATH} ${HIP_INC_DIR} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} )
@@ -48,6 +48,16 @@
// Function attribute shorthands: a function tagged CONSTRUCTOR_API carries the
// GCC/Clang `constructor` attribute, and one tagged DESTRUCTOR_API carries the
// `destructor` attribute.
#define CONSTRUCTOR_API __attribute__((constructor))
#define DESTRUCTOR_API __attribute__((destructor))
// Aborts the process when an HSA call did not succeed.
//
// msg    - stream-insertable description logged in front of the error text.
// status - expression yielding an hsa_status_t. It is evaluated exactly once,
//          so passing a call expression does not repeat the call.
//
// On failure the textual form of the status is looked up with
// hsa_status_string(); "<unknown error>" is logged when no string is provided.
#define CHECK_STATUS(msg, status)                                                      \
  do {                                                                                 \
    const hsa_status_t check_status_value_ = (status);                                 \
    if (check_status_value_ != HSA_STATUS_SUCCESS) {                                   \
      const char* status_string = nullptr;                                             \
      hsa_status_string(check_status_value_, &status_string);                          \
      ERR_LOGGING(msg << ": " << (status_string ? status_string : "<unknown error>")); \
      abort();                                                                         \
    }                                                                                  \
  } while (false)
#define HIPAPI_CALL(call) \
do { \
hipError_t err = call; \
+1 -9
Wyświetl plik
@@ -28,7 +28,6 @@
#include <atomic>
#include "util/hsa_rsrc_factory.h"
#include "util/exception.h"
#include "util/logger.h"
@@ -114,14 +113,7 @@ class Tracker {
entry->begin = async_copy_time.start * sysclock_period;
entry->end = async_copy_time.end * sysclock_period;
} else {
hsa_amd_profiling_dispatch_time_t dispatch_time{};
hsa_status_t status =
hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time);
if (status != HSA_STATUS_SUCCESS)
EXC_RAISING(ROCTRACER_STATUS_ERROR, "hsa_amd_profiling_get_dispatch_time failed");
entry->begin = dispatch_time.start * sysclock_period;
entry->end = dispatch_time.end * sysclock_period;
entry->dev_index = ::util::HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index;
assert(false && "should not reach here");
}
hsa_signal_t orig = entry->orig;
@@ -1,807 +0,0 @@
/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "util/hsa_rsrc_factory.h"

#include <dlfcn.h>
#include <fcntl.h>
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <hsa_ext_finalize.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <atomic>
#include <cassert>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
namespace util {
// Agent-iteration callback: registers one agent with the factory passed in 'data'.
// @param agent Agent handed to the callback by the iteration
// @param data Opaque pointer, expected to be the HsaRsrcFactory instance
// @return HSA_STATUS_SUCCESS if AddAgentInfo() produced a record for the agent,
// HSA_STATUS_ERROR otherwise
hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) {
  HsaRsrcFactory* factory = reinterpret_cast<HsaRsrcFactory*>(data);
  if (factory->AddAgentInfo(agent) == NULL) {
    return HSA_STATUS_ERROR;
  }
  return HSA_STATUS_SUCCESS;
}
// Memory-pool filter shared by FindStandardPool() and FindKernArgPool().
//
// A pool matches when it belongs to HSA_AMD_SEGMENT_GLOBAL and the presence of
// its HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT flag equals 'kern_arg'
// (flag required when kern_arg is true, flag forbidden when it is false).
//
// @param pool Pool being inspected
// @param data Output location; receives the matching hsa_amd_memory_pool_t
// @param kern_arg Whether a kernarg-capable pool is wanted
// @return HSA_STATUS_INFO_BREAK when the pool matches (and was stored in 'data'),
// HSA_STATUS_SUCCESS when it does not match,
// HSA_STATUS_ERROR_INVALID_ARGUMENT when 'data' is null.
// A failing pool query is handled by CHECK_STATUS.
static hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg) {
  if (data == nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  // Only pools in the global segment are candidates.
  hsa_amd_segment_t pool_segment;
  hsa_status_t query_status = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(
      pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &pool_segment);
  CHECK_STATUS("hsa_amd_memory_pool_get_info", query_status);
  if (pool_segment != HSA_AMD_SEGMENT_GLOBAL) return HSA_STATUS_SUCCESS;

  uint32_t global_flags;
  query_status = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(
      pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flags);
  CHECK_STATUS("hsa_amd_memory_pool_get_info", query_status);

  // The kernarg property of the pool has to agree with what the caller asked for.
  const bool is_kernarg_pool = (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) != 0;
  if (is_kernarg_pool != kern_arg) return HSA_STATUS_SUCCESS;

  *(reinterpret_cast<hsa_amd_memory_pool_t*>(data)) = pool;
  return HSA_STATUS_INFO_BREAK;
}
// Callback for hsa_amd_agent_iterate_memory_pools(): selects a pool in
// HSA_AMD_SEGMENT_GLOBAL that does NOT carry
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT.
hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) {
  const bool want_kernarg = false;
  return FindGlobalPool(pool, data, want_kernarg);
}
// Callback for hsa_amd_agent_iterate_memory_pools(): selects a pool in
// HSA_AMD_SEGMENT_GLOBAL that DOES carry
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT.
hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) {
  const bool want_kernarg = true;
  return FindGlobalPool(pool, data, want_kernarg);
}
// Constructs the factory.
// @param initialize_hsa When true the constructor calls hsa_init() itself
// (and the destructor calls hsa_shut_down()).
// Every failing step is reported through CHECK_STATUS.
HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) {
  // The shared pool pointers are filled in during agent discovery below.
  cpu_pool_ = nullptr;
  kern_arg_pool_ = nullptr;

  // No intercept table is available here, so pass none.
  InitHsaApiTable(nullptr);

  hsa_status_t status;

  // Bring up the HSA runtime if this object is responsible for it
  if (initialize_hsa_) {
    status = hsa_api_.hsa_init();
    CHECK_STATUS("Error in hsa_init", status);
  }

  // Enumerate the agents; the callback records each one and its memory pools
  status = hsa_api_.hsa_iterate_agents(GetHsaAgentsCallback, this);
  CHECK_STATUS("Error Calling hsa_iterate_agents", status);

  // Both a regular CPU pool and a kernarg pool must have been found
  if (cpu_pool_ == nullptr) {
    CHECK_STATUS("CPU memory pool is not found", HSA_STATUS_ERROR);
  }
  if (kern_arg_pool_ == nullptr) {
    CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR);
  }

  // AqlProfile API table: loaded from the library or queried from the runtime
  aqlprofile_api_ = {0};
#ifdef ROCP_LD_AQLPROFILE
  status = LoadAqlProfileLib(&aqlprofile_api_);
#else
  status = hsa_api_.hsa_system_get_major_extension_table(
      HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_),
      &aqlprofile_api_);
#endif
  CHECK_STATUS("aqlprofile API table load failed", status);

  // Loader API table
  loader_api_ = {0};
  status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1,
                                                         sizeof(loader_api_), &loader_api_);
  CHECK_STATUS("loader API table query failed", status);

  // HSA timer
  timer_ = new HsaTimer(&hsa_api_);
  const hsa_status_t timer_status = (timer_ != nullptr) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
  CHECK_STATUS("HSA timer allocation failed", timer_status);

  // Correlate both supported clocks against the timer
  const uint32_t corr_iters = 1000;
  CorrelateTime(HsaTimer::TIME_ID_CLOCK_REALTIME, corr_iters);
  CorrelateTime(HsaTimer::TIME_ID_CLOCK_MONOTONIC, corr_iters);

  // Wait timeout: TIMESTAMP_MAX is kept as is, anything else is converted
  // from nanoseconds with ns_to_sysclock()
  if (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) {
    timeout_ = timeout_ns_;
  } else {
    timeout_ = timer_->ns_to_sysclock(timeout_ns_);
  }
}
// Destroys the factory: releases the timer and every agent record, then shuts
// the HSA runtime down if this object initialized it.
HsaRsrcFactory::~HsaRsrcFactory() {
  delete timer_;

  for (auto cpu_info : cpu_list_) {
    delete cpu_info;
  }
  for (auto gpu_info : gpu_list_) {
    delete gpu_info;
  }

  if (!initialize_hsa_) return;
  const hsa_status_t shutdown_status = hsa_api_.hsa_shut_down();
  CHECK_STATUS("Error in hsa_shut_down", shutdown_status);
}
// Populates the static hsa_api_ function table once.
// @param table Runtime API table to copy the entry points from; when null the
// directly linked HSA functions are used instead.
// The call is a no-op if hsa_api_.hsa_init has already been set. Runs under mutex_.
void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
  std::lock_guard<mutex_t> lck(mutex_);

  // Already initialized by an earlier call
  if (hsa_api_.hsa_init != NULL) return;

  if (table != NULL) {
    // Core API entries taken from the provided table
    hsa_api_.hsa_init = table->core_->hsa_init_fn;
    hsa_api_.hsa_shut_down = table->core_->hsa_shut_down_fn;
    hsa_api_.hsa_agent_get_info = table->core_->hsa_agent_get_info_fn;
    hsa_api_.hsa_iterate_agents = table->core_->hsa_iterate_agents_fn;

    hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn;
    hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn;
    hsa_api_.hsa_queue_load_write_index_relaxed =
        table->core_->hsa_queue_load_write_index_relaxed_fn;
    hsa_api_.hsa_queue_store_write_index_relaxed =
        table->core_->hsa_queue_store_write_index_relaxed_fn;
    hsa_api_.hsa_queue_load_read_index_relaxed =
        table->core_->hsa_queue_load_read_index_relaxed_fn;

    hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn;
    hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn;
    hsa_api_.hsa_signal_load_relaxed = table->core_->hsa_signal_load_relaxed_fn;
    hsa_api_.hsa_signal_store_relaxed = table->core_->hsa_signal_store_relaxed_fn;
    hsa_api_.hsa_signal_wait_scacquire = table->core_->hsa_signal_wait_scacquire_fn;
    hsa_api_.hsa_signal_store_screlease = table->core_->hsa_signal_store_screlease_fn;

    hsa_api_.hsa_code_object_reader_create_from_file =
        table->core_->hsa_code_object_reader_create_from_file_fn;
    hsa_api_.hsa_executable_create_alt = table->core_->hsa_executable_create_alt_fn;
    hsa_api_.hsa_executable_load_agent_code_object =
        table->core_->hsa_executable_load_agent_code_object_fn;
    hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn;
    hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn;
    hsa_api_.hsa_executable_symbol_get_info = table->core_->hsa_executable_symbol_get_info_fn;
    hsa_api_.hsa_executable_iterate_symbols = table->core_->hsa_executable_iterate_symbols_fn;

    hsa_api_.hsa_system_get_info = table->core_->hsa_system_get_info_fn;
    hsa_api_.hsa_system_get_major_extension_table =
        table->core_->hsa_system_get_major_extension_table_fn;

    // AMD extension entries taken from the provided table
    hsa_api_.hsa_amd_agent_iterate_memory_pools =
        table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn;
    hsa_api_.hsa_amd_memory_pool_get_info = table->amd_ext_->hsa_amd_memory_pool_get_info_fn;
    hsa_api_.hsa_amd_memory_pool_allocate = table->amd_ext_->hsa_amd_memory_pool_allocate_fn;
    hsa_api_.hsa_amd_agents_allow_access = table->amd_ext_->hsa_amd_agents_allow_access_fn;
    hsa_api_.hsa_amd_memory_async_copy = table->amd_ext_->hsa_amd_memory_async_copy_fn;
    hsa_api_.hsa_amd_memory_async_copy_rect = table->amd_ext_->hsa_amd_memory_async_copy_rect_fn;

    hsa_api_.hsa_amd_signal_async_handler = table->amd_ext_->hsa_amd_signal_async_handler_fn;
    hsa_api_.hsa_amd_profiling_set_profiler_enabled =
        table->amd_ext_->hsa_amd_profiling_set_profiler_enabled_fn;
    hsa_api_.hsa_amd_profiling_get_async_copy_time =
        table->amd_ext_->hsa_amd_profiling_get_async_copy_time_fn;
    hsa_api_.hsa_amd_profiling_get_dispatch_time =
        table->amd_ext_->hsa_amd_profiling_get_dispatch_time_fn;
    return;
  }

  // No table given: bind to the directly linked HSA functions
  hsa_api_.hsa_init = hsa_init;
  hsa_api_.hsa_shut_down = hsa_shut_down;
  hsa_api_.hsa_agent_get_info = hsa_agent_get_info;
  hsa_api_.hsa_iterate_agents = hsa_iterate_agents;

  hsa_api_.hsa_queue_create = hsa_queue_create;
  hsa_api_.hsa_queue_destroy = hsa_queue_destroy;
  hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed;
  hsa_api_.hsa_queue_store_write_index_relaxed = hsa_queue_store_write_index_relaxed;
  hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed;

  hsa_api_.hsa_signal_create = hsa_signal_create;
  hsa_api_.hsa_signal_destroy = hsa_signal_destroy;
  hsa_api_.hsa_signal_load_relaxed = hsa_signal_load_relaxed;
  hsa_api_.hsa_signal_store_relaxed = hsa_signal_store_relaxed;
  hsa_api_.hsa_signal_wait_scacquire = hsa_signal_wait_scacquire;
  hsa_api_.hsa_signal_store_screlease = hsa_signal_store_screlease;

  hsa_api_.hsa_code_object_reader_create_from_file = hsa_code_object_reader_create_from_file;
  hsa_api_.hsa_executable_create_alt = hsa_executable_create_alt;
  hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object;
  hsa_api_.hsa_executable_freeze = hsa_executable_freeze;
  hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol;
  hsa_api_.hsa_executable_symbol_get_info = hsa_executable_symbol_get_info;
  hsa_api_.hsa_executable_iterate_symbols = hsa_executable_iterate_symbols;

  hsa_api_.hsa_system_get_info = hsa_system_get_info;
  hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table;

  hsa_api_.hsa_amd_agent_iterate_memory_pools = hsa_amd_agent_iterate_memory_pools;
  hsa_api_.hsa_amd_memory_pool_get_info = hsa_amd_memory_pool_get_info;
  hsa_api_.hsa_amd_memory_pool_allocate = hsa_amd_memory_pool_allocate;
  hsa_api_.hsa_amd_agents_allow_access = hsa_amd_agents_allow_access;
  hsa_api_.hsa_amd_memory_async_copy = hsa_amd_memory_async_copy;
  hsa_api_.hsa_amd_memory_async_copy_rect = hsa_amd_memory_async_copy_rect;

  hsa_api_.hsa_amd_signal_async_handler = hsa_amd_signal_async_handler;
  hsa_api_.hsa_amd_profiling_set_profiler_enabled = hsa_amd_profiling_set_profiler_enabled;
  hsa_api_.hsa_amd_profiling_get_async_copy_time = hsa_amd_profiling_get_async_copy_time;
  hsa_api_.hsa_amd_profiling_get_dispatch_time = hsa_amd_profiling_get_dispatch_time;
}
// Loads the AqlProfile library named by kAqlProfileLib with dlopen() and fills
// 'api' with the entry points resolved through dlsym().
// @param api Function table to populate
// @return HSA_STATUS_ERROR if the library cannot be opened, HSA_STATUS_SUCCESS otherwise
// NOTE(review): individual dlsym() results are stored without being checked, so an
// entry is left null when its symbol is missing; the library handle is never closed here.
hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) {
  void* lib = dlopen(kAqlProfileLib, RTLD_NOW);
  if (lib == NULL) {
    fprintf(stderr, "Loading '%s' failed, %s\n", kAqlProfileLib, dlerror());
    return HSA_STATUS_ERROR;
  }
  dlerror(); /* Clear any existing error */

  api->hsa_ven_amd_aqlprofile_error_string =
      reinterpret_cast<decltype(::hsa_ven_amd_aqlprofile_error_string)*>(
          dlsym(lib, "hsa_ven_amd_aqlprofile_error_string"));
  api->hsa_ven_amd_aqlprofile_validate_event =
      reinterpret_cast<decltype(::hsa_ven_amd_aqlprofile_validate_event)*>(
          dlsym(lib, "hsa_ven_amd_aqlprofile_validate_event"));
  api->hsa_ven_amd_aqlprofile_start = reinterpret_cast<decltype(::hsa_ven_amd_aqlprofile_start)*>(
      dlsym(lib, "hsa_ven_amd_aqlprofile_start"));
  api->hsa_ven_amd_aqlprofile_stop = reinterpret_cast<decltype(::hsa_ven_amd_aqlprofile_stop)*>(
      dlsym(lib, "hsa_ven_amd_aqlprofile_stop"));
#ifdef AQLPROF_NEW_API
  api->hsa_ven_amd_aqlprofile_read = reinterpret_cast<decltype(::hsa_ven_amd_aqlprofile_read)*>(
      dlsym(lib, "hsa_ven_amd_aqlprofile_read"));
#endif
  api->hsa_ven_amd_aqlprofile_legacy_get_pm4 =
      reinterpret_cast<decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*>(
          dlsym(lib, "hsa_ven_amd_aqlprofile_legacy_get_pm4"));
  api->hsa_ven_amd_aqlprofile_get_info =
      reinterpret_cast<decltype(::hsa_ven_amd_aqlprofile_get_info)*>(
          dlsym(lib, "hsa_ven_amd_aqlprofile_get_info"));
  api->hsa_ven_amd_aqlprofile_iterate_data =
      reinterpret_cast<decltype(::hsa_ven_amd_aqlprofile_iterate_data)*>(
          dlsym(lib, "hsa_ven_amd_aqlprofile_iterate_data"));

  return HSA_STATUS_SUCCESS;
}
// Creates and registers the AgentInfo record for one HSA agent.
//
// CPU agents get their standard and kernarg memory pools looked up; the first
// pools found across all CPU agents become the factory-wide cpu_pool_ and
// kern_arg_pool_. GPU agents get their name, gfxip label, hardware properties
// and GPU memory pool recorded.
//
// @param agent Agent to register
// @return The new record, or NULL when the agent is neither a CPU nor a GPU
const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) {
  hsa_status_t status;
  AgentInfo* agent_info = NULL;

  // Determine the device type of the agent
  hsa_device_type_t type;
  status = hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
  CHECK_STATUS("Error Calling hsa_agent_get_info", status);

  if (type == HSA_DEVICE_TYPE_CPU) {
    agent_info = new AgentInfo{};
    agent_info->dev_id = agent;
    agent_info->dev_type = HSA_DEVICE_TYPE_CPU;
    agent_info->dev_index = cpu_list_.size();

    // HSA_STATUS_INFO_BREAK from the iteration means a matching pool was stored
    status =
        hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool);
    if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == NULL)) cpu_pool_ = &agent_info->cpu_pool;
    status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool,
                                                         &agent_info->kern_arg_pool);
    if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL))
      kern_arg_pool_ = &agent_info->kern_arg_pool;
    agent_info->gpu_pool = {};

    cpu_list_.push_back(agent_info);
    cpu_agents_.push_back(agent);
  }

  if (type == HSA_DEVICE_TYPE_GPU) {
    agent_info = new AgentInfo{};
    agent_info->dev_id = agent;
    agent_info->dev_type = HSA_DEVICE_TYPE_GPU;
    hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name);

    // The gfxip label is the agent name without its last two characters,
    // truncated to fit the gfxip buffer. Names shorter than two characters
    // yield an empty label; subtracting unconditionally would wrap around as
    // size_t and copy bytes from beyond the end of the name string.
    const size_t name_len = strlen(agent_info->name);
    const size_t trimmed_len = (name_len > 2) ? (name_len - 2) : 0;
    const size_t gfxip_label_len = std::min(trimmed_len, sizeof(agent_info->gfxip) - 1);
    memcpy(agent_info->gfxip, agent_info->name, gfxip_label_len);
    agent_info->gfxip[gfxip_label_len] = '\0';

    hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size);
    hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size);
    hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile);
    agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? true : false;
    hsa_api_.hsa_agent_get_info(
        agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT),
        &agent_info->cu_num);
    hsa_api_.hsa_agent_get_info(agent,
                                static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU),
                                &agent_info->waves_per_cu);
    hsa_api_.hsa_agent_get_info(agent,
                                static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU),
                                &agent_info->simds_per_cu);
    hsa_api_.hsa_agent_get_info(
        agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES),
        &agent_info->se_num);
    hsa_api_.hsa_agent_get_info(
        agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE),
        &agent_info->shader_arrays_per_se);

    agent_info->cpu_pool = {};
    agent_info->kern_arg_pool = {};
    status =
        hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool);
    CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status);

    // GFX8 and GFX9 SGPR/VGPR block sizes
    agent_info->sgpr_block_dflt = (strcmp(agent_info->gfxip, "gfx8") == 0) ? 1 : 2;
    agent_info->sgpr_block_size = 8;
    agent_info->vgpr_block_size = 4;

    // Set GPU index
    agent_info->dev_index = gpu_list_.size();
    gpu_list_.push_back(agent_info);
    gpu_agents_.push_back(agent);
  }

  // Make the record reachable by agent handle
  if (agent_info) agent_map_[agent.handle] = agent_info;

  return agent_info;
}
// Return system agent info
// @param agent Agent handle to look up
// @return The AgentInfo registered for the handle, NULL if the agent is unknown
const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) {
  auto found = agent_map_.find(agent.handle);
  if (found == agent_map_.end()) {
    return NULL;
  }
  return found->second;
}
// Number of Gpu agents recorded by the factory
//
// @return uint32_t Size of the Gpu agent list
//
uint32_t HsaRsrcFactory::GetCountOfGpuAgents() {
  const uint32_t gpu_count = uint32_t(gpu_list_.size());
  return gpu_count;
}
// Number of Cpu agents recorded by the factory
//
// @return uint32_t Size of the Cpu agent list
//
uint32_t HsaRsrcFactory::GetCountOfCpuAgents() {
  const uint32_t cpu_count = uint32_t(cpu_list_.size());
  return cpu_count;
}
// Look up the AgentInfo record of a Gpu device by index
//
// @param idx Position in the Gpu agent list
//
// @param agent_info Output parameter; written only when idx is valid
//
// @return bool true if idx is within the list, false otherwise
//
bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) {
  const uint32_t gpu_count = uint32_t(gpu_list_.size());
  const bool in_range = (idx < gpu_count);
  if (in_range) {
    *agent_info = gpu_list_[idx];
  }
  return in_range;
}
// Look up the AgentInfo record of a Cpu device by index
//
// @param idx Position in the Cpu agent list
//
// @param agent_info Output parameter; written only when idx is valid
//
// @return bool true if idx is within the list, false otherwise
//
bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) {
  const uint32_t cpu_count = uint32_t(cpu_list_.size());
  const bool in_range = (idx < cpu_count);
  if (in_range) {
    *agent_info = cpu_list_[idx];
  }
  return in_range;
}
// Create a Queue object and return its handle. The queue object is expected
// to support user requested number of Aql dispatch packets.
//
// @param agent_info Gpu Agent on which to create a queue object
//
// @param num_Pkts Number of packets to be held by queue
//
// @param queue Output parameter updated with handle of queue object
//
// @return bool true if successful, false otherwise
//
bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts,
hsa_queue_t** queue) {
hsa_status_t status;
status = hsa_api_.hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
UINT32_MAX, UINT32_MAX, queue);
return (status == HSA_STATUS_SUCCESS);
}
// Create a Signal object and return its handle.
// @param value Initial value of signal object
// @param signal Output parameter updated with handle of signal object
// @return bool true if successful, false otherwise
bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) {
hsa_status_t status;
status = hsa_api_.hsa_signal_create(value, 0, NULL, signal);
return (status == HSA_STATUS_SUCCESS);
}
// Allocate memory for use by a kernel of specified size in specified
// agent's memory region.
// @param agent_info Agent from whose memory region to allocate
// @param size Size of memory in terms of bytes
// @return uint8_t* Pointer to buffer, null if allocation fails.
uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) {
hsa_status_t status = HSA_STATUS_ERROR;
uint8_t* buffer = NULL;
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
status = hsa_api_.hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0,
reinterpret_cast<void**>(&buffer));
uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
return ptr;
}
// Allocate memory to pass kernel parameters.
// The buffer comes from the factory-wide kernarg pool and access is then granted
// to the agent given by the AgentInfo parameter. Nothing is allocated when no CPU
// agent is known. The size is rounded up using MEM_PAGE_MASK.
// NOTE(review): when granting access fails the function returns null without
// releasing the buffer that was just allocated.
// @param agent_info Agent that must be able to access the buffer
// @param size Size of memory in terms of bytes
// @return uint8_t* Pointer to buffer, null if allocation fails.
uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size_t size) {
  if (cpu_agents_.empty()) return NULL;

  const size_t rounded_size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
  uint8_t* buffer = NULL;
  hsa_status_t status = hsa_api_.hsa_amd_memory_pool_allocate(*kern_arg_pool_, rounded_size, 0,
                                                              reinterpret_cast<void**>(&buffer));
  if (status != HSA_STATUS_SUCCESS) return NULL;

  // Both the CPU and GPU can access the kernel arguments
  hsa_agent_t ag_list[1] = {agent_info->dev_id};
  status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer);
  if (status != HSA_STATUS_SUCCESS) return NULL;

  return buffer;
}
// Allocate system memory accessible by both CPU and GPU
// The buffer comes from the factory-wide CPU pool and access is then granted to
// the agent given by the AgentInfo parameter. Nothing is allocated when no CPU
// agent is known. The size is rounded up using MEM_PAGE_MASK.
// NOTE(review): when granting access fails the function returns null without
// releasing the buffer that was just allocated.
// @param agent_info Agent that must be able to access the buffer
// @param size Size of memory in terms of bytes
// @return uint8_t* Pointer to buffer, null if allocation fails.
uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) {
  const size_t rounded_size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
  if (cpu_agents_.empty()) return NULL;

  uint8_t* buffer = NULL;
  hsa_status_t status = hsa_api_.hsa_amd_memory_pool_allocate(*cpu_pool_, rounded_size, 0,
                                                              reinterpret_cast<void**>(&buffer));
  if (status != HSA_STATUS_SUCCESS) return NULL;

  // Both the CPU and GPU can access the memory
  hsa_agent_t ag_list[1] = {agent_info->dev_id};
  status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer);
  if (status != HSA_STATUS_SUCCESS) return NULL;

  return buffer;
}
// Allocate memory for command buffer.
// On APU agents with CMD_MEMORY_MMAP enabled the buffer is an anonymous shared
// mapping created with mmap(); otherwise it is obtained from AllocateSysMemory().
// The size is rounded up using MEM_PAGE_MASK.
// @param agent_info Agent from whose memory region to allocate
// @param size Size of memory in terms of bytes
// @return uint8_t* Pointer to buffer, null if allocation fails.
uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t size) {
  size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;

  if (!(agent_info->is_apu && CMD_MEMORY_MMAP)) {
    return AllocateSysMemory(agent_info, size);
  }

  // Anonymous mappings take no backing file, so the descriptor argument is -1.
  void* mapping =
      mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
  // mmap() reports failure with MAP_FAILED rather than a null pointer; translate
  // it so callers can rely on the documented "null on failure" contract.
  if (mapping == MAP_FAILED) {
    return nullptr;
  }
  return reinterpret_cast<uint8_t*>(mapping);
}
// Wait signal
// Blocks until the signal drops from 'signal_value' to 'signal_value - 1'.
// A wait that returns with the signal still at 'signal_value' is retried; any
// other observed value is reported on stderr and aborts the process.
// @param signal Signal to wait on
// @param signal_value Value the signal is expected to hold before completion
// @return The observed signal value, i.e. 'signal_value - 1'
hsa_signal_value_t HsaRsrcFactory::SignalWait(const hsa_signal_t& signal,
                                              const hsa_signal_value_t& signal_value) const {
  const hsa_signal_value_t completed_value = signal_value - 1;

  for (;;) {
    const hsa_signal_value_t observed = hsa_api_.hsa_signal_wait_scacquire(
        signal, HSA_SIGNAL_CONDITION_LT, signal_value, timeout_, HSA_WAIT_STATE_BLOCKED);
    if (observed == completed_value) {
      return observed;
    }
    if (observed == signal_value) {
      // Signal has not changed yet, wait again
      continue;
    }
    std::cerr << "Error: HsaRsrcFactory::SignalWait: signal_value(" << signal_value
              << "), ret_value(" << observed << ")" << std::endl
              << std::flush;
    abort();
  }
}
// Wait signal with signal value restore
void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal,
const hsa_signal_value_t& signal_value) const {
SignalWait(signal, signal_value);
hsa_api_.hsa_signal_store_relaxed(const_cast<hsa_signal_t&>(signal), signal_value);
}
// Copy data from GPU to host memory
bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size) {
hsa_status_t status = HSA_STATUS_ERROR;
if (!cpu_agents_.empty()) {
hsa_signal_t s = {};
status = hsa_api_.hsa_signal_create(1, 0, NULL, &s);
CHECK_STATUS("hsa_signal_create()", status);
status = hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s);
CHECK_STATUS("hsa_amd_memory_async_copy()", status);
SignalWait(s, 1);
status = hsa_api_.hsa_signal_destroy(s);
CHECK_STATUS("hsa_signal_destroy()", status);
}
return (status == HSA_STATUS_SUCCESS);
}
bool HsaRsrcFactory::Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size) {
return Memcpy(agent_info->dev_id, dst, src, size);
}
// Memory free method
bool HsaRsrcFactory::FreeMemory(void* ptr) {
const hsa_status_t status = hsa_memory_free(ptr);
CHECK_STATUS("hsa_memory_free", status);
return (status == HSA_STATUS_SUCCESS);
}
// Loads an Assembled Brig file and Finalizes it into Device Isa
// @param agent_info Gpu device for which to finalize
// @param brig_path File path of the Assembled Brig file
// @param kernel_name Name of the kernel to finalize
// @param code_desc Handle of finalized Code Descriptor that could
// be used to submit for execution
// @return bool true if successful, false otherwise
bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
const char* kernel_name, hsa_executable_t* executable,
hsa_executable_symbol_t* code_desc) {
hsa_status_t status = HSA_STATUS_ERROR;
// Build the code object filename
std::string filename(brig_path);
std::clog << "Code object filename: " << filename << std::endl;
// Open the file containing code object
hsa_file_t file_handle = open(filename.c_str(), O_RDONLY);
if (file_handle == -1) {
std::cerr << "Error: failed to load '" << filename << "'" << std::endl;
assert(false);
return false;
}
// Create code object reader
hsa_code_object_reader_t code_obj_rdr = {0};
status = hsa_api_.hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr);
if (status != HSA_STATUS_SUCCESS) {
std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl;
return false;
}
// Create executable.
status = hsa_api_.hsa_executable_create_alt(
HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable);
CHECK_STATUS("Error in creating executable object", status);
// Load code object.
status = hsa_api_.hsa_executable_load_agent_code_object(*executable, agent_info->dev_id,
code_obj_rdr, NULL, NULL);
CHECK_STATUS("Error in loading executable object", status);
// Freeze executable.
status = hsa_api_.hsa_executable_freeze(*executable, "");
CHECK_STATUS("Error in freezing executable object", status);
// Get symbol handle.
hsa_executable_symbol_t kernelSymbol;
status = hsa_api_.hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0,
&kernelSymbol);
CHECK_STATUS("Error in looking up kernel symbol", status);
// Update output parameter
*code_desc = kernelSymbol;
return true;
}
// Print the various fields of Hsa Gpu Agents
bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) {
std::cout << std::flush;
std::clog << header << " :" << std::endl;
const AgentInfo* agent_info;
int size = uint32_t(gpu_list_.size());
for (int idx = 0; idx < size; idx++) {
agent_info = gpu_list_[idx];
std::clog << "> agent[" << idx << "] :" << std::endl;
std::clog << ">> Name : " << agent_info->name << std::endl;
std::clog << ">> APU : " << agent_info->is_apu << std::endl;
std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl;
std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl;
std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl;
std::clog << ">> CU number : " << agent_info->cu_num << std::endl;
std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl;
std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl;
std::clog << ">> SE number : " << agent_info->se_num << std::endl;
std::clog << ">> Shader Arrays per SE : " << agent_info->shader_arrays_per_se << std::endl;
}
return true;
}
// Writes one packet of CMD_SLOT_SIZE_B bytes into the queue and rings the doorbell.
// @param queue Queue to submit to
// @param packet Pointer to the packet data; CMD_SLOT_SIZE_B bytes are read from it
// @return The write index of the slot the packet was placed in
uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) {
const uint32_t slot_size_b = CMD_SLOT_SIZE_B;
// Advance the command queue: reserve the slot at the current write index
// NOTE(review): the index is read and stored in two separate relaxed operations —
// confirm callers serialize submissions to the same queue.
const uint64_t write_idx = hsa_api_.hsa_queue_load_write_index_relaxed(queue);
hsa_api_.hsa_queue_store_write_index_relaxed(queue, write_idx + 1);
// Yield until the reserved slot is less than queue->size entries ahead of the read index
while ((write_idx - hsa_api_.hsa_queue_load_read_index_relaxed(queue)) >= queue->size) {
sched_yield();
}
// Locate the slot inside the ring buffer
uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
uint32_t* queue_slot =
reinterpret_cast<uint32_t*>((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b));
const uint32_t* slot_data = reinterpret_cast<const uint32_t*>(packet);
// Copy buffered commands into the queue slot.
// Overwrite the AQL invalid header (first dword) last.
// This prevents the slot from being read until it's fully written.
memcpy(&queue_slot[1], &slot_data[1], slot_size_b - sizeof(uint32_t));
// Publish the header dword with release ordering, after the packet body was copied
std::atomic<uint32_t>* header_atomic_ptr =
reinterpret_cast<std::atomic<uint32_t>*>(&queue_slot[0]);
header_atomic_ptr->store(slot_data[0], std::memory_order_release);
// Ring the doorbell with the index of the packet just written
hsa_api_.hsa_signal_store_relaxed(queue->doorbell_signal, write_idx);
return write_idx;
}
// Submits a buffer holding several packets, one CMD_SLOT_SIZE_B-sized slot at a time.
// Aborts when 'size_bytes' fails the slot-size alignment check.
// @param queue Queue to submit to
// @param packet Start of the packet buffer
// @param size_bytes Size of the buffer in bytes
// @return Write index of the last submitted packet, 0 when the buffer is empty
uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes) {
  const uint32_t slot_size_b = CMD_SLOT_SIZE_B;
  const bool misaligned = (size_bytes & (slot_size_b - 1)) != 0;
  if (misaligned) {
    fprintf(stderr, "HsaRsrcFactory::Submit: Bad packet size %zx\n", size_bytes);
    abort();
  }

  const char* const bytes = reinterpret_cast<const char*>(packet);
  uint64_t last_write_idx = 0;
  for (size_t offset = 0; offset < size_bytes; offset += slot_size_b) {
    last_write_idx = Submit(queue, bytes + offset);
  }
  return last_write_idx;
}
// Returns a copy of the kernel name recorded for a kernel object address.
// The copy is made with strdup(), so the caller owns the returned string.
// Aborts when the address is unknown, including the case where no symbol map
// has been created yet (the map is only allocated by
// hsa_executable_freeze_interceptor()). Runs under mutex_.
// @param addr Kernel object address to look up
// @return Newly allocated copy of the kernel name
const char* HsaRsrcFactory::GetKernelName(uint64_t addr) {
  std::lock_guard<mutex_t> lck(mutex_);

  const char* kernel_name = NULL;
  // Guard against a lookup before any executable was tracked: the map pointer
  // is still null then and must not be dereferenced.
  if (symbols_map_ != NULL) {
    const auto it = symbols_map_->find(addr);
    if (it != symbols_map_->end()) kernel_name = it->second;
  }

  if (kernel_name == NULL) {
    fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr);
    abort();
  }
  return strdup(kernel_name);
}
// Turns executable tracking on: replaces the hsa_executable_freeze entry of the
// given API table with hsa_executable_freeze_interceptor and sets the tracking
// flag. Runs under mutex_.
// @param table API table whose freeze entry is replaced
void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) {
  std::lock_guard<mutex_t> lck(mutex_);
  table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor;
  executable_tracking_on_ = true;
}
// Symbol-iteration callback: records the name of every kernel symbol in
// symbols_map_, keyed by the kernel object address. A name already stored for
// the same address is freed and replaced. Non-kernel symbols are ignored.
// Failing symbol queries go through CHECK_STATUS.
// @param exec Executable being iterated (unused)
// @param symbol Symbol to inspect
// @param data User data of the iteration (unused)
// @return Always HSA_STATUS_SUCCESS
hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec,
                                                   hsa_executable_symbol_t symbol, void* data) {
  hsa_symbol_kind_t kind = (hsa_symbol_kind_t)0;
  hsa_status_t status =
      hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &kind);
  CHECK_STATUS("Error in getting symbol info", status);
  if (kind != HSA_SYMBOL_KIND_KERNEL) return HSA_STATUS_SUCCESS;

  uint64_t kernel_object = 0;
  status = hsa_api_.hsa_executable_symbol_get_info(
      symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_object);
  CHECK_STATUS("Error in getting kernel object", status);

  uint32_t name_len = 0;
  status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH,
                                                   &name_len);
  CHECK_STATUS("Error in getting name len", status);

  // One extra byte for the terminator written below
  char* kernel_name = new char[name_len + 1];
  status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME,
                                                   kernel_name);
  CHECK_STATUS("Error in getting kernel name", status);
  kernel_name[name_len] = 0;

  auto inserted = symbols_map_->insert({kernel_object, kernel_name});
  if (!inserted.second) {
    // Address already known: drop the old name and keep the new one
    delete[] inserted.first->second;
    inserted.first->second = kernel_name;
  }
  return HSA_STATUS_SUCCESS;
}
// Replacement for hsa_executable_freeze installed by EnableExecutableTracking().
// Creates symbols_map_ on first use, records the symbols of the executable via
// executable_symbols_cb and then forwards to the real hsa_executable_freeze.
// Runs under mutex_.
// @param executable Executable being frozen
// @param options Options forwarded unchanged to hsa_executable_freeze
// @return Status of the forwarded hsa_executable_freeze call
hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable,
                                                               const char* options) {
  std::lock_guard<mutex_t> lck(mutex_);

  if (symbols_map_ == NULL) {
    symbols_map_ = new symbols_map_t;
  }

  const hsa_status_t iterate_status =
      hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL);
  CHECK_STATUS("Error in iterating executable symbols", iterate_status);

  return hsa_api_.hsa_executable_freeze(executable, options);
}
void HsaRsrcFactory::DumpHandles(FILE* file) {
auto beg = agent_map_.begin();
auto end = agent_map_.end();
for (auto it = beg; it != end; ++it) {
const AgentInfo* agent_info = it->second;
fprintf(file, "0x%lx agent %s\n", agent_info->dev_id.handle,
(agent_info->dev_type == HSA_DEVICE_TYPE_CPU) ? "cpu" : "gpu");
if (agent_info->cpu_pool.handle != 0)
fprintf(file, "0x%lx pool cpu\n", agent_info->cpu_pool.handle);
if (agent_info->kern_arg_pool.handle != 0)
fprintf(file, "0x%lx pool cpu kernarg\n", agent_info->kern_arg_pool.handle);
if (agent_info->gpu_pool.handle != 0)
fprintf(file, "0x%lx pool gpu%u\n", agent_info->gpu_pool.handle, agent_info->dev_index);
}
fflush(file);
}
// Definitions of HsaRsrcFactory's static data members.
// Singleton pointer; value-initialized, i.e. no instance exists at start-up.
std::atomic<HsaRsrcFactory*> HsaRsrcFactory::instance_{};
// Mutex taken by the static entry points of the class.
HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_;
// Timeout in ns; starts at the largest representable timestamp.
HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX;
// HSA runtime API table; all entries start out null.
hsa_pfn_t HsaRsrcFactory::hsa_api_{};
// Set to true by EnableExecutableTracking().
bool HsaRsrcFactory::executable_tracking_on_ = false;
// Kernel address -> name map; created on first use by hsa_executable_freeze_interceptor().
HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = NULL;
} // namespace util
@@ -1,520 +0,0 @@
/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef SRC_UTIL_HSA_RSRC_FACTORY_H_
#define SRC_UTIL_HSA_RSRC_FACTORY_H_
#include <hsa.h>
#include <hsa_api_trace.h>
#include <hsa_ext_amd.h>
#include <hsa_ext_finalize.h>
#include <hsa_ven_amd_aqlprofile.h>
#include <hsa_ven_amd_loader.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <atomic>
#include <iostream>
#include <mutex>
#include <map>
#include <string>
#include <vector>
// Alignment constants. Per their names these are byte alignments for kernel arguments,
// queues and packets respectively; their users are not visible in this header.
#define HSA_ARGUMENT_ALIGN_BYTES 16
#define HSA_QUEUE_ALIGN_BYTES 64
#define HSA_PACKET_ALIGN_BYTES 64
// Aborts the process when 'status' is not HSA_STATUS_SUCCESS, after printing
// "<msg>: <HSA status string>" (or "<unknown error>" when no string is available).
// The message goes to stderr: abort() is not required to flush buffered streams, so a
// message written to a buffered stdout could be lost. This also matches the
// fprintf(stderr, ...) + abort() pattern HsaRsrcFactory uses for its own fatal errors.
// 'status' is evaluated exactly once, so passing a call expression is safe.
// @param msg C string describing the failed operation
// @param status hsa_status_t value to check
#define CHECK_STATUS(msg, status)                                                \
  do {                                                                           \
    const hsa_status_t check_status_value_ = (status);                           \
    if (check_status_value_ != HSA_STATUS_SUCCESS) {                             \
      const char* emsg = 0;                                                      \
      hsa_status_string(check_status_value_, &emsg);                             \
      fprintf(stderr, "%s: %s\n", msg, emsg ? emsg : "<unknown error>");         \
      abort();                                                                   \
    }                                                                            \
  } while (0)
// Aborts the process when 'status' is not HSA_STATUS_INFO_BREAK, after printing
// "<msg>: <HSA status string>" (or "<unknown error>" when no string is available).
// The message goes to stderr: abort() is not required to flush buffered streams, so a
// message written to a buffered stdout could be lost. This also matches the
// fprintf(stderr, ...) + abort() pattern HsaRsrcFactory uses for its own fatal errors.
// 'status' is evaluated exactly once, so passing a call expression is safe.
// @param msg C string describing the failed iteration
// @param status hsa_status_t value to check
#define CHECK_ITER_STATUS(msg, status)                                           \
  do {                                                                           \
    const hsa_status_t check_iter_status_value_ = (status);                      \
    if (check_iter_status_value_ != HSA_STATUS_INFO_BREAK) {                     \
      const char* emsg = 0;                                                      \
      hsa_status_string(check_iter_status_value_, &emsg);                        \
      fprintf(stderr, "%s: %s\n", msg, emsg ? emsg : "<unknown error>");         \
      abort();                                                                   \
    }                                                                            \
  } while (0)
namespace util {
// Memory page size used by this utility, in bytes (0x1000 = 4096).
static const size_t MEM_PAGE_BYTES = 0x1000;
// Mask selecting the offset-within-page bits of an address or size.
static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1;
// Integer type of the 'handle' member of hsa_agent_t.
typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t;
// Table of pointers to the HSA runtime entry points used by this utility.
// Each member is named after an HSA function and has exactly that function's type
// (taken with decltype), so a call through the table has the same signature as a
// direct call. HsaRsrcFactory keeps one static instance of this table (hsa_api_);
// how the table is filled in is not visible in this header.
struct hsa_pfn_t {
// Runtime init/shutdown, agent discovery
decltype(hsa_init)* hsa_init;
decltype(hsa_shut_down)* hsa_shut_down;
decltype(hsa_agent_get_info)* hsa_agent_get_info;
decltype(hsa_iterate_agents)* hsa_iterate_agents;
// Queues
decltype(hsa_queue_create)* hsa_queue_create;
decltype(hsa_queue_destroy)* hsa_queue_destroy;
decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed;
decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed;
decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed;
// Signals
decltype(hsa_signal_create)* hsa_signal_create;
decltype(hsa_signal_destroy)* hsa_signal_destroy;
decltype(hsa_signal_load_relaxed)* hsa_signal_load_relaxed;
decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed;
decltype(hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire;
decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease;
// Code objects and executables
decltype(hsa_code_object_reader_create_from_file)* hsa_code_object_reader_create_from_file;
decltype(hsa_executable_create_alt)* hsa_executable_create_alt;
decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object;
decltype(hsa_executable_freeze)* hsa_executable_freeze;
decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol;
decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info;
decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols;
// System queries
decltype(hsa_system_get_info)* hsa_system_get_info;
decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table;
// AMD extension: memory pools and async copies
decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools;
decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info;
decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate;
decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access;
decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy;
decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect;
// AMD extension: signal handlers and profiling
decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler;
decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled;
decltype(hsa_amd_profiling_get_async_copy_time)* hsa_amd_profiling_get_async_copy_time;
decltype(hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time;
};
// Encapsulates information about an HSA agent such as its
// handle, name, max queue size, max wavefront size, etc.
struct AgentInfo {
// Handle of the agent
hsa_agent_t dev_id;
// Agent type - Cpu = 0, Gpu = 1 or Dsp = 2
uint32_t dev_type;
// APU flag
bool is_apu;
// Agent system index
uint32_t dev_index;
// GFXIP name
char gfxip[64];
// Name of the agent; the buffer holds at most 64 bytes
char name[64];
// Maximum wavefront size
uint32_t max_wave_size;
// Maximum size of the queue buffer
uint32_t max_queue_size;
// HSAIL profile supported by the agent
hsa_profile_t profile;
// CPU/GPU/kern-arg memory pools
hsa_amd_memory_pool_t cpu_pool;
hsa_amd_memory_pool_t gpu_pool;
hsa_amd_memory_pool_t kern_arg_pool;
// Number of compute units available in the agent
uint32_t cu_num;
// Maximum number of waves possible in a compute unit
uint32_t waves_per_cu;
// Number of SIMDs per compute unit
uint32_t simds_per_cu;
// Number of shader engines (SE) in the GPU
uint32_t se_num;
// Number of shader arrays per shader engine in the GPU
uint32_t shader_arrays_per_se;
// SGPR/VGPR block sizes
uint32_t sgpr_block_dflt;
uint32_t sgpr_block_size;
uint32_t vgpr_block_size;
};
// HSA timer class
// Provides the current HSA timestamp and an API to convert between HSA system-clock
// ticks and nanoseconds.
class HsaTimer {
 public:
  typedef uint64_t timestamp_t;
  static const timestamp_t TIMESTAMP_MAX = UINT64_MAX;
  typedef long double freq_t;

  // Host clocks an HSA timestamp can be correlated with; TIME_ID_NUMBER is the count.
  enum time_id_t { TIME_ID_CLOCK_REALTIME = 0, TIME_ID_CLOCK_MONOTONIC = 1, TIME_ID_NUMBER };

  // Queries the HSA timestamp frequency and derives the ticks -> ns factor from it.
  // The query result is checked with CHECK_STATUS.
  // @param hsa_api HSA API table used for every runtime query; only the pointer is kept
  HsaTimer(const hsa_pfn_t* hsa_api) : hsa_api_(hsa_api) {
    timestamp_t sysclock_hz = 0;
    hsa_status_t status =
        hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz);
    CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status);
    sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz;
  }

  // Methods for system-clock/ns conversion
  timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const {
    return timestamp_t((freq_t)sysclock * sysclock_factor_);
  }
  timestamp_t ns_to_sysclock(const timestamp_t& time) const {
    return timestamp_t((freq_t)time / sysclock_factor_);
  }

  // Method for timespec/ns conversion
  static timestamp_t timespec_to_ns(const timespec& time) {
    return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec;
  }

  // Return the current HSA timestamp in 'ns'
  timestamp_t timestamp_ns() const {
    timestamp_t sysclock;
    hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock);
    CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)", status);
    return sysclock_to_ns(sysclock);
  }

  // Return the time of the given clock in 'ns'
  static timestamp_t clocktime_ns(clockid_t clock_id) {
    timespec time;
    clock_gettime(clock_id, &time);
    return timespec_to_ns(time);
  }

  // Return a pair of correlated values of profiling timestamp and clock time, with the
  // correlation error, for a given time ID and number of iterations. All outputs are in ns.
  // @param time_id Clock to correlate with; any other value than the two clock IDs is
  //                reported through CHECK_STATUS
  // @param iters Number of (timestamp, clock) sample pairs to take; 0 is treated as 1
  // @param timestamp_v Output: averaged HSA timestamp, shifted by the error estimate
  // @param time_v Output: averaged clock time
  // @param error_v Output: error estimate, half of the average distance between two
  //                consecutive HSA samples (0 when only one sample is taken)
  void correlated_pair_ns(time_id_t time_id, uint32_t iters, timestamp_t* timestamp_v,
                          timestamp_t* time_v, timestamp_t* error_v) const {
    clockid_t clock_id = 0;
    switch (time_id) {
      case TIME_ID_CLOCK_REALTIME:
        clock_id = CLOCK_REALTIME;
        break;
      case TIME_ID_CLOCK_MONOTONIC:
        clock_id = CLOCK_MONOTONIC;
        break;
      default:
        CHECK_STATUS("internal error: invalid time_id", HSA_STATUS_ERROR);
    }

    // At least one sample is required: front()/back() on an empty vector is undefined.
    const uint32_t samples = (iters == 0) ? 1 : iters;
    std::vector<timestamp_t> ts_vec(samples);
    std::vector<timespec> tm_vec(samples);

    // Sampling loop is kept minimal so the two clocks are read as close together as
    // possible; unit conversion happens afterwards.
    for (uint32_t i = 0; i < samples; ++i) {
      hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &ts_vec[i]);
      clock_gettime(clock_id, &tm_vec[i]);
    }

    // Convert the HSA samples from system-clock ticks to ns up front, so that every
    // value combined below is in the same unit even when the timestamp frequency is
    // not 1 GHz.
    for (auto& ts : ts_vec) ts = sysclock_to_ns(ts);

    const timestamp_t ts_base = ts_vec.front();
    const timestamp_t tm_base = timespec_to_ns(tm_vec.front());

    // With a single sample there is no interval to measure, so the error is 0
    // (this also avoids dividing by zero).
    const timestamp_t steps = samples - 1;
    const timestamp_t error = (steps == 0) ? 0 : (ts_vec.back() - ts_base) / (2 * steps);

    // Average the offsets from the first sample rather than the absolute values.
    timestamp_t ts_accum = 0;
    timestamp_t tm_accum = 0;
    for (uint32_t i = 0; i < samples; ++i) {
      ts_accum += (ts_vec[i] - ts_base);
      tm_accum += (timespec_to_ns(tm_vec[i]) - tm_base);
    }

    *timestamp_v = (ts_accum / samples) + ts_base + error;
    *time_v = (tm_accum / samples) + tm_base;
    *error_v = error;
  }

 private:
  // Timestamp frequency factor: nanoseconds per system-clock tick
  freq_t sysclock_factor_;
  // HSA API table
  const hsa_pfn_t* const hsa_api_;
};
// Singleton that owns the HSA resources used by the tool: the lists of CPU/GPU agents,
// the HSA/AqlProfile/Loader API tables, a timer, and helpers for queues, signals,
// memory and executables. The instance is obtained with Create()/Instance() and released
// with Destroy(); constructor and destructor are private.
class HsaRsrcFactory {
public:
// Size of one command slot in bytes (0x40 = 64)
static const size_t CMD_SLOT_SIZE_B = 0x40;
typedef std::recursive_mutex mutex_t;
typedef HsaTimer::timestamp_t timestamp_t;
// Return the singleton, constructing it under mutex_ if it does not exist yet.
// @param initialize_hsa Forwarded to the constructor when a new instance is built
// @return HsaRsrcFactory* The singleton instance
static HsaRsrcFactory* Create(bool initialize_hsa = true) {
std::lock_guard<mutex_t> lck(mutex_);
HsaRsrcFactory* obj = instance_.load(std::memory_order_relaxed);
if (obj == NULL) {
obj = new HsaRsrcFactory(initialize_hsa);
instance_.store(obj, std::memory_order_release);
}
return obj;
}
// Return the singleton, creating it with initialize_hsa = false when it does not
// exist yet. A NULL instance is reported through CHECK_STATUS.
static HsaRsrcFactory& Instance() {
HsaRsrcFactory* obj = instance_.load(std::memory_order_acquire);
if (obj == NULL) obj = Create(false);
hsa_status_t status = (obj != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
CHECK_STATUS("HsaRsrcFactory::Instance() failed", status);
return *obj;
}
// Delete the singleton, if any, and reset the instance pointer to NULL (under mutex_)
static void Destroy() {
std::lock_guard<mutex_t> lck(mutex_);
if (instance_) delete instance_.load();
instance_ = NULL;
}
// Return system agent info
const AgentInfo* GetAgentInfo(const hsa_agent_t agent);
// Get the count of Hsa Gpu Agents available on the platform
// @return uint32_t Number of Gpu agents on platform
uint32_t GetCountOfGpuAgents();
// Get the count of Hsa Cpu Agents available on the platform
// @return uint32_t Number of Cpu agents on platform
uint32_t GetCountOfCpuAgents();
// Get the AgentInfo handle of a Gpu device
// @param idx Gpu Agent at specified index
// @param agent_info Output parameter updated with AgentInfo
// @return bool true if successful, false otherwise
bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info);
// Get the AgentInfo handle of a Cpu device
// @param idx Cpu Agent at specified index
// @param agent_info Output parameter updated with AgentInfo
// @return bool true if successful, false otherwise
bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info);
// Create a Queue object and return its handle. The queue object is expected
// to support user requested number of Aql dispatch packets.
// @param agent_info Gpu Agent on which to create a queue object
// @param num_pkts Number of packets to be held by queue
// @param queue Output parameter updated with handle of queue object
// @return bool true if successful, false otherwise
bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue);
// Create a Signal object and return its handle.
// @param value Initial value of signal object
// @param signal Output parameter updated with handle of signal object
// @return bool true if successful, false otherwise
bool CreateSignal(uint32_t value, hsa_signal_t* signal);
// Allocate local GPU memory
// @param agent_info Agent from whose memory region to allocate
// @param size Size of memory in terms of bytes
// @return uint8_t* Pointer to buffer, null if allocation fails.
uint8_t* AllocateLocalMemory(const AgentInfo* agent_info, size_t size);
// Allocate memory to pass kernel parameters
// Memory is allocated accessible for all CPU agents and for GPU given by AgentInfo parameter.
// @param agent_info Agent from whose memory region to allocate
// @param size Size of memory in terms of bytes
// @return uint8_t* Pointer to buffer, null if allocation fails.
uint8_t* AllocateKernArgMemory(const AgentInfo* agent_info, size_t size);
// Allocate system memory accessible from both CPU and GPU
// Memory is allocated accessible to all CPU agents and AgentInfo parameter is ignored.
// @param agent_info Agent from whose memory region to allocate
// @param size Size of memory in terms of bytes
// @return uint8_t* Pointer to buffer, null if allocation fails.
uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size);
// Allocate memory for command buffer.
// @param agent_info Agent from whose memory region to allocate
// @param size Size of memory in terms of bytes
// @return uint8_t* Pointer to buffer, null if allocation fails.
uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size);
// Wait signal
hsa_signal_value_t SignalWait(const hsa_signal_t& signal,
const hsa_signal_value_t& signal_value) const;
// Wait signal with signal value restore
void SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const;
// Copy data from GPU to host memory
bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size);
bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size);
// Memory free method
static bool FreeMemory(void* ptr);
// Loads an Assembled Brig file and Finalizes it into Device Isa
// @param agent_info Gpu device for which to finalize
// @param brig_path File path of the Assembled Brig file
// @param kernel_name Name of the kernel to finalize
// @param hsa_exec Output parameter for the executable handle
// @param code_desc Handle of finalized Code Descriptor that could
// be used to submit for execution
// @return true if successful, false otherwise
bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc);
// Print the various fields of Hsa Gpu Agents
bool PrintGpuAgents(const std::string& header);
// Submit AQL packet to given queue
// The overload taking size_bytes submits a buffer of several packets.
static uint64_t Submit(hsa_queue_t* queue, const void* packet);
static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes);
// Enable executables loading tracking
static bool IsExecutableTracking() { return executable_tracking_on_; }
static void EnableExecutableTracking(HsaApiTable* table);
// Return the name recorded for the kernel object address 'addr'
static const char* GetKernelName(uint64_t addr);
// Initialize HSA API table
void static InitHsaApiTable(HsaApiTable* table);
// Return the HSA runtime API table
static const hsa_pfn_t* HsaApi() { return &hsa_api_; }
// Return AqlProfile API table
typedef hsa_ven_amd_aqlprofile_pfn_t aqlprofile_pfn_t;
const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; }
// Return Loader API table
const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; }
// Methods for system-clock/ns conversion and timestamp in 'ns'
timestamp_t SysclockToNs(const timestamp_t& sysclock) const {
return timer_->sysclock_to_ns(sysclock);
}
timestamp_t NsToSysclock(const timestamp_t& time) const { return timer_->ns_to_sysclock(time); }
timestamp_t TimestampNs() const { return timer_->timestamp_ns(); }
// Return the timeout of this instance in system-clock units
timestamp_t GetSysTimeout() const { return timeout_; }
// Return the timeout in 'ns'
static timestamp_t GetTimeoutNs() { return timeout_ns_; }
// Set the timeout in 'ns'; when an instance exists, its system-clock timeout is
// updated as well (under mutex_)
static void SetTimeoutNs(const timestamp_t& time) {
std::lock_guard<mutex_t> lck(mutex_);
timeout_ns_ = time;
if (instance_ != NULL) Instance().timeout_ = Instance().timer_->ns_to_sysclock(time);
}
// Correlate the HSA timestamp with the clock given by time_id using 'iters' samples,
// and store the resulting shift (clock time minus timestamp) and error for that time ID
void CorrelateTime(HsaTimer::time_id_t time_id, uint32_t iters) {
timestamp_t timestamp_v = 0;
timestamp_t time_v = 0;
timestamp_t error_v = 0;
timer_->correlated_pair_ns(time_id, iters, &timestamp_v, &time_v, &error_v);
time_shift_[time_id] = time_v - timestamp_v;
time_error_[time_id] = error_v;
}
// Convert a timestamp to the time of the clock given by time_id by adding the stored shift
// @return hsa_status_t HSA_STATUS_ERROR if time_id is out of range, HSA_STATUS_SUCCESS otherwise
hsa_status_t GetTime(uint32_t time_id, timestamp_t value, uint64_t* time) {
if (time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR;
*time = value + time_shift_[time_id];
return HSA_STATUS_SUCCESS;
}
// Convert a time of the clock given by time_id back to a timestamp by subtracting the stored shift
// @return hsa_status_t HSA_STATUS_ERROR if time_id is out of range, HSA_STATUS_SUCCESS otherwise
hsa_status_t GetTimestamp(uint32_t time_id, uint64_t value, timestamp_t* timestamp) {
if (time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR;
*timestamp = value - time_shift_[time_id];
return HSA_STATUS_SUCCESS;
}
// Write the agent and memory-pool handles to the given file
void DumpHandles(FILE* output_file);
private:
// System agents iterating callback
static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data);
// Callback function to find and bind kernarg region of an agent
static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data);
// Load AQL profile HSA extension library directly
static hsa_status_t LoadAqlProfileLib(aqlprofile_pfn_t* api);
// Constructor of the class. Will initialize the Hsa Runtime and
// query the system topology to get the list of Cpu and Gpu devices
explicit HsaRsrcFactory(bool initialize_hsa);
// Destructor of the class
~HsaRsrcFactory();
// Add an instance of AgentInfo representing a Hsa Gpu agent
const AgentInfo* AddAgentInfo(const hsa_agent_t agent);
// To mmap command buffer memory
static const bool CMD_MEMORY_MMAP = false;
// HSA was initialized
const bool initialize_hsa_;
// Singleton instance pointer and the mutex used by the static entry points
static std::atomic<HsaRsrcFactory*> instance_;
static mutex_t mutex_;
// Used to maintain a list of Hsa Gpu Agent Info
std::vector<const AgentInfo*> gpu_list_;
std::vector<hsa_agent_t> gpu_agents_;
// Used to maintain a list of Hsa Cpu Agent Info
std::vector<const AgentInfo*> cpu_list_;
std::vector<hsa_agent_t> cpu_agents_;
// System agents map
std::map<hsa_agent_handle_t, const AgentInfo*> agent_map_;
// Executables loading tracking
// Map from kernel object address to kernel name
typedef std::map<uint64_t, const char*> symbols_map_t;
static symbols_map_t* symbols_map_;
static bool executable_tracking_on_;
static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable,
const char* options);
static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol,
void* data);
// HSA runtime API table
static hsa_pfn_t hsa_api_;
// AqlProfile API table
aqlprofile_pfn_t aqlprofile_api_;
// Loader API table
hsa_ven_amd_loader_1_00_pfn_t loader_api_;
// System timeout, ns
static timestamp_t timeout_ns_;
// System timeout, sysclock
timestamp_t timeout_;
// HSA timer
HsaTimer* timer_;
// Time shift array to support time conversion
timestamp_t time_shift_[HsaTimer::TIME_ID_NUMBER];
// Correlation error per time ID, as stored by CorrelateTime()
timestamp_t time_error_[HsaTimer::TIME_ID_NUMBER];
// CPU/kern-arg memory pools
hsa_amd_memory_pool_t* cpu_pool_;
hsa_amd_memory_pool_t* kern_arg_pool_;
};
} // namespace util
#endif // SRC_UTIL_HSA_RSRC_FACTORY_H_