From 9dec361cd45c3bd6663523fcb58dc64b51731499 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Sun, 29 Apr 2018 03:24:46 -0500 Subject: [PATCH] SQTT local allocation Change-Id: Ie4a150ad0dc141226f6f1c571916c5a526dd723c --- inc/rocprofiler.h | 1 + src/core/context.h | 17 ++- src/core/profile.h | 12 +- src/core/rocprofiler.cpp | 3 + src/util/hsa_rsrc_factory.cpp | 190 +++++++++++++++++++------------ src/util/hsa_rsrc_factory.h | 87 ++++++-------- sync_hsa_mgr.sh | 5 + test/ctrl/test_hsa.cpp | 12 +- test/ctrl/tool.cpp | 42 ++++--- test/input.xml | 2 +- test/util/hsa_rsrc_factory.cpp | 200 ++++++++++++++++++++++----------- test/util/hsa_rsrc_factory.h | 59 ++++++---- 12 files changed, 393 insertions(+), 237 deletions(-) create mode 100755 sync_hsa_mgr.sh diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index f9f0f371d0..acb05ea76a 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -85,6 +85,7 @@ uint32_t rocprofiler_version_minor(); typedef struct { uint32_t intercept_mode; uint32_t sqtt_size; + uint32_t sqtt_local; uint64_t timeout; uint32_t timestamp_on; } rocprofiler_settings_t; diff --git a/src/core/context.h b/src/core/context.h index 63833435bc..dbba323f68 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -426,11 +426,14 @@ class Context { rinfo->data.kind = ROCPROFILER_DATA_KIND_INT64; } else if (ainfo_type == HSA_VEN_AMD_AQLPROFILE_INFO_SQTT_DATA) { if (rinfo->data.result_bytes.copy) { + const bool sqtt_local = SqttProfile::IsLocal(); + util::HsaRsrcFactory* hsa_rsrc = &util::HsaRsrcFactory::Instance(); if (sample_id == 0) { const uint32_t output_buffer_size = profile->output_buffer.size; - util::HsaRsrcFactory* hsa_rsrc = &util::HsaRsrcFactory::Instance(); + const uint32_t output_buffer_size64 = profile->output_buffer.size / sizeof(uint64_t); const util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(profile->agent); - void* ptr = hsa_rsrc->AllocateSysMemory(agent_info, output_buffer_size); + void* ptr = (sqtt_local) ? 
hsa_rsrc->AllocateSysMemory(agent_info, output_buffer_size) : + calloc(output_buffer_size64, sizeof(uint64_t)); rinfo->data.result_bytes.size = output_buffer_size; rinfo->data.result_bytes.ptr = ptr; callback_data->ptr = reinterpret_cast(ptr); @@ -448,13 +451,19 @@ class Context { else EXC_RAISING(HSA_STATUS_ERROR, "SQTT data out of output buffer"); } - bool suc = util::HsaRsrcFactory::Memcpy(profile->agent, dest, src, size); + bool suc = true; + if (sqtt_local) { + suc = hsa_rsrc->Memcpy(profile->agent, dest, src, size); + } else { + memcpy(dest, src, size); + } if (suc) { *header = size; callback_data->ptr = dest + align_size(size, sizeof(uint32_t)); rinfo->data.result_bytes.instance_count = sample_id + 1; rinfo->data.kind = ROCPROFILER_DATA_KIND_BYTES; - } + } else + EXC_RAISING(HSA_STATUS_ERROR, "Agent Memcpy failed, dst(" << (void*)dest << ") src(" << (void*)src << ") size(" << size << ")"); } else { if (sample_id == 0) { rinfo->data.result_bytes.ptr = profile->output_buffer.ptr; diff --git a/src/core/profile.h b/src/core/profile.h index 01a89ad4d4..90981ee621 100644 --- a/src/core/profile.h +++ b/src/core/profile.h @@ -202,7 +202,7 @@ class PmcProfile : public Profile { hsa_status_t Allocate(util::HsaRsrcFactory* rsrc) { profile_.command_buffer.ptr = - rsrc->AllocateSysMemory(agent_info_, profile_.command_buffer.size); + rsrc->AllocateSysMemory(agent_info_, profile_.command_buffer.size); profile_.output_buffer.ptr = rsrc->AllocateSysMemory(agent_info_, profile_.output_buffer.size); return (profile_.command_buffer.ptr && profile_.output_buffer.ptr) ? 
HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; @@ -213,6 +213,8 @@ class SqttProfile : public Profile { public: static inline void SetSize(const uint32_t& size) { output_buffer_size_ = size; } static inline uint32_t GetSize() { return output_buffer_size_; } + static inline void SetLocal(const bool& b) { output_buffer_local_ = b; } + static inline bool IsLocal() { return output_buffer_local_; } SqttProfile(const util::AgentInfo* agent_info) : Profile(agent_info) { profile_.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_SQTT; @@ -227,16 +229,18 @@ class SqttProfile : public Profile { hsa_status_t Allocate(util::HsaRsrcFactory* rsrc) { profile_.command_buffer.ptr = - rsrc->AllocateSysMemory(agent_info_, profile_.command_buffer.size); + rsrc->AllocateSysMemory(agent_info_, profile_.command_buffer.size); profile_.output_buffer.size = output_buffer_size_; - profile_.output_buffer.ptr = - rsrc->AllocateLocalMemory(agent_info_, profile_.output_buffer.size); + profile_.output_buffer.ptr = (output_buffer_local_) ? + rsrc->AllocateLocalMemory(agent_info_, profile_.output_buffer.size) : + rsrc->AllocateSysMemory(agent_info_, profile_.output_buffer.size); return (profile_.command_buffer.ptr && profile_.output_buffer.ptr) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; } private: static uint32_t output_buffer_size_; + static bool output_buffer_local_; }; } // namespace rocprofiler diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 61ebcf57e6..9d43ee3284 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -127,6 +127,7 @@ bool LoadTool() { rocprofiler_settings_t settings{}; settings.intercept_mode = (intercept_mode) ? 1 : 0; settings.sqtt_size = SqttProfile::GetSize(); + settings.sqtt_local = SqttProfile::IsLocal() ? 1: 0; settings.timeout = Context::GetTimeout(); settings.timestamp_on = InterceptQueue::IsTrackerOn() ? 
1 : 0; @@ -135,6 +136,7 @@ bool LoadTool() { intercept_mode = (settings.intercept_mode != 0); SqttProfile::SetSize(settings.sqtt_size); + SqttProfile::SetLocal(settings.sqtt_local != 0); Context::SetTimeout(settings.timeout); InterceptQueue::SetTimeout(settings.timeout); InterceptQueue::TrackerOn(settings.timestamp_on != 0); @@ -186,6 +188,7 @@ const MetricsDict* GetMetrics(const hsa_agent_t& agent) { rocprofiler_properties_t rocprofiler_properties; uint64_t Context::timeout_ = UINT64_MAX; uint32_t SqttProfile::output_buffer_size_ = 0x2000000; // 32M +bool SqttProfile::output_buffer_local_ = true; Tracker::mutex_t Tracker::mutex_; util::Logger::mutex_t util::Logger::mutex_; util::Logger* util::Logger::instance_ = NULL; diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index fd3b30e300..6624142ad0 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -59,27 +59,59 @@ hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) return status; } -// Callback function to find and bind kernarg region of an agent -hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* data) { - hsa_region_global_flag_t flags; - hsa_region_segment_t segment_id; +// This function checks to see if the provided +// pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true, +// the function adds an additional requirement that the pool have the +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT property. If kern_arg is false, +// pools must NOT have this property. +// Upon finding a pool that meets these conditions, HSA_STATUS_INFO_BREAK is +// returned. HSA_STATUS_SUCCESS is returned if no errors were encountered, but +// no pool was found meeting the requirements. If an error is encountered, we +// return that error. 
+static hsa_status_t +FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg) { + hsa_status_t err; + hsa_amd_segment_t segment; + uint32_t flag; - hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id); - if (segment_id != HSA_REGION_SEGMENT_GLOBAL) { + if (nullptr == data) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, + &segment); + CHECK_STATUS("hsa_amd_memory_pool_get_info", err); + if (HSA_AMD_SEGMENT_GLOBAL != segment) { return HSA_STATUS_SUCCESS; } - AgentInfo* agent_info = (AgentInfo*)data; - hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags); - if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) { - agent_info->coarse_region = region; + err = hsa_amd_memory_pool_get_info(pool, + HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + CHECK_STATUS("hsa_amd_memory_pool_get_info", err); + + uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT; + + if ((karg_st == 0 && kern_arg) || + (karg_st != 0 && !kern_arg)) { + return HSA_STATUS_SUCCESS; } - if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) { - agent_info->kernarg_region = region; - } + *(reinterpret_cast(data)) = pool; + return HSA_STATUS_INFO_BREAK; +} - return HSA_STATUS_SUCCESS; +// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that +// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that is NOT +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT +hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) { + return FindGlobalPool(pool, data, false); +} + +// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that +// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that IS +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT +hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { + return FindGlobalPool(pool, data, true); } // Constructor of the class @@ -172,7 +204,15 @@ const 
AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { agent_info->dev_id = agent; agent_info->dev_type = HSA_DEVICE_TYPE_CPU; agent_info->dev_index = cpu_list_.size(); + + status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); + CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(cpu pool)", status); + status = hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); + CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(kern arg pool)", status); + agent_info->gpu_pool = {}; + cpu_list_.push_back(agent_info); + cpu_agents_.push_back(agent); } if (type == HSA_DEVICE_TYPE_GPU) { @@ -192,16 +232,15 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &agent_info->se_num); hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), &agent_info->shader_arrays_per_se); - // Initialize memory regions to zero - agent_info->kernarg_region.handle = 0; - agent_info->coarse_region.handle = 0; - - // Find and Bind Memory regions of the Gpu agent - hsa_agent_iterate_regions(agent, FindMemRegionsCallback, agent_info); + agent_info->cpu_pool = {}; + agent_info->kern_arg_pool = {}; + status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); + CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status); // Set GPU index agent_info->dev_index = gpu_list_.size(); gpu_list_.push_back(agent_info); + gpu_agents_.push_back(agent); } if (agent_info) agent_map_[agent.handle] = agent_info; @@ -292,13 +331,9 @@ bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, } // Create a Signal object and return its handle. 
-// // @param value Initial value of signal object -// // @param signal Output parameter updated with handle of signal object -// // @return bool true if successful, false otherwise -// bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) { hsa_status_t status; status = hsa_signal_create(value, 0, NULL, signal); @@ -306,65 +341,83 @@ bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) { } // Allocate memory for use by a kernel of specified size in specified -// agent's memory region. Currently supports Global segment whose Kernarg -// flag set. -// +// agent's memory region. // @param agent_info Agent from whose memory region to allocate -// // @param size Size of memory in terms of bytes -// // @return uint8_t* Pointer to buffer, null if allocation fails. -// uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) { - hsa_status_t status; + hsa_status_t status = HSA_STATUS_ERROR; uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - - if (agent_info->coarse_region.handle != 0) { - // Allocate in local memory if it is available - status = hsa_memory_allocate(agent_info->coarse_region, size, (void**)&buffer); - if (status == HSA_STATUS_SUCCESS) { - status = hsa_memory_assign_agent(buffer, agent_info->dev_id, HSA_ACCESS_PERMISSION_RW); - } - } else { - // Allocate in system memory if local memory is not available - status = hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&buffer); - } - - CHECK_STATUS("hsa_memory_allocate", status); - return (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + status = hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, (void**)&buffer); + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + return ptr; } -// Allocate memory tp pass kernel parameters. -// +// Allocate memory to pass kernel parameters. +// Memory is allocated accessible for all CPU agents and for GPU given by AgentInfo parameter.
// @param agent_info Agent from whose memory region to allocate -// // @param size Size of memory in terms of bytes -// // @return uint8_t* Pointer to buffer, null if allocation fails. -// -uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) { - hsa_status_t status; - size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - +uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; uint8_t* buffer = NULL; - status = hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&buffer); - CHECK_STATUS("hsa_memory_allocate", status); - return (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + if (!cpu_agents_.empty()) { + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + status = hsa_amd_memory_pool_allocate(cpu_list_[0]->kern_arg_pool, size, 0, (void**)&buffer); + // Both the CPU and GPU can access the kernel arguments + if (status == HSA_STATUS_SUCCESS) { + hsa_agent_t ag_list[1] = {agent_info->dev_id}; + status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + } + } + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + return ptr; } -// Memcopy method -bool HsaRsrcFactory::CopyToHost(void* dest_buff, const void* src_buff, uint32_t length) { - const hsa_status_t status = hsa_memory_copy(dest_buff, src_buff, length); - CHECK_STATUS("hsa_memory_copy", status); +// Allocate system memory accessible by both CPU and GPU +// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. 
+uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + uint8_t* buffer = NULL; + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + if (!cpu_agents_.empty()) { + status = hsa_amd_memory_pool_allocate(cpu_list_[0]->cpu_pool, size, 0, (void**)&buffer); + // Both the CPU and GPU can access the memory + if (status == HSA_STATUS_SUCCESS) { + hsa_agent_t ag_list[1] = {agent_info->dev_id}; + status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + } + } + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + return ptr; +} + +// Copy data from GPU to host memory; returns true only if the copy, the wait and the signal cleanup all succeed +bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + if (!cpu_agents_.empty()) { + hsa_signal_t s = {}; + status = hsa_signal_create(1, 0, NULL, &s); + if (status == HSA_STATUS_SUCCESS) { + status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); + if (status == HSA_STATUS_SUCCESS) { + if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) { + status = HSA_STATUS_ERROR; + } + } + if (hsa_signal_destroy(s) != HSA_STATUS_SUCCESS) status = HSA_STATUS_ERROR; // destroy must not mask an earlier copy/wait failure + } + } return (status == HSA_STATUS_SUCCESS); } -bool HsaRsrcFactory::Memcpy(hsa_agent_t agent, void* dest_buff, const void* src_buff, uint32_t length) { - (void)agent; - return CopyToHost(dest_buff, src_buff, length); +bool HsaRsrcFactory::Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size) { + return Memcpy(agent_info->dev_id, dst, src, size); } -// Free method +// Memory free method bool HsaRsrcFactory::FreeMemory(void* ptr) { const hsa_status_t status = hsa_memory_free(ptr); CHECK_STATUS("hsa_memory_free", status); @@ -372,18 +425,12 @@ bool HsaRsrcFactory::FreeMemory(void* ptr) { } // Loads an Assembled Brig file and Finalizes it into Device Isa -// // @param agent_info Gpu device for which to finalize -// // @param
brig_path File path of the Assembled Brig file -// // @param kernel_name Name of the kernel to finalize -// // @param code_desc Handle of finalized Code Descriptor that could // be used to submit for execution -// // @return bool true if successful, false otherwise -// bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name, hsa_executable_t* executable, hsa_executable_symbol_t* code_desc) { hsa_status_t status = HSA_STATUS_ERROR; @@ -448,7 +495,6 @@ bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl; std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl; std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl; - std::clog << ">> Kernarg Region Id : " << agent_info->coarse_region.handle << std::endl; std::clog << ">> CU number : " << agent_info->cu_num << std::endl; std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl; std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl; diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index c9ab5f3f09..c07798c22c 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -22,10 +22,11 @@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING POSSIBILITY OF SUCH DAMAGE. ********************************************************************/ -#ifndef SRC_UTIL_HSA_RSRC_FACTORY_H_ -#define SRC_UTIL_HSA_RSRC_FACTORY_H_ +#ifndef _HSA_RSRC_FACTORY_H_ +#define _HSA_RSRC_FACTORY_H_ #include +#include #include #include #include @@ -52,10 +53,18 @@ POSSIBILITY OF SUCH DAMAGE. exit(1); \ } +#define CHECK_ITER_STATUS(msg, status) \ + if (status != HSA_STATUS_INFO_BREAK) { \ + const char* emsg = 0; \ + hsa_status_string(status, &emsg); \ + printf("%s: %s\n", msg, emsg ? 
emsg : ""); \ + exit(1); \ + } + namespace rocprofiler { namespace util { -static const unsigned MEM_PAGE_BYTES = 0x1000; -static const unsigned MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; +static const size_t MEM_PAGE_BYTES = 0x1000; +static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t; // Encapsulates information about a Hsa Agent such as its @@ -88,11 +97,10 @@ struct AgentInfo { // Hsail profile supported by agent hsa_profile_t profile; - // Memory region supporting kernel parameters - hsa_region_t coarse_region; - - // Memory region supporting kernel arguments - hsa_region_t kernarg_region; + // CPU/GPU/kern-arg memory pools + hsa_amd_memory_pool_t cpu_pool; + hsa_amd_memory_pool_t gpu_pool; + hsa_amd_memory_pool_t kern_arg_pool; // The number of compute unit available in the agent. uint32_t cu_num; @@ -139,102 +147,73 @@ class HsaRsrcFactory { const AgentInfo* GetAgentInfo(const hsa_agent_t agent); // Get the count of Hsa Gpu Agents available on the platform - // // @return uint32_t Number of Gpu agents on platform - // uint32_t GetCountOfGpuAgents(); // Get the count of Hsa Cpu Agents available on the platform - // // @return uint32_t Number of Cpu agents on platform - // uint32_t GetCountOfCpuAgents(); // Get the AgentInfo handle of a Gpu device - // // @param idx Gpu Agent at specified index - // // @param agent_info Output parameter updated with AgentInfo - // // @return bool true if successful, false otherwise - // bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); // Get the AgentInfo handle of a Cpu device - // // @param idx Cpu Agent at specified index - // // @param agent_info Output parameter updated with AgentInfo - // // @return bool true if successful, false otherwise - // bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); // Create a Queue object and return its handle. The queue object is expected // to support user requested number of Aql dispatch packets. 
- // // @param agent_info Gpu Agent on which to create a queue object - // // @param num_Pkts Number of packets to be held by queue - // // @param queue Output parameter updated with handle of queue object - // // @return bool true if successful, false otherwise - // bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue); // Create a Signal object and return its handle. - // // @param value Initial value of signal object - // // @param signal Output parameter updated with handle of signal object - // // @return bool true if successful, false otherwise - // bool CreateSignal(uint32_t value, hsa_signal_t* signal); - // Allocate memory for use by a kernel of specified size in specified - // agent's memory region. Currently supports Global segment whose Kernarg - // flag set. - // + // Allocate local GPU memory // @param agent_info Agent from whose memory region to allocate - // // @param size Size of memory in terms of bytes - // // @return uint8_t* Pointer to buffer, null if allocation fails. - // uint8_t* AllocateLocalMemory(const AgentInfo* agent_info, size_t size); - // Allocate memory tp pass kernel parameters. - // + // Allocate memory to pass kernel parameters + // Memory is allocated accessible for all CPU agents and for GPU given by AgentInfo parameter. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. + uint8_t* AllocateKernArgMemory(const AgentInfo* agent_info, size_t size); + + // Allocate system memory accessible from both CPU and GPU + // Memory is allocated in the CPU memory pool and made accessible to the GPU given by AgentInfo parameter. // @param agent_info Agent from whose memory region to allocate - // // @param size Size of memory in terms of bytes - // // @return uint8_t* Pointer to buffer, null if allocation fails.
- // uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size); - // Memcopy method - static bool CopyToHost(void* dest_buff, const void* src_buff, uint32_t length); - static bool Memcpy(hsa_agent_t agent, void* dest_buff, const void* src_buff, uint32_t length); + // Copy data from GPU to host memory + bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size); + bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size); - // Free method + // Memory free method static bool FreeMemory(void* ptr); // Loads an Assembled Brig file and Finalizes it into Device Isa - // // @param agent_info Gpu device for which to finalize - // // @param brig_path File path of the Assembled Brig file - // // @param kernel_name Name of the kernel to finalize - // // @param code_desc Handle of finalized Code Descriptor that could // be used to submit for execution - // // @return true if successful, false otherwise - // bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name, hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc); @@ -279,9 +258,11 @@ class HsaRsrcFactory { // Used to maintain a list of Hsa Gpu Agent Info std::vector gpu_list_; + std::vector gpu_agents_; // Used to maintain a list of Hsa Cpu Agent Info std::vector cpu_list_; + std::vector cpu_agents_; // System agents map std::map agent_map_; @@ -296,4 +277,4 @@ class HsaRsrcFactory { } // namespace util } // namespace rocprofiler -#endif // SRC_UTIL_HSA_RSRC_FACTORY_H_ +#endif // _HSA_RSRC_FACTORY_H_ diff --git a/sync_hsa_mgr.sh b/sync_hsa_mgr.sh new file mode 100755 index 0000000000..3d6faa692e --- /dev/null +++ b/sync_hsa_mgr.sh @@ -0,0 +1,5 @@ +#!/bin/sh -x +BDIR=$(dirname "$0") +for name in hsa_rsrc_factory.h hsa_rsrc_factory.cpp ; do + grep -v namespace "$BDIR/src/util/$name" > "$BDIR/test/util/$name" +done diff --git a/test/ctrl/test_hsa.cpp b/test/ctrl/test_hsa.cpp index 076fe66ca7..2a7d1c7ede 100644 ---
a/test/ctrl/test_hsa.cpp +++ b/test/ctrl/test_hsa.cpp @@ -220,8 +220,15 @@ bool TestHsa::Run() { // Wait on the dispatch signal until the kernel is finished. // Update wait condition to HSA_WAIT_STATE_ACTIVE for Polling - hsa_signal_wait_acquire(hsa_signal_, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1, - HSA_WAIT_STATE_BLOCKED); + if (hsa_signal_wait_scacquire( + hsa_signal_, + HSA_SIGNAL_CONDITION_LT, + 1, + UINT64_MAX, + HSA_WAIT_STATE_BLOCKED) != 0) + { + TEST_ASSERT("signal_wait failed"); + } std::clog << "> DONE, que_idx=" << que_idx << std::endl; @@ -243,6 +250,7 @@ bool TestHsa::VerifyResults() { if (test_->IsOutputLocal()) { output = hsa_rsrc_->AllocateSysMemory(agent_info_, size); suc = hsa_rsrc_->Memcpy(agent_info_, output, test_->GetOutputPtr(), size); + if (!suc) std::clog << "> VerifyResults: Memcpy failed" << std::endl << std::flush; } else { output = test_->GetOutputPtr();; suc = true; diff --git a/test/ctrl/tool.cpp b/test/ctrl/tool.cpp index a83cb04918..cdac18daf9 100644 --- a/test/ctrl/tool.cpp +++ b/test/ctrl/tool.cpp @@ -93,6 +93,8 @@ static uint32_t CTX_OUTSTANDING_MAX = 0; static uint32_t CTX_OUTSTANDING_MON = 0; // to truncate kernel names uint32_t to_truncate_names = 0; +// local SQTT buffer +bool is_sqtt_local = true; static inline uint32_t GetPid() { return syscall(__NR_getpid); } static inline uint32_t GetTid() { return syscall(__NR_gettid); } @@ -263,17 +265,23 @@ hsa_status_t trace_data_cb(hsa_ven_amd_aqlprofile_info_type_t info_type, hsa_status_t status = HSA_STATUS_SUCCESS; trace_data_arg_t* arg = reinterpret_cast(data); if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_SQTT_DATA) { - const uint32_t data_size = info_data->sqtt_data.size; const void* data_ptr = info_data->sqtt_data.ptr; + const uint32_t data_size = info_data->sqtt_data.size; fprintf(arg->file, " SE(%u) size(%u)\n", info_data->sample_id, data_size); - HsaRsrcFactory* hsa_rsrc = &HsaRsrcFactory::Instance(); - const AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(arg->agent); - 
void* buffer = hsa_rsrc->AllocateSysMemory(agent_info, data_size); - const bool suc = HsaRsrcFactory::Memcpy(arg->agent, buffer, data_ptr, data_size); - if (suc) dump_sqtt_trace(arg->label, info_data->sample_id, buffer, data_size); - else fatal("SQTT data memcopy to host failed"); - HsaRsrcFactory::FreeMemory(buffer); + if (is_sqtt_local) { + HsaRsrcFactory* hsa_rsrc = &HsaRsrcFactory::Instance(); + const AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(arg->agent); + const uint32_t mem_size = data_size; + void* buffer = hsa_rsrc->AllocateSysMemory(agent_info, mem_size); + if(!hsa_rsrc->Memcpy(agent_info, buffer, data_ptr, mem_size)) { + fatal("SQTT data memcopy to host failed"); + } + dump_sqtt_trace(arg->label, info_data->sample_id, buffer, data_size); + HsaRsrcFactory::FreeMemory(buffer); + } else { + dump_sqtt_trace(arg->label, info_data->sample_id, data_ptr, data_size); + } } else status = HSA_STATUS_ERROR; return status; @@ -719,6 +727,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) if (multiplier != 1) str = str.substr(0, str.length() - 1); settings->sqtt_size = strtoull(str.c_str(), NULL, 0) * multiplier; } + it = opts.find("sqtt-local"); + if (it != opts.end()) { settings->sqtt_local = (it->second == "on"); } } } // Enable verbose mode @@ -734,6 +744,10 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) check_env_var("ROCP_DATA_TIMEOUT", settings->timeout); // Set SQTT size check_env_var("ROCP_SQTT_SIZE", settings->sqtt_size); + // Set SQTT local buffer + check_env_var("ROCP_SQTT_LOCAL", settings->sqtt_local); + + is_sqtt_local = settings->sqtt_local; // Printing out info char* info_symb = getenv("ROCP_INFO"); @@ -837,12 +851,6 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) } if (name == "") fatal("ROCProfiler: Bad trace properties, name is not specified"); - printf(" %s (", name.c_str()); - features[index] = {}; - features[index].kind = 
ROCPROFILER_FEATURE_KIND_TRACE; - features[index].name = strdup(name.c_str()); - features[index].data.result_bytes.copy = to_copy_data; - std::map parameters_dict; parameters_dict["TARGET_CU"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET; @@ -857,6 +865,12 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) parameters_dict["SE_MASK"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK; + printf(" %s (", name.c_str()); + features[index] = {}; + features[index].kind = ROCPROFILER_FEATURE_KIND_TRACE; + features[index].name = strdup(name.c_str()); + features[index].data.result_bytes.copy = to_copy_data; + for (auto* params : params_list) { const unsigned parameter_count = params->opts.size(); rocprofiler_parameter_t* parameters = new rocprofiler_parameter_t[parameter_count]; diff --git a/test/input.xml b/test/input.xml index 53622e07e9..9ba6165082 100644 --- a/test/input.xml +++ b/test/input.xml @@ -14,7 +14,7 @@ > # SQTT trace with parameters - + coarse_region = region; + err = hsa_amd_memory_pool_get_info(pool, + HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + CHECK_STATUS("hsa_amd_memory_pool_get_info", err); + + uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT; + + if ((karg_st == 0 && kern_arg) || + (karg_st != 0 && !kern_arg)) { + return HSA_STATUS_SUCCESS; } - if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) { - agent_info->kernarg_region = region; - } + *(reinterpret_cast(data)) = pool; + return HSA_STATUS_INFO_BREAK; +} - return HSA_STATUS_SUCCESS; +// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that +// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that is NOT +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT +hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) { + return FindGlobalPool(pool, data, false); +} + +// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that +// finds a pool with the properties of 
HSA_AMD_SEGMENT_GLOBAL and that IS +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT +hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { + return FindGlobalPool(pool, data, true); } // Constructor of the class @@ -169,7 +202,15 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { agent_info->dev_id = agent; agent_info->dev_type = HSA_DEVICE_TYPE_CPU; agent_info->dev_index = cpu_list_.size(); + + status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); + CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(cpu pool)", status); + status = hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); + CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(kern arg pool)", status); + agent_info->gpu_pool = {}; + cpu_list_.push_back(agent_info); + cpu_agents_.push_back(agent); } if (type == HSA_DEVICE_TYPE_GPU) { @@ -189,16 +230,15 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &agent_info->se_num); hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), &agent_info->shader_arrays_per_se); - // Initialize memory regions to zero - agent_info->kernarg_region.handle = 0; - agent_info->coarse_region.handle = 0; - - // Find and Bind Memory regions of the Gpu agent - hsa_agent_iterate_regions(agent, FindMemRegionsCallback, agent_info); + agent_info->cpu_pool = {}; + agent_info->kern_arg_pool = {}; + status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); + CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status); // Set GPU index agent_info->dev_index = gpu_list_.size(); gpu_list_.push_back(agent_info); + gpu_agents_.push_back(agent); } if (agent_info) agent_map_[agent.handle] = agent_info; @@ -217,17 +257,25 @@ const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) { } 
// Get the count of Hsa Gpu Agents available on the platform +// // @return uint32_t Number of Gpu agents on platform +// uint32_t HsaRsrcFactory::GetCountOfGpuAgents() { return uint32_t(gpu_list_.size()); } // Get the count of Hsa Cpu Agents available on the platform +// // @return uint32_t Number of Cpu agents on platform +// uint32_t HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size()); } // Get the AgentInfo handle of a Gpu device +// // @param idx Gpu Agent at specified index +// // @param agent_info Output parameter updated with AgentInfo +// // @return bool true if successful, false otherwise +// bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) { // Determine if request is valid uint32_t size = uint32_t(gpu_list_.size()); @@ -242,9 +290,13 @@ bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) } // Get the AgentInfo handle of a Cpu device +// // @param idx Cpu Agent at specified index +// // @param agent_info Output parameter updated with AgentInfo +// // @return bool true if successful, false otherwise +// bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) { // Determine if request is valid uint32_t size = uint32_t(cpu_list_.size()); @@ -259,10 +311,15 @@ bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) // Create a Queue object and return its handle. The queue object is expected // to support user requested number of Aql dispatch packets. 
+// // @param agent_info Gpu Agent on which to create a queue object +// // @param num_Pkts Number of packets to be held by queue +// // @param queue Output parameter updated with handle of queue object +// // @return bool true if successful, false otherwise +// bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) { hsa_status_t status; @@ -282,66 +339,83 @@ bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) { } // Allocate memory for use by a kernel of specified size in specified -// agent's memory region. Currently supports Global segment whose Kernarg -// flag set. +// agent's memory region. // @param agent_info Agent from whose memory region to allocate // @param size Size of memory in terms of bytes // @return uint8_t* Pointer to buffer, null if allocation fails. uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) { - hsa_status_t status; + hsa_status_t status = HSA_STATUS_ERROR; uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - - if (agent_info->coarse_region.handle != 0) { - // Allocate in local memory if it is available - status = hsa_memory_allocate(agent_info->coarse_region, size, (void**)&buffer); - if (status == HSA_STATUS_SUCCESS) { - status = hsa_memory_assign_agent(buffer, agent_info->dev_id, HSA_ACCESS_PERMISSION_RW); - } - } else { - // Allocate in system memory if local memory is not available - status = hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&buffer); - } - - CHECK_STATUS("hsa_memory_allocate", status); - return (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + status = hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, (void**)&buffer); + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + return ptr; } -// Allocate host memory. 
-// @param agent_info Agent from whose memory region to allocate -// @param size Size of memory in terms of bytes -// @return uint8_t* Pointer to buffer, null if allocation fails. -uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) { - hsa_status_t status; - size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - - uint8_t* buffer = NULL; - status = hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&buffer); - CHECK_STATUS("hsa_memory_allocate", status); - return (status == HSA_STATUS_SUCCESS) ? buffer : NULL; -} - -// Allocate memory tp pass kernel parameters. +// Allocate memory to pass kernel parameters. +// Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter. // @param agent_info Agent from whose memory region to allocate // @param size Size of memory in terms of bytes // @return uint8_t* Pointer to buffer, null if allocation fails. uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size_t size) { - return AllocateSysMemory(agent_info, size); + hsa_status_t status = HSA_STATUS_ERROR; + uint8_t* buffer = NULL; + if (!cpu_agents_.empty()) { + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + status = hsa_amd_memory_pool_allocate(cpu_list_[0]->kern_arg_pool, size, 0, (void**)&buffer); + // Both the CPU and GPU can access the kernel arguments + if (status == HSA_STATUS_SUCCESS) { + hsa_agent_t ag_list[1] = {agent_info->dev_id}; + status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + } + } + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? 
buffer : NULL; + return ptr; } -// Memcopy method -bool HsaRsrcFactory::Memcpy(hsa_agent_t agent, void* dest_buff, const void* src_buff, uint32_t length) { - (void)agent; - const hsa_status_t status = hsa_memory_copy(dest_buff, src_buff, length); - CHECK_STATUS("hsa_memory_copy", status); +// Allocate system memory accessible by both CPU and GPU +// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. +uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + uint8_t* buffer = NULL; + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + if (!cpu_agents_.empty()) { + status = hsa_amd_memory_pool_allocate(cpu_list_[0]->cpu_pool, size, 0, (void**)&buffer); + // Both the CPU and GPU can access the memory + if (status == HSA_STATUS_SUCCESS) { + hsa_agent_t ag_list[1] = {agent_info->dev_id}; + status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + } + } + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? 
buffer : NULL; + return ptr; +} + +// Copy data from GPU to host memory +bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + if (!cpu_agents_.empty()) { + hsa_signal_t s = {}; + status = hsa_signal_create(1, 0, NULL, &s); + if (status == HSA_STATUS_SUCCESS) { + status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); + if (status == HSA_STATUS_SUCCESS) { + if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) { + status = HSA_STATUS_ERROR; + } + } + status = hsa_signal_destroy(s); + } + } return (status == HSA_STATUS_SUCCESS); } -bool HsaRsrcFactory::Memcpy(const AgentInfo* agent_info, void* dest_buff, const void* src_buff, uint32_t length) { - (void)agent_info; - return Memcpy(agent_info->dev_id, dest_buff, src_buff, length); +bool HsaRsrcFactory::Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size) { + return Memcpy(agent_info->dev_id, dst, src, size); } -// Free method +// Memory free method bool HsaRsrcFactory::FreeMemory(void* ptr) { const hsa_status_t status = hsa_memory_free(ptr); CHECK_STATUS("hsa_memory_free", status); @@ -419,7 +493,6 @@ bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl; std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl; std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl; - std::clog << ">> Kernarg Region Id : " << agent_info->coarse_region.handle << std::endl; std::clog << ">> CU number : " << agent_info->cu_num << std::endl; std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl; std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl; @@ -458,3 +531,4 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, void* packet) { HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL; 
HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; + diff --git a/test/util/hsa_rsrc_factory.h b/test/util/hsa_rsrc_factory.h index 021f02858b..4c30ee9f6e 100644 --- a/test/util/hsa_rsrc_factory.h +++ b/test/util/hsa_rsrc_factory.h @@ -22,10 +22,11 @@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING POSSIBILITY OF SUCH DAMAGE. ********************************************************************/ -#ifndef TEST_UTIL_HSA_RSRC_FACTORY_H_ -#define TEST_UTIL_HSA_RSRC_FACTORY_H_ +#ifndef _HSA_RSRC_FACTORY_H_ +#define _HSA_RSRC_FACTORY_H_ #include +#include #include #include #include @@ -52,8 +53,16 @@ POSSIBILITY OF SUCH DAMAGE. exit(1); \ } -static const unsigned MEM_PAGE_BYTES = 0x1000; -static const unsigned MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; +#define CHECK_ITER_STATUS(msg, status) \ + if (status != HSA_STATUS_INFO_BREAK) { \ + const char* emsg = 0; \ + hsa_status_string(status, &emsg); \ + printf("%s: %s\n", msg, emsg ? emsg : ""); \ + exit(1); \ + } + +static const size_t MEM_PAGE_BYTES = 0x1000; +static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t; // Encapsulates information about a Hsa Agent such as its @@ -86,11 +95,10 @@ struct AgentInfo { // Hsail profile supported by agent hsa_profile_t profile; - // Memory region supporting kernel parameters - hsa_region_t coarse_region; - - // Memory region supporting kernel arguments - hsa_region_t kernarg_region; + // CPU/GPU/kern-arg memory pools + hsa_amd_memory_pool_t cpu_pool; + hsa_amd_memory_pool_t gpu_pool; + hsa_amd_memory_pool_t kern_arg_pool; // The number of compute unit available in the agent. uint32_t cu_num; @@ -170,31 +178,31 @@ class HsaRsrcFactory { // @return bool true if successful, false otherwise bool CreateSignal(uint32_t value, hsa_signal_t* signal); - // Allocate memory for use by a kernel of specified size in specified - // agent's memory region. Currently supports Global segment whose Kernarg - // flag set. 
+ // Allocate local GPU memory // @param agent_info Agent from whose memory region to allocate // @param size Size of memory in terms of bytes // @return uint8_t* Pointer to buffer, null if allocation fails. uint8_t* AllocateLocalMemory(const AgentInfo* agent_info, size_t size); - // Allocate system memory. - // @param agent_info Agent from whose memory region to allocate - // @param size Size of memory in terms of bytes - // @return uint8_t* Pointer to buffer, null if allocation fails. - uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size); - - // Allocate memory tp pass kernel parameters. + // Allocate memory tp pass kernel parameters + // Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter. // @param agent_info Agent from whose memory region to allocate // @param size Size of memory in terms of bytes // @return uint8_t* Pointer to buffer, null if allocation fails. uint8_t* AllocateKernArgMemory(const AgentInfo* agent_info, size_t size); - // Memcopy method - static bool Memcpy(const AgentInfo* agent_info, void* dest_buff, const void* src_buff, uint32_t length); - static bool Memcpy(hsa_agent_t agent, void* dest_buff, const void* src_buff, uint32_t length); + // Allocate system memory accessible from both CPU and GPU + // Memory is alocated accessible to all CPU agents and AgentInfo parameter is ignored. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. 
+ uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size); - // Free method + // Copy data from GPU to host memory + bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size); + bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size); + + // Memory free method static bool FreeMemory(void* ptr); // Loads an Assembled Brig file and Finalizes it into Device Isa @@ -248,9 +256,11 @@ class HsaRsrcFactory { // Used to maintain a list of Hsa Gpu Agent Info std::vector gpu_list_; + std::vector gpu_agents_; // Used to maintain a list of Hsa Cpu Agent Info std::vector cpu_list_; + std::vector cpu_agents_; // System agents map std::map agent_map_; @@ -262,4 +272,5 @@ class HsaRsrcFactory { hsa_ven_amd_loader_1_00_pfn_t loader_api_; }; -#endif // TEST_UTIL_HSA_RSRC_FACTORY_H_ + +#endif // _HSA_RSRC_FACTORY_H_