/********************************************************************** Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: <95> Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. <95> Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ********************************************************************/ #include "util/hsa_rsrc_factory.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include // Demangle C++ symbol name static const char* cpp_demangle(const char* symname) { size_t size = 0; int status; const char* ret = abi::__cxa_demangle(symname, NULL, &size, &status); return (ret != 0) ? ret : strdup(symname); } // Callback function to get available in the system agents hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) { HsaRsrcFactory* hsa_rsrc = reinterpret_cast(data); // AddAgentInfo may return NULL for unsupported agent types (e.g., NPU). // We should continue iterating regardless. hsa_rsrc->AddAgentInfo(agent); return HSA_STATUS_SUCCESS; } // This function checks to see if the provided // pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true, // the function adds an additional requirement that the pool have the // HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT property. If kern_arg is false, // pools must NOT have this property. // Upon finding a pool that meets these conditions, HSA_STATUS_INFO_BREAK is // returned. HSA_STATUS_SUCCESS is returned if no errors were encountered, but // no pool was found meeting the requirements. If an error is encountered, we // return that error. static hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg) { hsa_status_t err; hsa_amd_segment_t segment; uint32_t flag; if (nullptr == data) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info( pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); CHECK_STATUS("hsa_amd_memory_pool_get_info", err); if (HSA_AMD_SEGMENT_GLOBAL != segment) { return HSA_STATUS_SUCCESS; } err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info( pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); CHECK_STATUS("hsa_amd_memory_pool_get_info", err); uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT; if ((karg_st == 0 && kern_arg) || (karg_st != 0 && !kern_arg)) { return HSA_STATUS_SUCCESS; } *(reinterpret_cast(data)) = pool; return HSA_STATUS_INFO_BREAK; } // This is the call-back function for hsa_amd_agent_iterate_memory_pools() that // finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that is NOT // HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) { return FindGlobalPool(pool, data, false); } // This is the call-back function for hsa_amd_agent_iterate_memory_pools() that // finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that IS // HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { return FindGlobalPool(pool, data, true); } // Constructor of the class HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) { hsa_status_t status; cpu_pool_ = NULL; kern_arg_pool_ = NULL; InitHsaApiTable(NULL); // Initialize the Hsa Runtime if (initialize_hsa_) { status = hsa_api_.hsa_init(); CHECK_STATUS("Error in hsa_init", status); } // Discover the set of Gpu devices available on the platform status = hsa_api_.hsa_iterate_agents(GetHsaAgentsCallback, this); CHECK_STATUS("Error Calling hsa_iterate_agents", status); if (cpu_pool_ == NULL) CHECK_STATUS("CPU memory pool is not found", HSA_STATUS_ERROR); if (kern_arg_pool_ == NULL) CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR); // Get AqlProfile API table aqlprofile_api_ = {0}; #ifdef ROCP_LD_AQLPROFILE status = LoadAqlProfileLib(&aqlprofile_api_); #else status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_); #endif CHECK_STATUS("aqlprofile API table load failed", status); // Get Loader API table loader_api_ = {0}; status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_); CHECK_STATUS("loader API table query failed", status); // Instantiate HSA timer timer_ = new HsaTimer(&hsa_api_); CHECK_STATUS("HSA timer allocation failed", (timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); // Time correlation const uint32_t corr_iters = 1000; for (unsigned time_id = 0; time_id < HsaTimer::TIME_ID_NUMBER; time_id += 1) { CorrelateTime((HsaTimer::time_id_t)time_id, corr_iters); } // System timeout timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_); } // Destructor of the class HsaRsrcFactory::~HsaRsrcFactory() { delete timer_; for (auto p : cpu_list_) delete p; for (auto p : gpu_list_) delete p; if (initialize_hsa_) { hsa_status_t status = hsa_api_.hsa_shut_down(); CHECK_STATUS("Error in hsa_shut_down", status); } } void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { std::lock_guard lck(mutex_); if (hsa_api_.hsa_init == NULL) { if (table != NULL) { hsa_api_.hsa_init = table->core_->hsa_init_fn; hsa_api_.hsa_shut_down = table->core_->hsa_shut_down_fn; hsa_api_.hsa_agent_get_info = table->core_->hsa_agent_get_info_fn; hsa_api_.hsa_iterate_agents = table->core_->hsa_iterate_agents_fn; hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn; hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn; hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn; hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn; hsa_api_.hsa_queue_add_write_index_scacq_screl = table->core_->hsa_queue_add_write_index_scacq_screl_fn; hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn; hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn; hsa_api_.hsa_signal_load_relaxed = table->core_->hsa_signal_load_relaxed_fn; hsa_api_.hsa_signal_store_relaxed = table->core_->hsa_signal_store_relaxed_fn; hsa_api_.hsa_signal_wait_scacquire = table->core_->hsa_signal_wait_scacquire_fn; hsa_api_.hsa_signal_store_screlease = table->core_->hsa_signal_store_screlease_fn; hsa_api_.hsa_code_object_reader_create_from_file = table->core_->hsa_code_object_reader_create_from_file_fn; hsa_api_.hsa_executable_create_alt = table->core_->hsa_executable_create_alt_fn; hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn; hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn; hsa_api_.hsa_executable_destroy = table->core_->hsa_executable_destroy_fn; hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn; hsa_api_.hsa_executable_symbol_get_info = table->core_->hsa_executable_symbol_get_info_fn; hsa_api_.hsa_executable_iterate_symbols = table->core_->hsa_executable_iterate_symbols_fn; hsa_api_.hsa_system_get_info = table->core_->hsa_system_get_info_fn; hsa_api_.hsa_system_get_major_extension_table = table->core_->hsa_system_get_major_extension_table_fn; hsa_api_.hsa_amd_agent_iterate_memory_pools = table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn; hsa_api_.hsa_amd_memory_pool_get_info = table->amd_ext_->hsa_amd_memory_pool_get_info_fn; hsa_api_.hsa_amd_memory_pool_allocate = table->amd_ext_->hsa_amd_memory_pool_allocate_fn; hsa_api_.hsa_amd_agents_allow_access = table->amd_ext_->hsa_amd_agents_allow_access_fn; hsa_api_.hsa_amd_memory_async_copy = table->amd_ext_->hsa_amd_memory_async_copy_fn; hsa_api_.hsa_amd_signal_async_handler = table->amd_ext_->hsa_amd_signal_async_handler_fn; hsa_api_.hsa_amd_profiling_set_profiler_enabled = table->amd_ext_->hsa_amd_profiling_set_profiler_enabled_fn; hsa_api_.hsa_amd_profiling_get_async_copy_time = table->amd_ext_->hsa_amd_profiling_get_async_copy_time_fn; hsa_api_.hsa_amd_profiling_get_dispatch_time = table->amd_ext_->hsa_amd_profiling_get_dispatch_time_fn; } else { hsa_api_.hsa_init = hsa_init; hsa_api_.hsa_shut_down = hsa_shut_down; hsa_api_.hsa_agent_get_info = hsa_agent_get_info; hsa_api_.hsa_iterate_agents = hsa_iterate_agents; hsa_api_.hsa_queue_create = hsa_queue_create; hsa_api_.hsa_queue_destroy = hsa_queue_destroy; hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed; hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed; hsa_api_.hsa_queue_add_write_index_scacq_screl = hsa_queue_add_write_index_scacq_screl; hsa_api_.hsa_signal_create = hsa_signal_create; hsa_api_.hsa_signal_destroy = hsa_signal_destroy; hsa_api_.hsa_signal_load_relaxed = hsa_signal_load_relaxed; hsa_api_.hsa_signal_store_relaxed = hsa_signal_store_relaxed; hsa_api_.hsa_signal_wait_scacquire = hsa_signal_wait_scacquire; hsa_api_.hsa_signal_store_screlease = hsa_signal_store_screlease; hsa_api_.hsa_code_object_reader_create_from_file = hsa_code_object_reader_create_from_file; hsa_api_.hsa_executable_create_alt = hsa_executable_create_alt; hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object; hsa_api_.hsa_executable_freeze = hsa_executable_freeze; hsa_api_.hsa_executable_destroy = hsa_executable_destroy; hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol; hsa_api_.hsa_executable_symbol_get_info = hsa_executable_symbol_get_info; hsa_api_.hsa_executable_iterate_symbols = hsa_executable_iterate_symbols; hsa_api_.hsa_system_get_info = hsa_system_get_info; hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table; hsa_api_.hsa_amd_agent_iterate_memory_pools = hsa_amd_agent_iterate_memory_pools; hsa_api_.hsa_amd_memory_pool_get_info = hsa_amd_memory_pool_get_info; hsa_api_.hsa_amd_memory_pool_allocate = hsa_amd_memory_pool_allocate; hsa_api_.hsa_amd_agents_allow_access = hsa_amd_agents_allow_access; hsa_api_.hsa_amd_memory_async_copy = hsa_amd_memory_async_copy; hsa_api_.hsa_amd_signal_async_handler = hsa_amd_signal_async_handler; hsa_api_.hsa_amd_profiling_set_profiler_enabled = hsa_amd_profiling_set_profiler_enabled; hsa_api_.hsa_amd_profiling_get_async_copy_time = hsa_amd_profiling_get_async_copy_time; hsa_api_.hsa_amd_profiling_get_dispatch_time = hsa_amd_profiling_get_dispatch_time; } } } hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { void* handle = dlopen(kAqlProfileLib, RTLD_NOW); if (handle == NULL) { fprintf(stderr, "Loading '%s' failed, %s\n", kAqlProfileLib, dlerror()); return HSA_STATUS_ERROR; } dlerror(); /* Clear any existing error */ api->hsa_ven_amd_aqlprofile_error_string = (decltype(::hsa_ven_amd_aqlprofile_error_string)*)dlsym( handle, "hsa_ven_amd_aqlprofile_error_string"); api->hsa_ven_amd_aqlprofile_validate_event = (decltype(::hsa_ven_amd_aqlprofile_validate_event)*)dlsym( handle, "hsa_ven_amd_aqlprofile_validate_event"); api->hsa_ven_amd_aqlprofile_start = (decltype(::hsa_ven_amd_aqlprofile_start)*)dlsym(handle, "hsa_ven_amd_aqlprofile_start"); api->hsa_ven_amd_aqlprofile_stop = (decltype(::hsa_ven_amd_aqlprofile_stop)*)dlsym(handle, "hsa_ven_amd_aqlprofile_stop"); api->hsa_ven_amd_aqlprofile_read = (decltype(::hsa_ven_amd_aqlprofile_read)*)dlsym(handle, "hsa_ven_amd_aqlprofile_read"); api->hsa_ven_amd_aqlprofile_legacy_get_pm4 = (decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*)dlsym( handle, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); api->hsa_ven_amd_aqlprofile_get_info = (decltype(::hsa_ven_amd_aqlprofile_get_info)*)dlsym( handle, "hsa_ven_amd_aqlprofile_get_info"); api->hsa_ven_amd_aqlprofile_iterate_data = (decltype(::hsa_ven_amd_aqlprofile_iterate_data)*)dlsym( handle, "hsa_ven_amd_aqlprofile_iterate_data"); return HSA_STATUS_SUCCESS; } // Add system agent info const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { // Determine if device is a Gpu agent hsa_status_t status; AgentInfo* agent_info = NULL; hsa_device_type_t type; status = hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); CHECK_STATUS("Error Calling hsa_agent_get_info", status); if (type == HSA_DEVICE_TYPE_CPU) { agent_info = new AgentInfo{}; agent_info->dev_id = agent; agent_info->dev_type = HSA_DEVICE_TYPE_CPU; agent_info->dev_index = cpu_list_.size(); status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == NULL)) cpu_pool_ = &agent_info->cpu_pool; status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL)) kern_arg_pool_ = &agent_info->kern_arg_pool; agent_info->gpu_pool = {}; cpu_list_.push_back(agent_info); cpu_agents_.push_back(agent); } if (type == HSA_DEVICE_TYPE_GPU) { agent_info = new AgentInfo{}; agent_info->dev_id = agent; agent_info->dev_type = HSA_DEVICE_TYPE_GPU; hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); const int gfxip_label_len = std::min(strlen(agent_info->name) - 2, sizeof(agent_info->gfxip) - 1); memcpy(agent_info->gfxip, agent_info->name, gfxip_label_len); agent_info->gfxip[gfxip_label_len] = '\0'; hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? true : false; hsa_api_.hsa_agent_get_info( agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), &agent_info->cu_num); hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), &agent_info->waves_per_cu); hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), &agent_info->simds_per_cu); hsa_api_.hsa_agent_get_info( agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &agent_info->se_num); hsa_api_.hsa_agent_get_info( agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), &agent_info->shader_arrays_per_se); agent_info->cpu_pool = {}; agent_info->kern_arg_pool = {}; status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status); // Set GPU index uint32_t driver_node_id; status = hsa_api_.hsa_agent_get_info( agent, static_cast(HSA_AMD_AGENT_INFO_DRIVER_NODE_ID), &driver_node_id); CHECK_STATUS("hsa_agent_get_info(gpu hsa_driver_node_id)", status); agent_info->dev_index = driver_node_id; gpu_list_.push_back(agent_info); gpu_agents_.push_back(agent); } if (agent_info) agent_map_[agent.handle] = agent_info; return agent_info; } // Return systen agent info const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) { const AgentInfo* agent_info = NULL; auto it = agent_map_.find(agent.handle); if (it != agent_map_.end()) { agent_info = it->second; } return agent_info; } // Get the count of Hsa Gpu Agents available on the platform // // @return uint32_t Number of Gpu agents on platform // uint32_t HsaRsrcFactory::GetCountOfGpuAgents() { return uint32_t(gpu_list_.size()); } // Get the count of Hsa Cpu Agents available on the platform // // @return uint32_t Number of Cpu agents on platform // uint32_t HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size()); } // Get the AgentInfo handle of a Gpu device // // @param idx Gpu Agent at specified index // // @param agent_info Output parameter updated with AgentInfo // // @return bool true if successful, false otherwise // bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) { // Determine if request is valid uint32_t size = uint32_t(gpu_list_.size()); if (idx >= size) { return false; } // Copy AgentInfo from specified index *agent_info = gpu_list_[idx]; return true; } // Get the AgentInfo handle of a Cpu device // // @param idx Cpu Agent at specified index // // @param agent_info Output parameter updated with AgentInfo // // @return bool true if successful, false otherwise // bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) { // Determine if request is valid uint32_t size = uint32_t(cpu_list_.size()); if (idx >= size) { return false; } // Copy AgentInfo from specified index *agent_info = cpu_list_[idx]; return true; } // Create a Queue object and return its handle. The queue object is expected // to support user requested number of Aql dispatch packets. // // @param agent_info Gpu Agent on which to create a queue object // // @param num_Pkts Number of packets to be held by queue // // @param queue Output parameter updated with handle of queue object // // @return bool true if successful, false otherwise // bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) { hsa_status_t status; status = hsa_api_.hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, queue); return (status == HSA_STATUS_SUCCESS); } // Create a Signal object and return its handle. // @param value Initial value of signal object // @param signal Output parameter updated with handle of signal object // @return bool true if successful, false otherwise bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) { hsa_status_t status; status = hsa_api_.hsa_signal_create(value, 0, NULL, signal); return (status == HSA_STATUS_SUCCESS); } // Allocate memory for use by a kernel of specified size in specified // agent's memory region. // @param agent_info Agent from whose memory region to allocate // @param size Size of memory in terms of bytes // @return uint8_t* Pointer to buffer, null if allocation fails. uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) { hsa_status_t status = HSA_STATUS_ERROR; uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; status = hsa_api_.hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG, reinterpret_cast(&buffer)); uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; return ptr; } // Allocate memory to pass kernel parameters. // Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter. // @param agent_info Agent from whose memory region to allocate // @param size Size of memory in terms of bytes // @return uint8_t* Pointer to buffer, null if allocation fails. uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size_t size) { hsa_status_t status = HSA_STATUS_ERROR; uint8_t* buffer = NULL; if (!cpu_agents_.empty()) { size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; status = hsa_api_.hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the kernel arguments if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); } } uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; return ptr; } // Allocate system memory accessible by both CPU and GPU // @param agent_info Agent from whose memory region to allocate // @param size Size of memory in terms of bytes // @return uint8_t* Pointer to buffer, null if allocation fails. uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) { hsa_status_t status = HSA_STATUS_ERROR; uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; if (!cpu_agents_.empty()) { status = hsa_api_.hsa_amd_memory_pool_allocate(*cpu_pool_, size, HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); } } uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; return ptr; } // Allocate memory for command buffer. // @param agent_info Agent from whose memory region to allocate // @param size Size of memory in terms of bytes // @return uint8_t* Pointer to buffer, null if allocation fails. uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t size) { size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; uint8_t* ptr = (agent_info->is_apu && CMD_MEMORY_MMAP) ? reinterpret_cast( mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, 0, 0)) : AllocateSysMemory(agent_info, size); return ptr; } // Wait signal hsa_signal_value_t HsaRsrcFactory::SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { const hsa_signal_value_t exp_value = signal_value - 1; hsa_signal_value_t ret_value = signal_value; while (1) { ret_value = hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, signal_value, timeout_, HSA_WAIT_STATE_BLOCKED); if (ret_value == exp_value) break; if (ret_value != signal_value) { std::cerr << "Error: HsaRsrcFactory::SignalWait: signal_value(" << signal_value << "), ret_value(" << ret_value << ")" << std::endl << std::flush; abort(); } } return ret_value; } // Wait signal with signal value restore void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { SignalWait(signal, signal_value); hsa_api_.hsa_signal_store_relaxed(const_cast(signal), signal_value); } // Copy data from GPU to host memory bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size) { hsa_status_t status = HSA_STATUS_ERROR; if (!cpu_agents_.empty()) { hsa_signal_t s = {}; status = hsa_api_.hsa_signal_create(1, 0, NULL, &s); CHECK_STATUS("hsa_signal_create()", status); status = hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); CHECK_STATUS("hsa_amd_memory_async_copy()", status); SignalWait(s, 1); status = hsa_api_.hsa_signal_destroy(s); CHECK_STATUS("hsa_signal_destroy()", status); } return (status == HSA_STATUS_SUCCESS); } bool HsaRsrcFactory::Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size) { return Memcpy(agent_info->dev_id, dst, src, size); } // Memory free method bool HsaRsrcFactory::FreeMemory(void* ptr) { const hsa_status_t status = hsa_memory_free(ptr); CHECK_STATUS("hsa_memory_free", status); return (status == HSA_STATUS_SUCCESS); } // Loads an Assembled Brig file and Finalizes it into Device Isa // @param agent_info Gpu device for which to finalize // @param brig_path File path of the Assembled Brig file // @param kernel_name Name of the kernel to finalize // @param code_desc Handle of finalized Code Descriptor that could // be used to submit for execution // @return bool true if successful, false otherwise bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name, hsa_executable_t* executable, hsa_executable_symbol_t* code_desc) { hsa_status_t status = HSA_STATUS_ERROR; // Build the code object filename std::string filename(brig_path); std::clog << "Code object filename: " << filename << std::endl; // Open the file containing code object hsa_file_t file_handle = open(filename.c_str(), O_RDONLY); if (file_handle == -1) { std::cerr << "Error: failed to load '" << filename << "'" << std::endl; assert(false); return false; } // Create code object reader hsa_code_object_reader_t code_obj_rdr = {0}; status = hsa_api_.hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); if (status != HSA_STATUS_SUCCESS) { std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl; return false; } // Create executable. status = hsa_api_.hsa_executable_create_alt( HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable); CHECK_STATUS("Error in creating executable object", status); // Load code object. status = hsa_api_.hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, code_obj_rdr, NULL, NULL); CHECK_STATUS("Error in loading executable object", status); // Freeze executable. status = hsa_api_.hsa_executable_freeze(*executable, ""); CHECK_STATUS("Error in freezing executable object", status); // Get symbol handle. hsa_executable_symbol_t kernelSymbol; status = hsa_api_.hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0, &kernelSymbol); CHECK_STATUS("Error in looking up kernel symbol", status); close(file_handle); // Update output parameter *code_desc = kernelSymbol; return true; } // Print the various fields of Hsa Gpu Agents bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { std::cout << std::flush; std::clog << header << " :" << std::endl; const AgentInfo* agent_info; int size = uint32_t(gpu_list_.size()); for (int idx = 0; idx < size; idx++) { agent_info = gpu_list_[idx]; std::clog << "> agent[" << idx << "] :" << std::endl; std::clog << ">> Name : " << agent_info->name << std::endl; std::clog << ">> APU : " << agent_info->is_apu << std::endl; std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl; std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl; std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl; std::clog << ">> CU number : " << agent_info->cu_num << std::endl; std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl; std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl; std::clog << ">> SE number : " << agent_info->se_num << std::endl; std::clog << ">> Shader Arrays per SE : " << agent_info->shader_arrays_per_se << std::endl; } return true; } void* HsaRsrcFactory::GetReadPointer(hsa_queue_t* queue) { const uint32_t slot_size_b = CMD_SLOT_SIZE_B; const uint64_t read_idx = hsa_api_.hsa_queue_load_read_index_relaxed(queue); const uint32_t slot_idx = (uint32_t)(read_idx % queue->size); void* queue_slot = reinterpret_cast((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b)); return queue_slot; } uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { const uint32_t slot_size_b = CMD_SLOT_SIZE_B; // adevance command queue const uint64_t write_idx = hsa_api_.hsa_queue_add_write_index_scacq_screl(queue, 1); while ((write_idx - hsa_api_.hsa_queue_load_read_index_relaxed(queue)) >= queue->size) { sched_yield(); } const uint32_t slot_idx = (uint32_t)(write_idx % queue->size); uint32_t* queue_slot = reinterpret_cast((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b)); const uint32_t* slot_data = reinterpret_cast(packet); // Copy buffered commands into the queue slot. // Overwrite the AQL invalid header (first dword) last. // This prevents the slot from being read until it's fully written. memcpy(&queue_slot[1], &slot_data[1], slot_size_b - sizeof(uint32_t)); std::atomic* header_atomic_ptr = reinterpret_cast*>(&queue_slot[0]); header_atomic_ptr->store(slot_data[0], std::memory_order_release); // ringdoor bell hsa_api_.hsa_signal_store_relaxed(queue->doorbell_signal, write_idx); return write_idx; } uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes) { const uint32_t slot_size_b = CMD_SLOT_SIZE_B; if ((size_bytes & (slot_size_b - 1)) != 0) { fprintf(stderr, "HsaRsrcFactory::Submit: Bad packet size %zx\n", size_bytes); abort(); } const char* begin = reinterpret_cast(packet); const char* end = begin + size_bytes; uint64_t write_idx = 0; for (const char* ptr = begin; ptr < end; ptr += slot_size_b) { write_idx = Submit(queue, ptr); } return write_idx; } hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void* arg) { hsa_symbol_kind_t value = (hsa_symbol_kind_t)0; hsa_status_t status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &value); CHECK_STATUS("Error in getting symbol info", status); if (value == HSA_SYMBOL_KIND_KERNEL) { uint64_t addr = 0; status = hsa_api_.hsa_executable_symbol_get_info( symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &addr); CHECK_STATUS("Error in getting kernel object", status); const int to_free = reinterpret_cast(arg); const char* name = NULL; if (to_free == 0) { uint32_t len = 0; status = hsa_api_.hsa_executable_symbol_get_info( symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len); CHECK_STATUS("Error in getting name len", status); char sym_name[len + 1]; status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, sym_name); CHECK_STATUS("Error in getting kernel name", status); sym_name[len] = 0; name = cpp_demangle(sym_name); } SetKernelNameRef(addr, name, to_free); } return HSA_STATUS_SUCCESS; } hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable, const char* options) { std::lock_guard lck(mutex_); if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t; hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, (void*)0); CHECK_STATUS("Error in iterating executable symbols", status); return hsa_api_.hsa_executable_freeze(executable, options); } hsa_status_t HsaRsrcFactory::hsa_executable_destroy_interceptor(hsa_executable_t executable) { std::lock_guard lck(mutex_); if (symbols_map_ != NULL) { hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, (void*)1); CHECK_STATUS("Error in iterating executable symbols", status); } return hsa_api_.hsa_executable_destroy(executable); } void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) { std::lock_guard lck(mutex_); executable_tracking_on_ = true; table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; table->core_->hsa_executable_destroy_fn = hsa_executable_destroy_interceptor; } std::atomic HsaRsrcFactory::instance_{}; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; hsa_pfn_t HsaRsrcFactory::hsa_api_{}; bool HsaRsrcFactory::executable_tracking_on_ = false; HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = NULL;