diff --git a/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt b/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt index 921796b05f..d9a98c972d 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt +++ b/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt @@ -120,7 +120,6 @@ set ( SRCS "core/util/lnx/os_linux.cpp" "core/runtime/amd_blit_kernel.cpp" "core/runtime/amd_blit_sdma.cpp" "core/runtime/amd_cpu_agent.cpp" - "core/runtime/amd_debugger.cpp" "core/runtime/amd_gpu_agent.cpp" "core/runtime/amd_aql_queue.cpp" "core/runtime/amd_loader_context.cpp" diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/agent.h index 5043ecd28e..8a1b405097 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/agent.h @@ -218,12 +218,6 @@ class Agent : public Checked<0xF6BC25EB17E6F917> { uint32_t group_segment_size, Queue** queue) = 0; - virtual hsa_status_t HostQueueCreate(hsa_region_t region, uint32_t ring_size, - hsa_queue_type32_t type, uint32_t features, - hsa_signal_t doorbell_signal, Queue** queue) = 0; - - virtual hsa_status_t QueueDestroy(Queue* queue) = 0; - // @brief Query the value of an attribute. // // @param [in] attribute Attribute to query. diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h index 2bc2b3e50d..1932eaa90e 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h @@ -44,13 +44,12 @@ #define HSA_RUNTIME_CORE_INC_AMD_HW_AQL_COMMAND_PROCESSOR_H_ #include "core/inc/runtime.h" -#include "core/inc/amd_gpu_agent.h" #include "core/inc/signal.h" #include "core/inc/queue.h" +#include "core/inc/amd_gpu_agent.h" #include "core/util/locks.h" namespace amd { - /// @brief Encapsulates HW Aql Command Processor functionality. It /// provide the interface for things such as Doorbell register, read, /// write pointers and a buffer. @@ -185,8 +184,6 @@ class AqlQueue : public core::Queue, public core::Signal { // @brief Submits a block of PM4 and waits until it has been executed. void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override; - WaveStates GetWaveStates(); - /// @brief This operation is illegal hsa_signal_value_t LoadRelaxed() override { assert(false); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_cpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_cpu_agent.h index ee9e6ad6cf..af5de53d24 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_cpu_agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_cpu_agent.h @@ -45,8 +45,6 @@ #ifndef HSA_RUNTIME_CORE_INC_AMD_CPU_AGENT_H_ #define HSA_RUNTIME_CORE_INC_AMD_CPU_AGENT_H_ -#include -#include #include #include "hsakmt.h" @@ -105,12 +103,6 @@ class CpuAgent : public core::Agent { uint32_t group_segment_size, core::Queue** queue) override; - hsa_status_t HostQueueCreate(hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type, - uint32_t features, hsa_signal_t doorbell_signal, - core::Queue** queue) override; - - hsa_status_t QueueDestroy(core::Queue* queue) override; - // @brief Returns number of data caches. __forceinline size_t num_cache() const { return cache_props_.size(); } @@ -162,8 +154,6 @@ class CpuAgent : public core::Agent { // @brief Array of regions owned by this agent. std::vector regions_; - std::list> queues_; - DISALLOW_COPY_AND_ASSIGN(CpuAgent); }; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_debugger.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_debugger.h deleted file mode 100644 index d8b318badd..0000000000 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_debugger.h +++ /dev/null @@ -1,57 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// -// The University of Illinois/NCSA -// Open Source License (NCSA) -// -// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// -// Developed by: -// -// AMD Research and AMD HSA Software Development -// -// Advanced Micro Devices, Inc. -// -// www.amd.com -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to -// deal with the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in -// the documentation and/or other materials provided with the distribution. -// - Neither the names of Advanced Micro Devices, Inc, -// nor the names of its contributors may be used to endorse or promote -// products derived from this Software without specific prior written -// permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -// DEALINGS WITH THE SOFTWARE. -// -//////////////////////////////////////////////////////////////////////////////// - -#ifndef HSA_RUNTIME_CORE_INC_AMD_DEBUGGER_H_ -#define HSA_RUNTIME_CORE_INC_AMD_DEBUGGER_H_ - -#include "core/inc/amd_gpu_agent.h" -#include "hsakmt.h" - -namespace amd { - -class Debugger { - public: - static void HandleFault(const HsaMemoryAccessFault& fault, GpuAgentInt* agent); -}; -} - -#endif diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index 876924b367..6e9ef909a3 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -45,8 +45,6 @@ #ifndef HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_ #define HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_ -#include -#include #include #include "hsakmt.h" @@ -61,42 +59,6 @@ namespace amd { class MemoryRegion; -class AqlQueue; - -struct WaveState { - // Number of SGPRs allocated per wavefront. - uint32_t num_sgprs; - - // Array of packed SGPR data. - uint32_t* sgprs; - - // Number of VGPRs allocated per wavefront. - uint32_t num_vgprs; - - // Number of lanes in each VGPR. - uint32_t num_vgpr_lanes; - - // Array of packed VGPR data. - // VGPR value = vgprs[(vgpr_idx * num_vgpr_lanes) + lane_idx] - uint32_t* vgprs; - - // Data for miscellaneous registers. - struct { - uint64_t pc; - uint64_t exec; - uint32_t status; - uint32_t trapsts; - uint32_t m0; - } regs; - - // LDS allocation size for the work group, in 32-bit words. - uint32_t lds_size_dw; - - // Packed LDS data for the work group. - uint32_t* lds; -}; - -typedef std::vector WaveStates; // @brief Contains scratch memory information. struct ScratchInfo { @@ -113,14 +75,6 @@ class GpuAgentInt : public core::Agent { GpuAgentInt(uint32_t node_id) : core::Agent(node_id, core::Agent::DeviceType::kAmdGpuDevice) {} - // @brief GpuAgent does not support HostQueueCreation. - hsa_status_t HostQueueCreate(hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type, - uint32_t features, hsa_signal_t doorbell_signal, - core::Queue** queue) override { - assert(false && "GpuAgent::HostQueueCreate not implemented"); - return HSA_STATUS_ERROR_INVALID_AGENT; - } - // @brief Initialize DMA queue. // // @retval HSA_STATUS_SUCCESS DMA queue initialization is successful. @@ -186,8 +140,6 @@ class GpuAgentInt : public core::Agent { // @brief Invalidate caches on the agent which may hold code object data. virtual void InvalidateCodeCaches() = 0; - virtual WaveStates GetWaveStates() = 0; - // @brief Sets the coherency type of this agent. // // @param [in] type New coherency type. @@ -300,8 +252,6 @@ class GpuAgent : public GpuAgentInt { uint32_t group_segment_size, core::Queue** queue) override; - hsa_status_t QueueDestroy(core::Queue* queue) override; - // @brief Override from amd::GpuAgentInt. void AcquireQueueScratch(ScratchInfo& scratch) override; @@ -318,8 +268,6 @@ class GpuAgent : public GpuAgentInt { // @brief Override from amd::GpuAgentInt. void InvalidateCodeCaches() override; - WaveStates GetWaveStates() override; - // @brief Override from amd::GpuAgentInt. bool current_coherency_type(hsa_amd_coherency_type_t type) override; @@ -430,10 +378,14 @@ class GpuAgent : public GpuAgentInt { core::Blit* blits_[BlitCount]; - std::list> queues_; + // @brief AQL queues for cache management and blit compute usage. + enum QueueEnum { + QueueUtility, // Cache management and device to {host,device} blit compute + QueueBlitOnly, // Host to device blit + QueueCount + }; - core::Queue* queue_util_; - core::Queue* queue_blit_; + core::Queue* queues_[QueueCount]; // @brief Mutex to protect the update to coherency type. KernelMutex coherency_lock_; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/host_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/host_queue.h index 0d1dd8c994..90ef35120a 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/host_queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/host_queue.h @@ -51,7 +51,7 @@ namespace core { class HostQueue : public Queue { public: - HostQueue(Agent& agent, hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type, + HostQueue(hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type, uint32_t features, hsa_signal_t doorbell_signal); ~HostQueue(); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h index 8be14e7a71..5ec43bd7b9 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h @@ -45,7 +45,6 @@ #ifndef HSA_RUNTME_CORE_INC_COMMAND_QUEUE_H_ #define HSA_RUNTME_CORE_INC_COMMAND_QUEUE_H_ #include -#include #include "core/common/shared.h" @@ -56,8 +55,6 @@ #include "inc/amd_hsa_queue.h" namespace core { -class Agent; - struct AqlPacket { union { @@ -140,7 +137,7 @@ All funtions other than Convert and public_handle must be virtual. class Queue : public Checked<0xFA3906A679F9DB49>, public Shared { public: - explicit Queue(Agent& agent) : Shared(), amd_queue_(shared_object()->amd_queue), agent_(agent) { + Queue() : Shared(), amd_queue_(shared_object()->amd_queue) { if (!Shared::IsSharedObjectAllocationValid()) { return; } @@ -311,8 +308,6 @@ class Queue : public Checked<0xFA3906A679F9DB49>, hsa_queue_t* public_handle() const { return public_handle_; } - Agent& agent() { return agent_; } - protected: static void set_public_handle(Queue* ptr, hsa_queue_t* handle) { ptr->do_set_public_handle(handle); @@ -322,8 +317,6 @@ class Queue : public Checked<0xFA3906A679F9DB49>, } hsa_queue_t* public_handle_; - Agent& agent_; - private: DISALLOW_COPY_AND_ASSIGN(Queue); }; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index 8cd9abc8b3..9b316fd530 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -84,9 +84,10 @@ void* AqlQueue::operator new(size_t size) { void AqlQueue::operator delete(void* ptr) { _aligned_free(ptr); } -AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, ScratchInfo& scratch, - core::HsaEventCallback callback, void* err_data, bool is_kv) - : Queue(*agent), +AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, + ScratchInfo& scratch, core::HsaEventCallback callback, + void* err_data, bool is_kv) + : Queue(), Signal(0), ring_buf_(NULL), ring_buf_alloc_bytes_(0), @@ -961,113 +962,4 @@ void AqlQueue::InitScratchSRD() { return; } - -WaveStates AqlQueue::GetWaveStates() { - WaveStates wave_states; - - // Retrieve the control stack and context save area for the queue. - HsaQueueInfo queue_info; - HSAKMT_STATUS status = hsaKmtGetQueueInfo(queue_id_, &queue_info); - - if (status != HSAKMT_STATUS_SUCCESS) { - return wave_states; - } - - // The control stack is processed from start to end. - // The save area is processed from end to start. - uint32_t* ctl_stack = reinterpret_cast(queue_info.ControlStackTop); - uint32_t* wave_area = reinterpret_cast(uintptr_t(queue_info.UserContextSaveArea) + - queue_info.SaveAreaSizeInBytes); - uint32_t ctl_stack_ndw = uint32_t(queue_info.ControlStackUsedInBytes / sizeof(uint32_t)); - - // Control stack persists resource allocation until changed by a command. - uint32_t n_vgprs = 0; - uint32_t n_sgprs = 0; - uint32_t lds_size_dw = 0; - - // LDS is saved per-workgroup but the stack is parsed per-wavefront. - // Track the LDS save area for the current workgroup. - uint32_t* lds = nullptr; - - // Parse each write to COMPUTE_RELAUNCH in sequence. - // First two dwords are SET_SH_REG leader. - for (uint32_t idx = 2; idx < ctl_stack_ndw; ++idx) { - uint32_t relaunch = ctl_stack[idx]; - -#define COMPUTE_RELAUNCH_PAYLOAD_VGPRS(x) (((x) >> 0x0) & 0x3F) -#define COMPUTE_RELAUNCH_PAYLOAD_SGPRS(x) (((x) >> 0x6) & 0x7) -#define COMPUTE_RELAUNCH_PAYLOAD_LDS_SIZE(x) (((x) >> 0x9) & 0x1FF) -#define COMPUTE_RELAUNCH_PAYLOAD_FIRST_WAVE(x) (((x) >> 0x11) & 0x1) -#define COMPUTE_RELAUNCH_IS_EVENT(x) (((x) >> 0x1E) & 0x1) -#define COMPUTE_RELAUNCH_IS_STATE(x) (((x) >> 0x1F) & 0x1) - - bool is_event = COMPUTE_RELAUNCH_IS_EVENT(relaunch); - bool is_state = COMPUTE_RELAUNCH_IS_STATE(relaunch); - - if (is_state && !is_event) { - // Resource allocation state change, update tracked state. - n_vgprs = (0x1 + COMPUTE_RELAUNCH_PAYLOAD_VGPRS(relaunch)) * 0x4; - n_sgprs = ((0x1 + COMPUTE_RELAUNCH_PAYLOAD_SGPRS(relaunch)) - 0x1 /* no trap SGPRs */) * 0x10; - lds_size_dw = COMPUTE_RELAUNCH_PAYLOAD_LDS_SIZE(relaunch) * 0x80; - } else if (!is_state && !is_event) { - // Reference to one wavefront in the save area. - bool first_wave_in_group = COMPUTE_RELAUNCH_PAYLOAD_FIRST_WAVE(relaunch); - - // Save area layout is fixed by context save trap handler and SPI. - uint32_t vgprs_offset = 0x0; - uint32_t sgprs_offset = vgprs_offset + n_vgprs * 0x40; - uint32_t hwregs_offset = sgprs_offset + n_sgprs; - uint32_t lds_offset = hwregs_offset + 0x20; - uint32_t unused_offset = lds_offset + (first_wave_in_group ? lds_size_dw : 0x0); - uint32_t wave_area_size = unused_offset + 0x10; // trap SGPRs were allocated but not saved - uint32_t hwreg_m0_offset = hwregs_offset + 0x0; - uint32_t hwreg_pc_lo_offset = hwregs_offset + 0x1; - uint32_t hwreg_pc_hi_offset = hwregs_offset + 0x2; - uint32_t hwreg_exec_lo_offset = hwregs_offset + 0x3; - uint32_t hwreg_exec_hi_offset = hwregs_offset + 0x4; - uint32_t hwreg_status_offset = hwregs_offset + 0x5; - uint32_t hwreg_trapsts_offset = hwregs_offset + 0x6; - - // Find beginning of wavefront state in the save area. - wave_area -= wave_area_size; - - if (first_wave_in_group) { - // Track the LDS save area for this workgroup. - if (lds_size_dw > 0) { - lds = wave_area + lds_offset; - } else { - lds = nullptr; - } - } - - WaveState wave_state; - - wave_state.num_sgprs = n_sgprs; - wave_state.sgprs = wave_area + sgprs_offset; - wave_state.num_vgprs = n_vgprs; - wave_state.num_vgpr_lanes = 0x40; - wave_state.vgprs = wave_area + vgprs_offset; - wave_state.regs.pc = (uint64_t(wave_area[hwreg_pc_lo_offset]) | - (uint64_t(wave_area[hwreg_pc_hi_offset]) << 0x20)); - wave_state.regs.exec = uint64_t(wave_area[hwreg_exec_lo_offset]) | - (uint64_t(wave_area[hwreg_exec_hi_offset]) << 0x20); - wave_state.regs.status = wave_area[hwreg_status_offset]; - wave_state.regs.trapsts = wave_area[hwreg_trapsts_offset]; - wave_state.regs.m0 = wave_area[hwreg_m0_offset]; - wave_state.lds_size_dw = lds_size_dw; - wave_state.lds = lds; - -#define SQ_WAVE_TRAPSTS_XNACK_ERROR(x) (((x) >> 0x1C) & 0x1) - - if (SQ_WAVE_TRAPSTS_XNACK_ERROR(wave_state.regs.trapsts)) { - // Correct the PC: context save handler subtracted 0x8. - wave_state.regs.pc += 0x8; - } - - wave_states.push_back(wave_state); - } - } - - return wave_states; -} } // namespace amd diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp index 754ce00cc3..0a58d2986f 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp @@ -369,32 +369,4 @@ hsa_status_t CpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, return HSA_STATUS_ERROR; } -hsa_status_t CpuAgent::HostQueueCreate(hsa_region_t region, uint32_t ring_size, - hsa_queue_type32_t type, uint32_t features, - hsa_signal_t doorbell_signal, core::Queue** queue) { - core::HostQueue* host_queue = - new core::HostQueue(*this, region, ring_size, type, features, doorbell_signal); - - if (!host_queue->IsValid()) { - delete host_queue; - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - - queues_.emplace_back(host_queue); - *queue = host_queue; - - return HSA_STATUS_SUCCESS; -} - -hsa_status_t CpuAgent::QueueDestroy(core::Queue* queue) { - auto it = std::find_if( - queues_.begin(), queues_.end(), - [&](std::unique_ptr& queue_ptr) { return queue_ptr.get() == queue; }); - - assert(it != queues_.end() && "attempt to destroy an untracked queue"); - queues_.erase(it); - - return HSA_STATUS_SUCCESS; -} - } // namespace amd diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_debugger.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_debugger.cpp deleted file mode 100644 index 7c81a8c800..0000000000 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_debugger.cpp +++ /dev/null @@ -1,307 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// -// The University of Illinois/NCSA -// Open Source License (NCSA) -// -// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// -// Developed by: -// -// AMD Research and AMD HSA Software Development -// -// Advanced Micro Devices, Inc. -// -// www.amd.com -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to -// deal with the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in -// the documentation and/or other materials provided with the distribution. -// - Neither the names of Advanced Micro Devices, Inc, -// nor the names of its contributors may be used to endorse or promote -// products derived from this Software without specific prior written -// permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -// DEALINGS WITH THE SOFTWARE. -// -//////////////////////////////////////////////////////////////////////////////// - -#include "core/inc/amd_debugger.h" -#include "core/inc/amd_loader_context.hpp" -#include "core/inc/amd_aql_queue.h" - -#include -#include -#include -#include -#include -#include -#include - -namespace amd { - -void Debugger::HandleFault(const HsaMemoryAccessFault& fault, GpuAgentInt* agent) { - std::stringstream err; - - uint64_t fault_page_idx = fault.VirtualAddress >> 0xC; - err << "\nMemory access fault by GPU node " << agent->node_id(); - err << " for address 0x" << std::hex << std::uppercase << fault_page_idx << "xxx ("; - - if (fault.Failure.NotPresent == 1) { - err << "page not present"; - } else if (fault.Failure.ReadOnly == 1) { - err << "write access to a read-only page"; - } else if (fault.Failure.NoExecute == 1) { - err << "execute access to a non-executable page"; - } else if (fault.Failure.ECC == 1) { - err << "uncorrectable ECC failure"; - } else { - err << "unknown reason"; - } - - err << ")\n\n"; - - if (core::Runtime::runtime_singleton_->flag().debug_fault() != Flag::DEBUG_FAULT_ANALYZE) { - if (agent->isa()->GetMajorVersion() >= 9) { - err << "For more detail set: HSA_DEBUG_FAULT=\"analyze\"\n\n"; - } - - std::cerr << err.str(); - std::abort(); - } - - WaveStates wave_states = agent->GetWaveStates(); - - for (WaveState& wave_state : wave_states) { -#define SQ_WAVE_TRAPSTS_XNACK_ERROR(x) (((x) >> 0x1C) & 0x1) - - if (SQ_WAVE_TRAPSTS_XNACK_ERROR(wave_state.regs.trapsts)) { - err << "Wavefront found in XNACK error state:\n\n"; - err << " PC: 0x" << std::setw(0x10) << std::setfill('0') << wave_state.regs.pc << "\n"; - err << " EXEC: 0x" << std::setw(0x10) << std::setfill('0') << wave_state.regs.exec << "\n"; - err << " STATUS: 0x" << std::setw(0x8) << std::setfill('0') << wave_state.regs.status << "\n"; - err << "TRAPSTS: 0x" << std::setw(0x8) << std::setfill('0') << wave_state.regs.trapsts - << "\n"; - err << " M0: 0x" << std::setw(0x8) << std::setfill('0') << wave_state.regs.m0 << "\n\n"; - - uint32_t n_sgpr_cols = 4; - uint32_t n_sgpr_rows = wave_state.num_sgprs / n_sgpr_cols; - - for (uint32_t sgpr_row = 0; sgpr_row < n_sgpr_rows; ++sgpr_row) { - err << " "; - - for (uint32_t sgpr_col = 0; sgpr_col < n_sgpr_cols; ++sgpr_col) { - uint32_t sgpr_idx = (sgpr_row * n_sgpr_cols) + sgpr_col; - uint32_t sgpr_val = wave_state.sgprs[sgpr_idx]; - - std::stringstream sgpr_str; - sgpr_str << "s" << sgpr_idx; - - err << std::setw(6) << std::setfill(' ') << sgpr_str.str(); - err << ": 0x" << std::setw(8) << std::setfill('0') << sgpr_val; - } - - err << "\n"; - } - - err << "\n"; - - uint32_t n_vgpr_cols = 4; - uint32_t n_vgpr_rows = wave_state.num_vgprs / n_vgpr_cols; - - for (uint32_t lane_idx = 0; lane_idx < wave_state.num_vgpr_lanes; ++lane_idx) { - err << "Lane 0x" << lane_idx << "\n"; - - for (uint32_t vgpr_row = 0; vgpr_row < n_vgpr_rows; ++vgpr_row) { - err << " "; - - for (uint32_t vgpr_col = 0; vgpr_col < n_vgpr_cols; ++vgpr_col) { - uint32_t vgpr_idx = (vgpr_row * n_vgpr_cols) + vgpr_col; - uint32_t vgpr_val = wave_state.vgprs[(vgpr_idx * wave_state.num_vgpr_lanes) + lane_idx]; - - std::stringstream vgpr_str; - vgpr_str << "v" << vgpr_idx; - - err << std::setw(6) << std::setfill(' ') << vgpr_str.str(); - err << ": 0x" << std::setw(8) << std::setfill('0') << vgpr_val; - } - - err << "\n"; - } - } - - err << "\n"; - - if (wave_state.lds) { - err << "LDS:\n\n"; - - uint32_t n_lds_cols = 4; - uint32_t n_lds_rows = wave_state.lds_size_dw / n_lds_cols; - - for (uint32_t lds_row = 0; lds_row < n_lds_rows; ++lds_row) { - uint32_t lds_addr = lds_row * n_lds_cols * 4; - - err << "0x" << std::setw(4) << std::setfill('0') << lds_addr << ":"; - - for (uint32_t lds_col = 0; lds_col < n_lds_cols; ++lds_col) { - uint32_t lds_idx = (lds_row * n_lds_cols) + lds_col; - uint32_t lds_val = wave_state.lds[lds_idx]; - - err << " 0x" << std::setw(8) << std::setfill('0') << lds_val; - } - - err << "\n"; - } - - err << "\n"; - } - - // Attempt to match the PC to a loaded code object. - amd::hsa::loader::LoadedCodeObject* pc_code_obj = nullptr; - uint64_t pc_code_obj_offset = 0; - - auto iter_execs = [&](hsa_executable_t exec) { - auto iter_code_objs = [&](hsa_loaded_code_object_t code_obj) { - auto iter_segments = [&](amd_loaded_segment_t segment) { - auto segment_int = amd::hsa::loader::LoadedSegment::Object(segment); - - uint64_t load_base, load_size; - segment_int->GetInfo(AMD_LOADED_SEGMENT_INFO_LOAD_BASE_ADDRESS, &load_base); - segment_int->GetInfo(AMD_LOADED_SEGMENT_INFO_SIZE, &load_size); - - if ((wave_state.regs.pc >= load_base) && - (wave_state.regs.pc < (load_base + load_size))) { - pc_code_obj = amd::hsa::loader::LoadedCodeObject::Object(code_obj); - pc_code_obj_offset = wave_state.regs.pc - load_base; - } - - return HSA_STATUS_SUCCESS; - }; - - amd::hsa::loader::LoadedCodeObject::Object(code_obj)->IterateLoadedSegments( - [](amd_loaded_segment_t segment, void* data) { - return (*reinterpret_cast(data))(segment); - }, - &iter_segments); - - return HSA_STATUS_SUCCESS; - }; - - amd::hsa::loader::Executable::Object(exec)->IterateLoadedCodeObjects( - [](hsa_loaded_code_object_t code_obj, void* data) { - return (*reinterpret_cast(data))(code_obj); - }, - &iter_code_objs); - - return HSA_STATUS_SUCCESS; - }; - - core::Runtime::runtime_singleton_->loader()->IterateExecutables( - [](hsa_executable_t exec, void* data) { - return (*reinterpret_cast(data))(exec); - }, - &iter_execs); - - if (pc_code_obj) { - // Write the code object to a temporary file. - uint64_t elf_addr; - size_t elf_size; - pc_code_obj->GetInfo(AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE, &elf_addr); - pc_code_obj->GetInfo(AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE_SIZE, &elf_size); - - char code_obj_path[] = "/tmp/hsartXXXXXX"; - int code_obj_fd = ::mkstemp(code_obj_path); - ::write(code_obj_fd, (const void*)uintptr_t(elf_addr), elf_size); - ::close(code_obj_fd); - - // Invoke binutils objdump on the code object. - int pipe_fd[2]; - ::pipe(pipe_fd); - - pid_t pid = ::fork(); - - if (pid == 0) { - ::dup2(pipe_fd[1], STDOUT_FILENO); - ::dup2(pipe_fd[1], STDERR_FILENO); - ::close(pipe_fd[0]); - ::close(pipe_fd[1]); - - // Disassemble X bytes before/after the PC. - uint32_t disasm_context = 0x20; - - std::stringstream arg_start_addr, arg_stop_addr; - arg_start_addr << "--start-addr=0x" << std::hex << (pc_code_obj_offset - disasm_context); - arg_stop_addr << "--stop-addr=0x" << std::hex << (pc_code_obj_offset + disasm_context); - - std::exit(execlp("objdump", "-d", "-S", "-l", arg_start_addr.str().c_str(), - arg_stop_addr.str().c_str(), code_obj_path, nullptr)); - } - - // Collect the output of objdump. - ::close(pipe_fd[1]); - - std::vector objdump_out_buf; - std::vector buf(0x1000); - ssize_t n_read_b; - - while ((n_read_b = read(pipe_fd[0], buf.data(), buf.size())) > 0) { - objdump_out_buf.insert(objdump_out_buf.end(), &buf[0], &buf[n_read_b]); - } - - ::close(pipe_fd[0]); - - int child_status = 0; - int ret = ::waitpid(pid, &child_status, 0); - - if (ret != -1 && child_status == 0) { - // Attempt to trim the leading output from objdump. - std::string objdump_out(objdump_out_buf.begin(), objdump_out_buf.end()); - size_t trim_start = objdump_out.find(":\n\n") + 3; - - if (trim_start != objdump_out.npos) { - objdump_out = objdump_out.substr(trim_start); - } - - // Attempt to add a PC indicator inside the disassembly text. - std::stringstream pc_offset_find; - pc_offset_find << std::hex << pc_code_obj_offset << ":\t"; - size_t replace_idx = objdump_out.find(pc_offset_find.str()); - - if (replace_idx != objdump_out.npos) { - std::stringstream pc_offset_replace; - pc_offset_replace << std::hex << pc_code_obj_offset << ": >>>>>\t"; - objdump_out.replace(replace_idx, pc_offset_find.str().size(), pc_offset_replace.str()); - err << objdump_out << "\n"; - } else { - err << objdump_out; - err << "\nPC offset: " << std::hex << pc_code_obj_offset << "\n\n"; - } - } else { - err << "(Disassembly unavailable - is amdgcn-capable objdump in PATH?)\n\n"; - } - - ::unlink(code_obj_path); - } else { - err << "(Cannot match PC to a loaded code object)\n\n"; - } - } - } - - std::cerr << err.str(); - std::abort(); -} -} diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index fcdab6eeca..1fa48a94da 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -73,8 +73,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) properties_(node_props), current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT), blits_(), - queue_util_(nullptr), - queue_blit_(nullptr), + queues_(), local_region_(NULL), is_kv_device_(false), trap_code_buf_(NULL), @@ -138,7 +137,9 @@ GpuAgent::~GpuAgent() { } } - queues_.clear(); + for (int i = 0; i < QueueCount; ++i) { + delete queues_[i]; + } if (end_ts_base_addr_ != NULL) { core::Runtime::runtime_singleton_->FreeMemory(end_ts_base_addr_); @@ -580,16 +581,16 @@ void GpuAgent::InitDma() { // Fall back to blit kernel if SDMA is unavailable. if (blits_[BlitHostToDev] == NULL) { // Create a dedicated compute queue for host-to-device blits. - queue_blit_ = CreateInterceptibleQueue(); - assert(queue_blit_ != NULL && "Queue creation failed"); + queues_[QueueBlitOnly] = CreateInterceptibleQueue(); + assert(queues_[QueueBlitOnly] != NULL && "Queue creation failed"); - blits_[BlitHostToDev] = CreateBlitKernel(queue_blit_); + blits_[BlitHostToDev] = CreateBlitKernel(queues_[QueueBlitOnly]); assert(blits_[BlitHostToDev] != NULL && "Blit creation failed"); } if (blits_[BlitDevToHost] == NULL) { // Share utility queue with device-to-host blits. - blits_[BlitDevToHost] = CreateBlitKernel(queue_util_); + blits_[BlitDevToHost] = CreateBlitKernel(queues_[QueueUtility]); assert(blits_[BlitDevToHost] != NULL && "Blit creation failed"); } @@ -604,14 +605,14 @@ hsa_status_t GpuAgent::PostToolsInit() { BindTrapHandler(); // Defer utility queue creation to allow tools to intercept. - queue_util_ = CreateInterceptibleQueue(); + queues_[QueueUtility] = CreateInterceptibleQueue(); - if (queue_util_ == NULL) { + if (queues_[QueueUtility] == NULL) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } // Share utility queue with device-to-device blits. - blits_[BlitDevToDev] = CreateBlitKernel(queue_util_); + blits_[BlitDevToDev] = CreateBlitKernel(queues_[QueueUtility]); if (blits_[BlitDevToDev] == NULL) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; @@ -925,7 +926,6 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, event_callback, data, is_kv_device_); if (hw_queue && hw_queue->IsValid()) { // return queue - queues_.emplace_back(hw_queue); *queue = hw_queue; return HSA_STATUS_SUCCESS; } @@ -935,28 +935,6 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } -WaveStates GpuAgent::GetWaveStates() { - WaveStates wave_states; - - for (auto& queue : queues_) { - WaveStates queue_wave_states = queue->GetWaveStates(); - wave_states.insert(wave_states.end(), queue_wave_states.begin(), queue_wave_states.end()); - } - - return wave_states; -} - -hsa_status_t GpuAgent::QueueDestroy(core::Queue* queue) { - auto it = std::find_if(queues_.begin(), queues_.end(), [&](std::unique_ptr& queue_ptr) { - return static_cast(queue_ptr.get()) == queue; - }); - - assert(it != queues_.end() && "attempt to destroy an untracked queue"); - queues_.erase(it); - - return HSA_STATUS_SUCCESS; -} - void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { bool need_queue_scratch_base = (isa_->GetMajorVersion() > 8); @@ -1241,7 +1219,7 @@ void GpuAgent::InvalidateCodeCaches() { cache_inv[6] = 0; // Submit the command to the utility queue and wait for it to complete. - queue_util_->ExecutePM4(cache_inv, sizeof(cache_inv)); + queues_[QueueUtility]->ExecutePM4(cache_inv, sizeof(cache_inv)); } } // namespace diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/host_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/host_queue.cpp index 29f8ad24b1..5d3d1b2501 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/host_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/host_queue.cpp @@ -46,9 +46,12 @@ #include "core/util/utils.h" namespace core { -HostQueue::HostQueue(Agent& agent, hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type, - uint32_t features, hsa_signal_t doorbell_signal) - : Queue(agent), size_(ring_size), active_(false) { +HostQueue::HostQueue(hsa_region_t region, uint32_t ring_size, + hsa_queue_type32_t type, uint32_t features, + hsa_signal_t doorbell_signal) + : Queue(), + size_(ring_size), + active_(false) { if (!Shared::IsSharedObjectAllocationValid()) { return; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp index 81c01565c4..64129ac03f 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa.cpp @@ -608,14 +608,17 @@ hsa_status_t hsa_soft_queue_create(hsa_region_t region, uint32_t size, const core::Signal* signal = core::Signal::Convert(doorbell_signal); IS_VALID(signal); - core::Agent* agent = core::Runtime::runtime_singleton_->cpu_agents().front(); - core::Queue* host_queue = nullptr; - hsa_status_t status = - agent->HostQueueCreate(region, size, type, features, doorbell_signal, &host_queue); + core::HostQueue* host_queue = + new core::HostQueue(region, size, type, features, doorbell_signal); - *queue = (host_queue ? core::Queue::Convert(host_queue) : nullptr); + if (!host_queue->active()) { + delete host_queue; + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } - return status; + *queue = core::Queue::Convert(host_queue); + + return HSA_STATUS_SUCCESS; } /// @brief Api to destroy a user mode queue @@ -628,7 +631,8 @@ hsa_status_t hsa_queue_destroy(hsa_queue_t* queue) { IS_BAD_PTR(queue); core::Queue* cmd_queue = core::Queue::Convert(queue); IS_VALID(cmd_queue); - return cmd_queue->agent().QueueDestroy(cmd_queue); + delete cmd_queue; + return HSA_STATUS_SUCCESS; } /// @brief Api to inactivate a user mode queue diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp index 1bdedcb9b7..442fe82f52 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -53,7 +53,6 @@ #include "core/inc/hsa_ext_interface.h" #include "core/inc/amd_cpu_agent.h" -#include "core/inc/amd_debugger.h" #include "core/inc/amd_gpu_agent.h" #include "core/inc/amd_memory_region.h" #include "core/inc/amd_topology.h" @@ -917,24 +916,55 @@ void Runtime::BindVmFaultHandler() { return; } - SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_), HSA_SIGNAL_CONDITION_NE, 0, - VMFaultHandler, this); + SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_), + HSA_SIGNAL_CONDITION_NE, 0, VMFaultHandler, + reinterpret_cast(vm_fault_signal_)); } } bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { - Runtime* runtime = reinterpret_cast(arg); - assert(runtime->vm_fault_signal_ != NULL); + core::InterruptSignal* vm_fault_signal = + reinterpret_cast(arg); - HsaEvent* vm_fault_event = runtime->vm_fault_signal_->EopEvent(); - const HsaMemoryAccessFault& fault = vm_fault_event->EventData.EventData.MemoryAccessFault; + assert(vm_fault_signal != NULL); - auto agent_it = std::find_if(runtime->gpu_agents_.begin(), runtime->gpu_agents_.end(), - [&](Agent* agent) { return agent->node_id() == fault.NodeId; }); - assert(agent_it != runtime->gpu_agents_.end()); + if (vm_fault_signal == NULL) { + return false; + } - amd::Debugger::HandleFault(fault, static_cast(*agent_it)); + if (runtime_singleton_->flag().enable_vm_fault_message()) { + HsaEvent* vm_fault_event = vm_fault_signal->EopEvent(); + const HsaMemoryAccessFault& fault = + vm_fault_event->EventData.EventData.MemoryAccessFault; + + std::string reason = ""; + if (fault.Failure.NotPresent == 1) { + reason += "Page not present or supervisor privilege"; + } else if (fault.Failure.ReadOnly == 1) { + reason += "Write access to a read-only page"; + } else if (fault.Failure.NoExecute == 1) { + reason += "Execute access to a page marked NX"; + } else if (fault.Failure.GpuAccess == 1) { + reason += "Host access only"; + } else if (fault.Failure.ECC == 1) { + reason += "ECC failure (if supported by HW)"; + } else { + reason += "Unknown"; + } + + fprintf(stderr, + "Memory access fault by GPU node-%u on address %p%s. Reason: %s.\n", + fault.NodeId, reinterpret_cast(fault.VirtualAddress), + (fault.Failure.Imprecise == 1) ? "(may not be exact address)" : "", + reason.c_str()); + } else { + assert(false && "GPU memory access fault."); + } + + std::abort(); + + // No need to keep the signal because we are done. return false; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h index c6ec786bb6..d7add470c5 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h @@ -43,8 +43,6 @@ #ifndef HSA_RUNTIME_CORE_INC_FLAG_H_ #define HSA_RUNTIME_CORE_INC_FLAG_H_ -#include -#include #include #include @@ -62,15 +60,8 @@ class Flag { std::string var = os::GetEnvVar("HSA_CHECK_FLAT_SCRATCH"); check_flat_scratch_ = (var == "1") ? true : false; - var = os::GetEnvVar("HSA_DEBUG_FAULT"); - std::transform(var.begin(), var.end(), var.begin(), - [](unsigned char c) { return std::tolower(c); }); - - if (var == "analyze") { - debug_fault_ = DEBUG_FAULT_ANALYZE; - } else { - debug_fault_ = DEBUG_FAULT_OFF; - } + var = os::GetEnvVar("HSA_ENABLE_VM_FAULT_MESSAGE"); + enable_vm_fault_message_ = (var == "0") ? false : true; var = os::GetEnvVar("HSA_ENABLE_QUEUE_FAULT_MESSAGE"); enable_queue_fault_message_ = (var == "0") ? false : true; @@ -105,15 +96,10 @@ class Flag { tools_lib_names_ = os::GetEnvVar("HSA_TOOLS_LIB"); } - enum DebugFaultEnum { - DEBUG_FAULT_OFF, - DEBUG_FAULT_ANALYZE, - }; - bool check_flat_scratch() const { return check_flat_scratch_; } - DebugFaultEnum debug_fault() const { return debug_fault_; } - + bool enable_vm_fault_message() const { return enable_vm_fault_message_; } + bool enable_queue_fault_message() const { return enable_queue_fault_message_; } bool enable_interrupt() const { return enable_interrupt_; } @@ -138,7 +124,7 @@ class Flag { private: bool check_flat_scratch_; - DebugFaultEnum debug_fault_; + bool enable_vm_fault_message_; bool enable_interrupt_; bool enable_sdma_; bool emulate_aql_;