Revert "Implement memory fault analysis through context save area"

This reverts commit 498f3a7188.

Change-Id: Ibf11b764b383b9be291f3009a30550e1a1e2d115


[ROCm/ROCR-Runtime commit: 5b4df54b10]
This commit is contained in:
Kenny Ho
2017-06-14 14:14:38 -04:00
والد 231d7e8608
کامیت 415027b89f
16 فایلهای تغییر یافته به همراه 89 افزوده شده و 663 حذف شده
@@ -120,7 +120,6 @@ set ( SRCS "core/util/lnx/os_linux.cpp"
"core/runtime/amd_blit_kernel.cpp"
"core/runtime/amd_blit_sdma.cpp"
"core/runtime/amd_cpu_agent.cpp"
"core/runtime/amd_debugger.cpp"
"core/runtime/amd_gpu_agent.cpp"
"core/runtime/amd_aql_queue.cpp"
"core/runtime/amd_loader_context.cpp"
@@ -218,12 +218,6 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
uint32_t group_segment_size,
Queue** queue) = 0;
virtual hsa_status_t HostQueueCreate(hsa_region_t region, uint32_t ring_size,
hsa_queue_type32_t type, uint32_t features,
hsa_signal_t doorbell_signal, Queue** queue) = 0;
virtual hsa_status_t QueueDestroy(Queue* queue) = 0;
// @brief Query the value of an attribute.
//
// @param [in] attribute Attribute to query.
@@ -44,13 +44,12 @@
#define HSA_RUNTIME_CORE_INC_AMD_HW_AQL_COMMAND_PROCESSOR_H_
#include "core/inc/runtime.h"
#include "core/inc/amd_gpu_agent.h"
#include "core/inc/signal.h"
#include "core/inc/queue.h"
#include "core/inc/amd_gpu_agent.h"
#include "core/util/locks.h"
namespace amd {
/// @brief Encapsulates HW Aql Command Processor functionality. It
/// provide the interface for things such as Doorbell register, read,
/// write pointers and a buffer.
@@ -185,8 +184,6 @@ class AqlQueue : public core::Queue, public core::Signal {
// @brief Submits a block of PM4 and waits until it has been executed.
void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override;
WaveStates GetWaveStates();
/// @brief This operation is illegal
hsa_signal_value_t LoadRelaxed() override {
assert(false);
@@ -45,8 +45,6 @@
#ifndef HSA_RUNTIME_CORE_INC_AMD_CPU_AGENT_H_
#define HSA_RUNTIME_CORE_INC_AMD_CPU_AGENT_H_
#include <list>
#include <memory>
#include <vector>
#include "hsakmt.h"
@@ -105,12 +103,6 @@ class CpuAgent : public core::Agent {
uint32_t group_segment_size,
core::Queue** queue) override;
hsa_status_t HostQueueCreate(hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type,
uint32_t features, hsa_signal_t doorbell_signal,
core::Queue** queue) override;
hsa_status_t QueueDestroy(core::Queue* queue) override;
// @brief Returns number of data caches.
__forceinline size_t num_cache() const { return cache_props_.size(); }
@@ -162,8 +154,6 @@ class CpuAgent : public core::Agent {
// @brief Array of regions owned by this agent.
std::vector<const core::MemoryRegion*> regions_;
std::list<std::unique_ptr<core::Queue>> queues_;
DISALLOW_COPY_AND_ASSIGN(CpuAgent);
};
@@ -1,57 +0,0 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#ifndef HSA_RUNTIME_CORE_INC_AMD_DEBUGGER_H_
#define HSA_RUNTIME_CORE_INC_AMD_DEBUGGER_H_
#include "core/inc/amd_gpu_agent.h"
#include "hsakmt.h"
namespace amd {
// @brief Entry point for reporting GPU memory access faults.
//
// The class has no state; it only groups the static fault handler below.
class Debugger {
public:
// @brief Report a memory access fault raised on behalf of a GPU agent.
//
// @param [in] fault Fault record describing the faulting address and the
// failure reason bits.
// @param [in] agent GPU agent the fault is attributed to; used for its node
// id, ISA version and saved wave states.
//
// @note The definition of this function in amd_debugger.cpp writes a report
// to std::cerr and then calls std::abort() on every path, so callers should
// not expect it to return.
static void HandleFault(const HsaMemoryAccessFault& fault, GpuAgentInt* agent);
};
}
#endif
@@ -45,8 +45,6 @@
#ifndef HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_
#define HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_
#include <list>
#include <memory>
#include <vector>
#include "hsakmt.h"
@@ -61,42 +59,6 @@
namespace amd {
class MemoryRegion;
class AqlQueue;
// @brief Decoded view of one wavefront's saved state.
//
// Instances are produced by AqlQueue::GetWaveStates, which sets the pointer
// members (sgprs, vgprs, lds) to addresses inside the queue's context save
// area. The struct therefore does not own the memory it points to and never
// frees it.
struct WaveState {
// Number of SGPRs allocated per wavefront.
uint32_t num_sgprs;
// Array of packed SGPR data.
// Holds num_sgprs 32-bit values.
uint32_t* sgprs;
// Number of VGPRs allocated per wavefront.
uint32_t num_vgprs;
// Number of lanes in each VGPR.
uint32_t num_vgpr_lanes;
// Array of packed VGPR data.
// VGPR value = vgprs[(vgpr_idx * num_vgpr_lanes) + lane_idx]
uint32_t* vgprs;
// Data for miscellaneous registers.
// pc and exec are assembled from separate low/high 32-bit words by the
// producer; status, trapsts and m0 are copied as single 32-bit words.
struct {
uint64_t pc;
uint64_t exec;
uint32_t status;
uint32_t trapsts;
uint32_t m0;
} regs;
// LDS allocation size for the work group, in 32-bit words.
uint32_t lds_size_dw;
// Packed LDS data for the work group.
// The producer sets this to nullptr when the work group has no LDS
// allocation, so check it before dereferencing.
uint32_t* lds;
};
// Collection of wave states, one entry per saved wavefront.
typedef std::vector<WaveState> WaveStates;
// @brief Contains scratch memory information.
struct ScratchInfo {
@@ -113,14 +75,6 @@ class GpuAgentInt : public core::Agent {
GpuAgentInt(uint32_t node_id)
: core::Agent(node_id, core::Agent::DeviceType::kAmdGpuDevice) {}
// @brief GpuAgent does not support HostQueueCreation.
//
// All parameters are ignored and @p queue is never written.
//
// @retval HSA_STATUS_ERROR_INVALID_AGENT Always. In builds where assert() is
// active the call aborts before returning; with NDEBUG defined the assert is
// compiled out and only the error status is reported.
hsa_status_t HostQueueCreate(hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type,
uint32_t features, hsa_signal_t doorbell_signal,
core::Queue** queue) override {
assert(false && "GpuAgent::HostQueueCreate not implemented");
return HSA_STATUS_ERROR_INVALID_AGENT;
}
// @brief Initialize DMA queue.
//
// @retval HSA_STATUS_SUCCESS DMA queue initialization is successful.
@@ -186,8 +140,6 @@ class GpuAgentInt : public core::Agent {
// @brief Invalidate caches on the agent which may hold code object data.
virtual void InvalidateCodeCaches() = 0;
virtual WaveStates GetWaveStates() = 0;
// @brief Sets the coherency type of this agent.
//
// @param [in] type New coherency type.
@@ -300,8 +252,6 @@ class GpuAgent : public GpuAgentInt {
uint32_t group_segment_size,
core::Queue** queue) override;
hsa_status_t QueueDestroy(core::Queue* queue) override;
// @brief Override from amd::GpuAgentInt.
void AcquireQueueScratch(ScratchInfo& scratch) override;
@@ -318,8 +268,6 @@ class GpuAgent : public GpuAgentInt {
// @brief Override from amd::GpuAgentInt.
void InvalidateCodeCaches() override;
WaveStates GetWaveStates() override;
// @brief Override from amd::GpuAgentInt.
bool current_coherency_type(hsa_amd_coherency_type_t type) override;
@@ -430,10 +378,14 @@ class GpuAgent : public GpuAgentInt {
core::Blit* blits_[BlitCount];
std::list<std::unique_ptr<AqlQueue>> queues_;
// @brief AQL queues for cache management and blit compute usage.
enum QueueEnum {
QueueUtility, // Cache management and device to {host,device} blit compute
QueueBlitOnly, // Host to device blit
QueueCount
};
core::Queue* queue_util_;
core::Queue* queue_blit_;
core::Queue* queues_[QueueCount];
// @brief Mutex to protect the update to coherency type.
KernelMutex coherency_lock_;
@@ -51,7 +51,7 @@
namespace core {
class HostQueue : public Queue {
public:
HostQueue(Agent& agent, hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type,
HostQueue(hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type,
uint32_t features, hsa_signal_t doorbell_signal);
~HostQueue();
@@ -45,7 +45,6 @@
#ifndef HSA_RUNTME_CORE_INC_COMMAND_QUEUE_H_
#define HSA_RUNTME_CORE_INC_COMMAND_QUEUE_H_
#include <sstream>
#include <vector>
#include "core/common/shared.h"
@@ -56,8 +55,6 @@
#include "inc/amd_hsa_queue.h"
namespace core {
class Agent;
struct AqlPacket {
union {
@@ -140,7 +137,7 @@ All functions other than Convert and public_handle must be virtual.
class Queue : public Checked<0xFA3906A679F9DB49>,
public Shared<SharedQueue, AMD_QUEUE_ALIGN_BYTES> {
public:
explicit Queue(Agent& agent) : Shared(), amd_queue_(shared_object()->amd_queue), agent_(agent) {
Queue() : Shared(), amd_queue_(shared_object()->amd_queue) {
if (!Shared::IsSharedObjectAllocationValid()) {
return;
}
@@ -311,8 +308,6 @@ class Queue : public Checked<0xFA3906A679F9DB49>,
hsa_queue_t* public_handle() const { return public_handle_; }
Agent& agent() { return agent_; }
protected:
static void set_public_handle(Queue* ptr, hsa_queue_t* handle) {
ptr->do_set_public_handle(handle);
@@ -322,8 +317,6 @@ class Queue : public Checked<0xFA3906A679F9DB49>,
}
hsa_queue_t* public_handle_;
Agent& agent_;
private:
DISALLOW_COPY_AND_ASSIGN(Queue);
};
@@ -84,9 +84,10 @@ void* AqlQueue::operator new(size_t size) {
void AqlQueue::operator delete(void* ptr) { _aligned_free(ptr); }
AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, ScratchInfo& scratch,
core::HsaEventCallback callback, void* err_data, bool is_kv)
: Queue(*agent),
AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id,
ScratchInfo& scratch, core::HsaEventCallback callback,
void* err_data, bool is_kv)
: Queue(),
Signal(0),
ring_buf_(NULL),
ring_buf_alloc_bytes_(0),
@@ -961,113 +962,4 @@ void AqlQueue::InitScratchSRD() {
return;
}
// @brief Decode the state of every wavefront saved for this queue.
//
// Queries the queue's control stack and context save area through
// hsaKmtGetQueueInfo, then walks the control stack and emits one WaveState per
// wavefront entry. The register and LDS pointers stored in each WaveState point
// directly into the save area; no register data is copied.
//
// @retval Wave states in control-stack order. The result is empty when the
// queue information cannot be retrieved, and is truncated at the first entry
// that would lie outside the reported save area.
WaveStates AqlQueue::GetWaveStates() {
  WaveStates wave_states;

  // Retrieve the control stack and context save area for the queue.
  HsaQueueInfo queue_info;
  HSAKMT_STATUS status = hsaKmtGetQueueInfo(queue_id_, &queue_info);
  if (status != HSAKMT_STATUS_SUCCESS) {
    return wave_states;
  }

  // The control stack is processed from start to end.
  // The save area is processed from end to start.
  uint32_t* ctl_stack = reinterpret_cast<uint32_t*>(queue_info.ControlStackTop);
  const uintptr_t save_area_begin = uintptr_t(queue_info.UserContextSaveArea);
  uint32_t* wave_area =
      reinterpret_cast<uint32_t*>(save_area_begin + queue_info.SaveAreaSizeInBytes);
  uint32_t ctl_stack_ndw = uint32_t(queue_info.ControlStackUsedInBytes / sizeof(uint32_t));

  // Nothing can be decoded without both areas.
  if (ctl_stack == nullptr || save_area_begin == 0) {
    return wave_states;
  }

  // Extracts the bit field (value >> shift) & mask from a 32-bit word.
  const auto field = [](uint32_t value, uint32_t shift, uint32_t mask) -> uint32_t {
    return (value >> shift) & mask;
  };

  // Control stack persists resource allocation until changed by a command.
  uint32_t n_vgprs = 0;
  uint32_t n_sgprs = 0;
  uint32_t lds_size_dw = 0;

  // LDS is saved per-workgroup but the stack is parsed per-wavefront.
  // Track the LDS save area for the current workgroup.
  uint32_t* lds = nullptr;

  // Parse each write to COMPUTE_RELAUNCH in sequence.
  // First two dwords are SET_SH_REG leader.
  for (uint32_t idx = 2; idx < ctl_stack_ndw; ++idx) {
    const uint32_t relaunch = ctl_stack[idx];

    const bool is_event = field(relaunch, 0x1E, 0x1) != 0;
    const bool is_state = field(relaunch, 0x1F, 0x1) != 0;

    // Event entries carry neither allocation state nor a wavefront reference.
    if (is_event) {
      continue;
    }

    if (is_state) {
      // Resource allocation state change, update tracked state.
      n_vgprs = (0x1 + field(relaunch, 0x0, 0x3F)) * 0x4;
      n_sgprs = ((0x1 + field(relaunch, 0x6, 0x7)) - 0x1 /* no trap SGPRs */) * 0x10;
      lds_size_dw = field(relaunch, 0x9, 0x1FF) * 0x80;
      continue;
    }

    // Reference to one wavefront in the save area.
    const bool first_wave_in_group = field(relaunch, 0x11, 0x1) != 0;

    // Save area layout is fixed by context save trap handler and SPI.
    // All offsets and sizes below are in 32-bit words.
    uint32_t vgprs_offset = 0x0;
    uint32_t sgprs_offset = vgprs_offset + n_vgprs * 0x40;
    uint32_t hwregs_offset = sgprs_offset + n_sgprs;
    uint32_t lds_offset = hwregs_offset + 0x20;
    uint32_t unused_offset = lds_offset + (first_wave_in_group ? lds_size_dw : 0x0);
    uint32_t wave_area_size = unused_offset + 0x10;  // trap SGPRs were allocated but not saved

    uint32_t hwreg_m0_offset = hwregs_offset + 0x0;
    uint32_t hwreg_pc_lo_offset = hwregs_offset + 0x1;
    uint32_t hwreg_pc_hi_offset = hwregs_offset + 0x2;
    uint32_t hwreg_exec_lo_offset = hwregs_offset + 0x3;
    uint32_t hwreg_exec_hi_offset = hwregs_offset + 0x4;
    uint32_t hwreg_status_offset = hwregs_offset + 0x5;
    uint32_t hwreg_trapsts_offset = hwregs_offset + 0x6;

    // Stop rather than read below the start of the save area if the control
    // stack describes more wave data than the save area holds.
    const uintptr_t remaining_dw = (uintptr_t(wave_area) - save_area_begin) / sizeof(uint32_t);
    if (wave_area_size > remaining_dw) {
      break;
    }

    // Find beginning of wavefront state in the save area.
    wave_area -= wave_area_size;

    if (first_wave_in_group) {
      // Track the LDS save area for this workgroup.
      lds = (lds_size_dw > 0) ? (wave_area + lds_offset) : nullptr;
    }

    WaveState wave_state;
    wave_state.num_sgprs = n_sgprs;
    wave_state.sgprs = wave_area + sgprs_offset;
    wave_state.num_vgprs = n_vgprs;
    wave_state.num_vgpr_lanes = 0x40;
    wave_state.vgprs = wave_area + vgprs_offset;
    wave_state.regs.pc = (uint64_t(wave_area[hwreg_pc_lo_offset]) |
                          (uint64_t(wave_area[hwreg_pc_hi_offset]) << 0x20));
    wave_state.regs.exec = uint64_t(wave_area[hwreg_exec_lo_offset]) |
                           (uint64_t(wave_area[hwreg_exec_hi_offset]) << 0x20);
    wave_state.regs.status = wave_area[hwreg_status_offset];
    wave_state.regs.trapsts = wave_area[hwreg_trapsts_offset];
    wave_state.regs.m0 = wave_area[hwreg_m0_offset];
    wave_state.lds_size_dw = lds_size_dw;
    wave_state.lds = lds;

    // Bit 0x1C of TRAPSTS flags an XNACK error.
    if (field(wave_state.regs.trapsts, 0x1C, 0x1) != 0) {
      // Correct the PC: context save handler subtracted 0x8.
      wave_state.regs.pc += 0x8;
    }

    wave_states.push_back(wave_state);
  }

  return wave_states;
}
} // namespace amd
@@ -369,32 +369,4 @@ hsa_status_t CpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
return HSA_STATUS_ERROR;
}
// @brief Create a host (soft) queue owned by this agent.
//
// @param [in] region Region forwarded to the HostQueue constructor.
// @param [in] ring_size Ring size forwarded to the HostQueue constructor.
// @param [in] type Queue type forwarded to the HostQueue constructor.
// @param [in] features Feature mask forwarded to the HostQueue constructor.
// @param [in] doorbell_signal Doorbell signal forwarded to the HostQueue
// constructor.
// @param [out] queue Receives the new queue on success; left untouched on
// failure. The agent keeps ownership of the queue in queues_.
//
// @retval HSA_STATUS_SUCCESS The queue was created and is tracked by the agent.
// @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES The queue object is not valid after
// construction.
hsa_status_t CpuAgent::HostQueueCreate(hsa_region_t region, uint32_t ring_size,
                                       hsa_queue_type32_t type, uint32_t features,
                                       hsa_signal_t doorbell_signal, core::Queue** queue) {
  // Hold the queue in a unique_ptr from the start so that it is released both
  // on the invalid-queue path and if growing queues_ throws.
  std::unique_ptr<core::HostQueue> host_queue(
      new core::HostQueue(*this, region, ring_size, type, features, doorbell_signal));

  if (!host_queue->IsValid()) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

  core::HostQueue* created_queue = host_queue.get();
  queues_.emplace_back(std::move(host_queue));
  *queue = created_queue;
  return HSA_STATUS_SUCCESS;
}
// @brief Destroy a queue that is tracked by this agent.
//
// @param [in] queue Queue to destroy. It must be an element of queues_.
//
// @retval HSA_STATUS_SUCCESS The queue was removed from queues_, which
// releases the owning unique_ptr.
// @retval HSA_STATUS_ERROR_INVALID_QUEUE The queue is not tracked by this
// agent. Builds with assert() enabled abort before this status is returned.
hsa_status_t CpuAgent::QueueDestroy(core::Queue* queue) {
  auto it = std::find_if(
      queues_.begin(), queues_.end(),
      [queue](const std::unique_ptr<core::Queue>& queue_ptr) { return queue_ptr.get() == queue; });
  assert(it != queues_.end() && "attempt to destroy an untracked queue");

  // With NDEBUG the assert above is compiled out; erasing end() would be
  // undefined behaviour, so report the error instead.
  if (it == queues_.end()) {
    return HSA_STATUS_ERROR_INVALID_QUEUE;
  }

  queues_.erase(it);
  return HSA_STATUS_SUCCESS;
}
} // namespace amd
@@ -1,307 +0,0 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#include "core/inc/amd_debugger.h"
#include "core/inc/amd_loader_context.hpp"
#include "core/inc/amd_aql_queue.h"
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <sys/wait.h>
#include <unistd.h>
namespace amd {
// @brief Write exactly @p size bytes from @p data to @p fd.
//
// @retval true All bytes were written. @retval false A write failed or made
// no progress.
static bool WriteFully(int fd, const void* data, size_t size) {
  const char* cursor = static_cast<const char*>(data);
  while (size > 0) {
    const ssize_t n_written_b = ::write(fd, cursor, size);
    if (n_written_b <= 0) {
      return false;
    }
    cursor += n_written_b;
    size -= static_cast<size_t>(n_written_b);
  }
  return true;
}

// @brief Run objdump on a code object file and capture its output.
//
// Disassembles a window of 0x20 bytes before and after @p pc_offset.
//
// @param [in] code_obj_path Path of the ELF file to disassemble.
// @param [in] pc_offset Offset the disassembly window is centred on.
// @param [out] disasm Receives everything objdump wrote to stdout and stderr.
//
// @retval true objdump was started and its wait status was 0.
// @retval false The pipe or child process could not be created, or objdump
// did not finish with a zero status.
static bool DisassembleAroundOffset(const char* code_obj_path, uint64_t pc_offset,
                                    std::string& disasm) {
  // Disassemble X bytes before/after the PC. Clamp the start so that an
  // offset near the beginning of the image does not wrap around.
  const uint64_t disasm_context = 0x20;
  const uint64_t start_addr = (pc_offset > disasm_context) ? (pc_offset - disasm_context) : 0;
  const uint64_t stop_addr = pc_offset + disasm_context;

  // Build the argument strings before forking so the child only has to exec.
  std::stringstream arg_start_addr, arg_stop_addr;
  arg_start_addr << "--start-addr=0x" << std::hex << start_addr;
  arg_stop_addr << "--stop-addr=0x" << std::hex << stop_addr;
  const std::string start_arg = arg_start_addr.str();
  const std::string stop_arg = arg_stop_addr.str();

  int pipe_fd[2];
  if (::pipe(pipe_fd) != 0) {
    return false;
  }

  const pid_t pid = ::fork();
  if (pid == -1) {
    ::close(pipe_fd[0]);
    ::close(pipe_fd[1]);
    return false;
  }

  if (pid == 0) {
    ::dup2(pipe_fd[1], STDOUT_FILENO);
    ::dup2(pipe_fd[1], STDERR_FILENO);
    ::close(pipe_fd[0]);
    ::close(pipe_fd[1]);

    // The first variadic argument is argv[0]; options start after it.
    ::execlp("objdump", "objdump", "-d", "-S", "-l", start_arg.c_str(), stop_arg.c_str(),
             code_obj_path, static_cast<char*>(nullptr));

    // Only reached when exec failed. Use _exit so the child does not run the
    // parent's atexit handlers or flush its stdio buffers.
    ::_exit(127);
  }

  // Collect the output of objdump.
  ::close(pipe_fd[1]);
  disasm.clear();
  char buf[0x1000];
  ssize_t n_read_b;
  while ((n_read_b = ::read(pipe_fd[0], buf, sizeof(buf))) > 0) {
    disasm.append(buf, static_cast<size_t>(n_read_b));
  }
  ::close(pipe_fd[0]);

  int child_status = 0;
  const int ret = ::waitpid(pid, &child_status, 0);
  return ret != -1 && child_status == 0;
}

// @brief Report a GPU memory access fault on std::cerr and abort the process.
//
// Always prints the faulting node, the faulting page and the failure reason.
// When the debug_fault flag is DEBUG_FAULT_ANALYZE it additionally dumps the
// registers and LDS of every wavefront whose TRAPSTS has the XNACK error bit
// set and, if the PC lies inside a loaded code object, a disassembly of the
// surrounding code produced by objdump.
//
// @param [in] fault Fault record to report.
// @param [in] agent GPU agent the fault is attributed to.
//
// @note Does not return: every path ends in std::abort().
void Debugger::HandleFault(const HsaMemoryAccessFault& fault, GpuAgentInt* agent) {
  std::stringstream err;
  // Only the page index is printed; the low 12 bits are shown as "xxx".
  uint64_t fault_page_idx = fault.VirtualAddress >> 0xC;

  err << "\nMemory access fault by GPU node " << agent->node_id();
  // std::hex and std::uppercase stay set on err for the rest of the report.
  err << " for address 0x" << std::hex << std::uppercase << fault_page_idx << "xxx (";

  if (fault.Failure.NotPresent == 1) {
    err << "page not present";
  } else if (fault.Failure.ReadOnly == 1) {
    err << "write access to a read-only page";
  } else if (fault.Failure.NoExecute == 1) {
    err << "execute access to a non-executable page";
  } else if (fault.Failure.ECC == 1) {
    err << "uncorrectable ECC failure";
  } else {
    err << "unknown reason";
  }

  err << ")\n\n";

  if (core::Runtime::runtime_singleton_->flag().debug_fault() != Flag::DEBUG_FAULT_ANALYZE) {
    if (agent->isa()->GetMajorVersion() >= 9) {
      err << "For more detail set: HSA_DEBUG_FAULT=\"analyze\"\n\n";
    }

    std::cerr << err.str();
    std::abort();
  }

  // Bit 0x1C of TRAPSTS flags an XNACK error.
  const auto has_xnack_error = [](uint32_t trapsts) { return ((trapsts >> 0x1C) & 0x1) != 0; };

  WaveStates wave_states = agent->GetWaveStates();

  for (WaveState& wave_state : wave_states) {
    if (!has_xnack_error(wave_state.regs.trapsts)) {
      continue;
    }

    err << "Wavefront found in XNACK error state:\n\n";
    err << " PC: 0x" << std::setw(0x10) << std::setfill('0') << wave_state.regs.pc << "\n";
    err << " EXEC: 0x" << std::setw(0x10) << std::setfill('0') << wave_state.regs.exec << "\n";
    err << " STATUS: 0x" << std::setw(0x8) << std::setfill('0') << wave_state.regs.status << "\n";
    err << "TRAPSTS: 0x" << std::setw(0x8) << std::setfill('0') << wave_state.regs.trapsts
        << "\n";
    err << " M0: 0x" << std::setw(0x8) << std::setfill('0') << wave_state.regs.m0 << "\n\n";

    uint32_t n_sgpr_cols = 4;
    uint32_t n_sgpr_rows = wave_state.num_sgprs / n_sgpr_cols;

    for (uint32_t sgpr_row = 0; sgpr_row < n_sgpr_rows; ++sgpr_row) {
      err << " ";

      for (uint32_t sgpr_col = 0; sgpr_col < n_sgpr_cols; ++sgpr_col) {
        uint32_t sgpr_idx = (sgpr_row * n_sgpr_cols) + sgpr_col;
        uint32_t sgpr_val = wave_state.sgprs[sgpr_idx];

        // A separate stream keeps the register index in decimal.
        std::stringstream sgpr_str;
        sgpr_str << "s" << sgpr_idx;

        err << std::setw(6) << std::setfill(' ') << sgpr_str.str();
        err << ": 0x" << std::setw(8) << std::setfill('0') << sgpr_val;
      }

      err << "\n";
    }

    err << "\n";

    uint32_t n_vgpr_cols = 4;
    uint32_t n_vgpr_rows = wave_state.num_vgprs / n_vgpr_cols;

    for (uint32_t lane_idx = 0; lane_idx < wave_state.num_vgpr_lanes; ++lane_idx) {
      err << "Lane 0x" << lane_idx << "\n";

      for (uint32_t vgpr_row = 0; vgpr_row < n_vgpr_rows; ++vgpr_row) {
        err << " ";

        for (uint32_t vgpr_col = 0; vgpr_col < n_vgpr_cols; ++vgpr_col) {
          uint32_t vgpr_idx = (vgpr_row * n_vgpr_cols) + vgpr_col;
          uint32_t vgpr_val = wave_state.vgprs[(vgpr_idx * wave_state.num_vgpr_lanes) + lane_idx];

          std::stringstream vgpr_str;
          vgpr_str << "v" << vgpr_idx;

          err << std::setw(6) << std::setfill(' ') << vgpr_str.str();
          err << ": 0x" << std::setw(8) << std::setfill('0') << vgpr_val;
        }

        err << "\n";
      }
    }

    err << "\n";

    if (wave_state.lds) {
      err << "LDS:\n\n";

      uint32_t n_lds_cols = 4;
      uint32_t n_lds_rows = wave_state.lds_size_dw / n_lds_cols;

      for (uint32_t lds_row = 0; lds_row < n_lds_rows; ++lds_row) {
        uint32_t lds_addr = lds_row * n_lds_cols * 4;
        err << "0x" << std::setw(4) << std::setfill('0') << lds_addr << ":";

        for (uint32_t lds_col = 0; lds_col < n_lds_cols; ++lds_col) {
          uint32_t lds_idx = (lds_row * n_lds_cols) + lds_col;
          uint32_t lds_val = wave_state.lds[lds_idx];
          err << " 0x" << std::setw(8) << std::setfill('0') << lds_val;
        }

        err << "\n";
      }

      err << "\n";
    }

    // Attempt to match the PC to a loaded code object.
    amd::hsa::loader::LoadedCodeObject* pc_code_obj = nullptr;
    uint64_t pc_code_obj_offset = 0;

    // The loader iteration APIs take a plain function pointer plus a void*
    // cookie, so each capturing lambda is passed as the cookie and invoked
    // through a captureless trampoline.
    auto iter_execs = [&](hsa_executable_t exec) {
      auto iter_code_objs = [&](hsa_loaded_code_object_t code_obj) {
        auto iter_segments = [&](amd_loaded_segment_t segment) {
          auto segment_int = amd::hsa::loader::LoadedSegment::Object(segment);

          // Zero-initialized so a failed query yields an empty range.
          uint64_t load_base = 0, load_size = 0;
          segment_int->GetInfo(AMD_LOADED_SEGMENT_INFO_LOAD_BASE_ADDRESS, &load_base);
          segment_int->GetInfo(AMD_LOADED_SEGMENT_INFO_SIZE, &load_size);

          if ((wave_state.regs.pc >= load_base) &&
              (wave_state.regs.pc < (load_base + load_size))) {
            pc_code_obj = amd::hsa::loader::LoadedCodeObject::Object(code_obj);
            pc_code_obj_offset = wave_state.regs.pc - load_base;
          }

          return HSA_STATUS_SUCCESS;
        };

        amd::hsa::loader::LoadedCodeObject::Object(code_obj)->IterateLoadedSegments(
            [](amd_loaded_segment_t segment, void* data) {
              return (*reinterpret_cast<decltype(iter_segments)*>(data))(segment);
            },
            &iter_segments);

        return HSA_STATUS_SUCCESS;
      };

      amd::hsa::loader::Executable::Object(exec)->IterateLoadedCodeObjects(
          [](hsa_loaded_code_object_t code_obj, void* data) {
            return (*reinterpret_cast<decltype(iter_code_objs)*>(data))(code_obj);
          },
          &iter_code_objs);

      return HSA_STATUS_SUCCESS;
    };

    core::Runtime::runtime_singleton_->loader()->IterateExecutables(
        [](hsa_executable_t exec, void* data) {
          return (*reinterpret_cast<decltype(iter_execs)*>(data))(exec);
        },
        &iter_execs);

    if (pc_code_obj) {
      // Write the code object to a temporary file.
      uint64_t elf_addr = 0;
      size_t elf_size = 0;
      pc_code_obj->GetInfo(AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE, &elf_addr);
      pc_code_obj->GetInfo(AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE_SIZE, &elf_size);

      std::string objdump_out;
      bool have_disasm = false;

      if (elf_addr != 0 && elf_size != 0) {
        char code_obj_path[] = "/tmp/hsartXXXXXX";
        int code_obj_fd = ::mkstemp(code_obj_path);

        if (code_obj_fd != -1) {
          const bool image_written =
              WriteFully(code_obj_fd, (const void*)uintptr_t(elf_addr), elf_size);
          ::close(code_obj_fd);

          // Invoke binutils objdump on the code object.
          have_disasm = image_written &&
                        DisassembleAroundOffset(code_obj_path, pc_code_obj_offset, objdump_out);

          ::unlink(code_obj_path);
        }
      }

      if (have_disasm) {
        // Attempt to trim the leading output from objdump. Test the result of
        // find() itself: adding to npos first would wrap and defeat the check.
        const size_t header_end = objdump_out.find(":\n\n");
        if (header_end != objdump_out.npos) {
          objdump_out = objdump_out.substr(header_end + 3);
        }

        // Attempt to add a PC indicator inside the disassembly text.
        std::stringstream pc_offset_find;
        pc_offset_find << std::hex << pc_code_obj_offset << ":\t";
        size_t replace_idx = objdump_out.find(pc_offset_find.str());

        if (replace_idx != objdump_out.npos) {
          std::stringstream pc_offset_replace;
          pc_offset_replace << std::hex << pc_code_obj_offset << ": >>>>>\t";
          objdump_out.replace(replace_idx, pc_offset_find.str().size(), pc_offset_replace.str());
          err << objdump_out << "\n";
        } else {
          err << objdump_out;
          err << "\nPC offset: " << std::hex << pc_code_obj_offset << "\n\n";
        }
      } else {
        err << "(Disassembly unavailable - is amdgcn-capable objdump in PATH?)\n\n";
      }
    } else {
      err << "(Cannot match PC to a loaded code object)\n\n";
    }
  }

  std::cerr << err.str();
  std::abort();
}
}
@@ -73,8 +73,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
properties_(node_props),
current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
blits_(),
queue_util_(nullptr),
queue_blit_(nullptr),
queues_(),
local_region_(NULL),
is_kv_device_(false),
trap_code_buf_(NULL),
@@ -138,7 +137,9 @@ GpuAgent::~GpuAgent() {
}
}
queues_.clear();
for (int i = 0; i < QueueCount; ++i) {
delete queues_[i];
}
if (end_ts_base_addr_ != NULL) {
core::Runtime::runtime_singleton_->FreeMemory(end_ts_base_addr_);
@@ -580,16 +581,16 @@ void GpuAgent::InitDma() {
// Fall back to blit kernel if SDMA is unavailable.
if (blits_[BlitHostToDev] == NULL) {
// Create a dedicated compute queue for host-to-device blits.
queue_blit_ = CreateInterceptibleQueue();
assert(queue_blit_ != NULL && "Queue creation failed");
queues_[QueueBlitOnly] = CreateInterceptibleQueue();
assert(queues_[QueueBlitOnly] != NULL && "Queue creation failed");
blits_[BlitHostToDev] = CreateBlitKernel(queue_blit_);
blits_[BlitHostToDev] = CreateBlitKernel(queues_[QueueBlitOnly]);
assert(blits_[BlitHostToDev] != NULL && "Blit creation failed");
}
if (blits_[BlitDevToHost] == NULL) {
// Share utility queue with device-to-host blits.
blits_[BlitDevToHost] = CreateBlitKernel(queue_util_);
blits_[BlitDevToHost] = CreateBlitKernel(queues_[QueueUtility]);
assert(blits_[BlitDevToHost] != NULL && "Blit creation failed");
}
@@ -604,14 +605,14 @@ hsa_status_t GpuAgent::PostToolsInit() {
BindTrapHandler();
// Defer utility queue creation to allow tools to intercept.
queue_util_ = CreateInterceptibleQueue();
queues_[QueueUtility] = CreateInterceptibleQueue();
if (queue_util_ == NULL) {
if (queues_[QueueUtility] == NULL) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
// Share utility queue with device-to-device blits.
blits_[BlitDevToDev] = CreateBlitKernel(queue_util_);
blits_[BlitDevToDev] = CreateBlitKernel(queues_[QueueUtility]);
if (blits_[BlitDevToDev] == NULL) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
@@ -925,7 +926,6 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
event_callback, data, is_kv_device_);
if (hw_queue && hw_queue->IsValid()) {
// return queue
queues_.emplace_back(hw_queue);
*queue = hw_queue;
return HSA_STATUS_SUCCESS;
}
@@ -935,28 +935,6 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
// @brief Override from amd::GpuAgentInt.
//
// Gathers the wave states of every queue held in queues_ into one list,
// keeping the queues' iteration order.
WaveStates GpuAgent::GetWaveStates() {
  WaveStates combined;

  for (auto& tracked_queue : queues_) {
    // Append this queue's states to the end of the combined list.
    const WaveStates per_queue = tracked_queue->GetWaveStates();
    combined.insert(combined.end(), per_queue.begin(), per_queue.end());
  }

  return combined;
}
// @brief Destroy a queue that is tracked by this agent.
//
// @param [in] queue Queue to destroy. It must be an element of queues_.
//
// @retval HSA_STATUS_SUCCESS The queue was removed from queues_, which
// releases the owning unique_ptr.
// @retval HSA_STATUS_ERROR_INVALID_QUEUE The queue is not tracked by this
// agent. Builds with assert() enabled abort before this status is returned.
hsa_status_t GpuAgent::QueueDestroy(core::Queue* queue) {
  auto it = std::find_if(queues_.begin(), queues_.end(),
                         [queue](const std::unique_ptr<AqlQueue>& queue_ptr) {
                           return static_cast<core::Queue*>(queue_ptr.get()) == queue;
                         });
  assert(it != queues_.end() && "attempt to destroy an untracked queue");

  // With NDEBUG the assert above is compiled out; erasing end() would be
  // undefined behaviour, so report the error instead.
  if (it == queues_.end()) {
    return HSA_STATUS_ERROR_INVALID_QUEUE;
  }

  queues_.erase(it);
  return HSA_STATUS_SUCCESS;
}
void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
bool need_queue_scratch_base = (isa_->GetMajorVersion() > 8);
@@ -1241,7 +1219,7 @@ void GpuAgent::InvalidateCodeCaches() {
cache_inv[6] = 0;
// Submit the command to the utility queue and wait for it to complete.
queue_util_->ExecutePM4(cache_inv, sizeof(cache_inv));
queues_[QueueUtility]->ExecutePM4(cache_inv, sizeof(cache_inv));
}
} // namespace
@@ -46,9 +46,12 @@
#include "core/util/utils.h"
namespace core {
HostQueue::HostQueue(Agent& agent, hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type,
uint32_t features, hsa_signal_t doorbell_signal)
: Queue(agent), size_(ring_size), active_(false) {
HostQueue::HostQueue(hsa_region_t region, uint32_t ring_size,
hsa_queue_type32_t type, uint32_t features,
hsa_signal_t doorbell_signal)
: Queue(),
size_(ring_size),
active_(false) {
if (!Shared::IsSharedObjectAllocationValid()) {
return;
}
@@ -608,14 +608,17 @@ hsa_status_t hsa_soft_queue_create(hsa_region_t region, uint32_t size,
const core::Signal* signal = core::Signal::Convert(doorbell_signal);
IS_VALID(signal);
core::Agent* agent = core::Runtime::runtime_singleton_->cpu_agents().front();
core::Queue* host_queue = nullptr;
hsa_status_t status =
agent->HostQueueCreate(region, size, type, features, doorbell_signal, &host_queue);
core::HostQueue* host_queue =
new core::HostQueue(region, size, type, features, doorbell_signal);
*queue = (host_queue ? core::Queue::Convert(host_queue) : nullptr);
if (!host_queue->active()) {
delete host_queue;
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
return status;
*queue = core::Queue::Convert(host_queue);
return HSA_STATUS_SUCCESS;
}
/// @brief Api to destroy a user mode queue
@@ -628,7 +631,8 @@ hsa_status_t hsa_queue_destroy(hsa_queue_t* queue) {
IS_BAD_PTR(queue);
core::Queue* cmd_queue = core::Queue::Convert(queue);
IS_VALID(cmd_queue);
return cmd_queue->agent().QueueDestroy(cmd_queue);
delete cmd_queue;
return HSA_STATUS_SUCCESS;
}
/// @brief Api to inactivate a user mode queue
@@ -53,7 +53,6 @@
#include "core/inc/hsa_ext_interface.h"
#include "core/inc/amd_cpu_agent.h"
#include "core/inc/amd_debugger.h"
#include "core/inc/amd_gpu_agent.h"
#include "core/inc/amd_memory_region.h"
#include "core/inc/amd_topology.h"
@@ -917,24 +916,55 @@ void Runtime::BindVmFaultHandler() {
return;
}
SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_), HSA_SIGNAL_CONDITION_NE, 0,
VMFaultHandler, this);
SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_),
HSA_SIGNAL_CONDITION_NE, 0, VMFaultHandler,
reinterpret_cast<void*>(vm_fault_signal_));
}
}
bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
Runtime* runtime = reinterpret_cast<Runtime*>(arg);
assert(runtime->vm_fault_signal_ != NULL);
core::InterruptSignal* vm_fault_signal =
reinterpret_cast<core::InterruptSignal*>(arg);
HsaEvent* vm_fault_event = runtime->vm_fault_signal_->EopEvent();
const HsaMemoryAccessFault& fault = vm_fault_event->EventData.EventData.MemoryAccessFault;
assert(vm_fault_signal != NULL);
auto agent_it = std::find_if(runtime->gpu_agents_.begin(), runtime->gpu_agents_.end(),
[&](Agent* agent) { return agent->node_id() == fault.NodeId; });
assert(agent_it != runtime->gpu_agents_.end());
if (vm_fault_signal == NULL) {
return false;
}
amd::Debugger::HandleFault(fault, static_cast<amd::GpuAgentInt*>(*agent_it));
if (runtime_singleton_->flag().enable_vm_fault_message()) {
HsaEvent* vm_fault_event = vm_fault_signal->EopEvent();
const HsaMemoryAccessFault& fault =
vm_fault_event->EventData.EventData.MemoryAccessFault;
std::string reason = "";
if (fault.Failure.NotPresent == 1) {
reason += "Page not present or supervisor privilege";
} else if (fault.Failure.ReadOnly == 1) {
reason += "Write access to a read-only page";
} else if (fault.Failure.NoExecute == 1) {
reason += "Execute access to a page marked NX";
} else if (fault.Failure.GpuAccess == 1) {
reason += "Host access only";
} else if (fault.Failure.ECC == 1) {
reason += "ECC failure (if supported by HW)";
} else {
reason += "Unknown";
}
fprintf(stderr,
"Memory access fault by GPU node-%u on address %p%s. Reason: %s.\n",
fault.NodeId, reinterpret_cast<const void*>(fault.VirtualAddress),
(fault.Failure.Imprecise == 1) ? "(may not be exact address)" : "",
reason.c_str());
} else {
assert(false && "GPU memory access fault.");
}
std::abort();
// No need to keep the signal because we are done.
return false;
}
@@ -43,8 +43,6 @@
#ifndef HSA_RUNTIME_CORE_INC_FLAG_H_
#define HSA_RUNTIME_CORE_INC_FLAG_H_
#include <algorithm>
#include <cctype>
#include <stdint.h>
#include <string>
@@ -62,15 +60,8 @@ class Flag {
std::string var = os::GetEnvVar("HSA_CHECK_FLAT_SCRATCH");
check_flat_scratch_ = (var == "1") ? true : false;
var = os::GetEnvVar("HSA_DEBUG_FAULT");
std::transform(var.begin(), var.end(), var.begin(),
[](unsigned char c) { return std::tolower(c); });
if (var == "analyze") {
debug_fault_ = DEBUG_FAULT_ANALYZE;
} else {
debug_fault_ = DEBUG_FAULT_OFF;
}
var = os::GetEnvVar("HSA_ENABLE_VM_FAULT_MESSAGE");
enable_vm_fault_message_ = (var == "0") ? false : true;
var = os::GetEnvVar("HSA_ENABLE_QUEUE_FAULT_MESSAGE");
enable_queue_fault_message_ = (var == "0") ? false : true;
@@ -105,15 +96,10 @@ class Flag {
tools_lib_names_ = os::GetEnvVar("HSA_TOOLS_LIB");
}
enum DebugFaultEnum {
DEBUG_FAULT_OFF,
DEBUG_FAULT_ANALYZE,
};
bool check_flat_scratch() const { return check_flat_scratch_; }
DebugFaultEnum debug_fault() const { return debug_fault_; }
bool enable_vm_fault_message() const { return enable_vm_fault_message_; }
bool enable_queue_fault_message() const { return enable_queue_fault_message_; }
bool enable_interrupt() const { return enable_interrupt_; }
@@ -138,7 +124,7 @@ class Flag {
private:
bool check_flat_scratch_;
DebugFaultEnum debug_fault_;
bool enable_vm_fault_message_;
bool enable_interrupt_;
bool enable_sdma_;
bool emulate_aql_;