Revert "Implement memory fault analysis through context save area"

This reverts commit 498f3a7188. Change-Id: Ibf11b764b383b9be291f3009a30550e1a1e2d115 [ROCm/ROCR-Runtime commit: 5b4df54b10]
2017-06-14 14:14:38 -04:00
@@ -120,7 +120,6 @@ set ( SRCS "core/util/lnx/os_linux.cpp"
            "core/runtime/amd_blit_kernel.cpp"
            "core/runtime/amd_blit_sdma.cpp"
            "core/runtime/amd_cpu_agent.cpp"
-            "core/runtime/amd_debugger.cpp"
            "core/runtime/amd_gpu_agent.cpp"
            "core/runtime/amd_aql_queue.cpp"
            "core/runtime/amd_loader_context.cpp"
@@ -218,12 +218,6 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
                                   uint32_t group_segment_size,
                                   Queue** queue) = 0;

-  virtual hsa_status_t HostQueueCreate(hsa_region_t region, uint32_t ring_size,
-                                       hsa_queue_type32_t type, uint32_t features,
-                                       hsa_signal_t doorbell_signal, Queue** queue) = 0;
-
-  virtual hsa_status_t QueueDestroy(Queue* queue) = 0;
-
  // @brief Query the value of an attribute.
  //
  // @param [in] attribute Attribute to query.
@@ -44,13 +44,12 @@
 #define HSA_RUNTIME_CORE_INC_AMD_HW_AQL_COMMAND_PROCESSOR_H_

 #include "core/inc/runtime.h"
-#include "core/inc/amd_gpu_agent.h"
 #include "core/inc/signal.h"
 #include "core/inc/queue.h"
+#include "core/inc/amd_gpu_agent.h"
 #include "core/util/locks.h"

 namespace amd {
-
 /// @brief Encapsulates HW Aql Command Processor functionality. It
 /// provide the interface for things such as Doorbell register, read,
 /// write pointers and a buffer.
@@ -185,8 +184,6 @@ class AqlQueue : public core::Queue, public core::Signal {
  // @brief Submits a block of PM4 and waits until it has been executed.
  void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override;

-  WaveStates GetWaveStates();
-
  /// @brief This operation is illegal
  hsa_signal_value_t LoadRelaxed() override {
    assert(false);
@@ -45,8 +45,6 @@
 #ifndef HSA_RUNTIME_CORE_INC_AMD_CPU_AGENT_H_
 #define HSA_RUNTIME_CORE_INC_AMD_CPU_AGENT_H_

-#include <list>
-#include <memory>
 #include <vector>

 #include "hsakmt.h"
@@ -105,12 +103,6 @@ class CpuAgent : public core::Agent {
                           uint32_t group_segment_size,
                           core::Queue** queue) override;

-  hsa_status_t HostQueueCreate(hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type,
-                               uint32_t features, hsa_signal_t doorbell_signal,
-                               core::Queue** queue) override;
-
-  hsa_status_t QueueDestroy(core::Queue* queue) override;
-
  // @brief Returns number of data caches.
  __forceinline size_t num_cache() const { return cache_props_.size(); }

@@ -162,8 +154,6 @@ class CpuAgent : public core::Agent {
  // @brief Array of regions owned by this agent.
  std::vector<const core::MemoryRegion*> regions_;

-  std::list<std::unique_ptr<core::Queue>> queues_;
-
  DISALLOW_COPY_AND_ASSIGN(CpuAgent);
 };

@@ -1,57 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////
-//
-// The University of Illinois/NCSA
-// Open Source License (NCSA)
-//
-// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
-//
-// Developed by:
-//
-//                 AMD Research and AMD HSA Software Development
-//
-//                 Advanced Micro Devices, Inc.
-//
-//                 www.amd.com
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to
-// deal with the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//
-//  - Redistributions of source code must retain the above copyright notice,
-//    this list of conditions and the following disclaimers.
-//  - Redistributions in binary form must reproduce the above copyright
-//    notice, this list of conditions and the following disclaimers in
-//    the documentation and/or other materials provided with the distribution.
-//  - Neither the names of Advanced Micro Devices, Inc,
-//    nor the names of its contributors may be used to endorse or promote
-//    products derived from this Software without specific prior written
-//    permission.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-// DEALINGS WITH THE SOFTWARE.
-//
-////////////////////////////////////////////////////////////////////////////////
-
-#ifndef HSA_RUNTIME_CORE_INC_AMD_DEBUGGER_H_
-#define HSA_RUNTIME_CORE_INC_AMD_DEBUGGER_H_
-
-#include "core/inc/amd_gpu_agent.h"
-#include "hsakmt.h"
-
-namespace amd {
-
-class Debugger {
- public:
-  static void HandleFault(const HsaMemoryAccessFault& fault, GpuAgentInt* agent);
-};
-}
-
-#endif
@@ -45,8 +45,6 @@
 #ifndef HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_
 #define HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_

-#include <list>
-#include <memory>
 #include <vector>

 #include "hsakmt.h"
@@ -61,42 +59,6 @@

 namespace amd {
 class MemoryRegion;
-class AqlQueue;
-
-struct WaveState {
-  // Number of SGPRs allocated per wavefront.
-  uint32_t num_sgprs;
-
-  // Array of packed SGPR data.
-  uint32_t* sgprs;
-
-  // Number of VGPRs allocated per wavefront.
-  uint32_t num_vgprs;
-
-  // Number of lanes in each VGPR.
-  uint32_t num_vgpr_lanes;
-
-  // Array of packed VGPR data.
-  // VGPR value = vgprs[(vgpr_idx * num_vgpr_lanes) + lane_idx]
-  uint32_t* vgprs;
-
-  // Data for miscellaneous registers.
-  struct {
-    uint64_t pc;
-    uint64_t exec;
-    uint32_t status;
-    uint32_t trapsts;
-    uint32_t m0;
-  } regs;
-
-  // LDS allocation size for the work group, in 32-bit words.
-  uint32_t lds_size_dw;
-
-  // Packed LDS data for the work group.
-  uint32_t* lds;
-};
-
-typedef std::vector<WaveState> WaveStates;

 // @brief Contains scratch memory information.
 struct ScratchInfo {
@@ -113,14 +75,6 @@ class GpuAgentInt : public core::Agent {
  GpuAgentInt(uint32_t node_id)
      : core::Agent(node_id, core::Agent::DeviceType::kAmdGpuDevice) {}

-  // @brief GpuAgent does not support HostQueueCreation.
-  hsa_status_t HostQueueCreate(hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type,
-                               uint32_t features, hsa_signal_t doorbell_signal,
-                               core::Queue** queue) override {
-    assert(false && "GpuAgent::HostQueueCreate not implemented");
-    return HSA_STATUS_ERROR_INVALID_AGENT;
-  }
-
  // @brief Initialize DMA queue.
  //
  // @retval HSA_STATUS_SUCCESS DMA queue initialization is successful.
@@ -186,8 +140,6 @@ class GpuAgentInt : public core::Agent {
  // @brief Invalidate caches on the agent which may hold code object data.
  virtual void InvalidateCodeCaches() = 0;

-  virtual WaveStates GetWaveStates() = 0;
-
  // @brief Sets the coherency type of this agent.
  //
  // @param [in] type New coherency type.
@@ -300,8 +252,6 @@ class GpuAgent : public GpuAgentInt {
                           uint32_t group_segment_size,
                           core::Queue** queue) override;

-  hsa_status_t QueueDestroy(core::Queue* queue) override;
-
  // @brief Override from amd::GpuAgentInt.
  void AcquireQueueScratch(ScratchInfo& scratch) override;

@@ -318,8 +268,6 @@ class GpuAgent : public GpuAgentInt {
  // @brief Override from amd::GpuAgentInt.
  void InvalidateCodeCaches() override;

-  WaveStates GetWaveStates() override;
-
  // @brief Override from amd::GpuAgentInt.
  bool current_coherency_type(hsa_amd_coherency_type_t type) override;

@@ -430,10 +378,14 @@ class GpuAgent : public GpuAgentInt {

  core::Blit* blits_[BlitCount];

-  std::list<std::unique_ptr<AqlQueue>> queues_;
+  // @brief AQL queues for cache management and blit compute usage.
+  enum QueueEnum {
+    QueueUtility,   // Cache management and device to {host,device} blit compute
+    QueueBlitOnly,  // Host to device blit
+    QueueCount
+  };

-  core::Queue* queue_util_;
-  core::Queue* queue_blit_;
+  core::Queue* queues_[QueueCount];

  // @brief Mutex to protect the update to coherency type.
  KernelMutex coherency_lock_;
@@ -51,7 +51,7 @@
 namespace core {
 class HostQueue : public Queue {
 public:
-  HostQueue(Agent& agent, hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type,
+  HostQueue(hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type,
            uint32_t features, hsa_signal_t doorbell_signal);

  ~HostQueue();
@@ -45,7 +45,6 @@
 #ifndef HSA_RUNTME_CORE_INC_COMMAND_QUEUE_H_
 #define HSA_RUNTME_CORE_INC_COMMAND_QUEUE_H_
 #include <sstream>
-#include <vector>

 #include "core/common/shared.h"

@@ -56,8 +55,6 @@
 #include "inc/amd_hsa_queue.h"

 namespace core {
-class Agent;
-
 struct AqlPacket {

  union {
@@ -140,7 +137,7 @@ All funtions other than Convert and public_handle must be virtual.
 class Queue : public Checked<0xFA3906A679F9DB49>,
              public Shared<SharedQueue, AMD_QUEUE_ALIGN_BYTES> {
 public:
-  explicit Queue(Agent& agent) : Shared(), amd_queue_(shared_object()->amd_queue), agent_(agent) {
+  Queue() : Shared(), amd_queue_(shared_object()->amd_queue) {
    if (!Shared::IsSharedObjectAllocationValid()) {
      return;
    }
@@ -311,8 +308,6 @@ class Queue : public Checked<0xFA3906A679F9DB49>,

  hsa_queue_t* public_handle() const { return public_handle_; }

-  Agent& agent() { return agent_; }
-
 protected:
  static void set_public_handle(Queue* ptr, hsa_queue_t* handle) {
    ptr->do_set_public_handle(handle);
@@ -322,8 +317,6 @@ class Queue : public Checked<0xFA3906A679F9DB49>,
  }
  hsa_queue_t* public_handle_;

-  Agent& agent_;
-
 private:
  DISALLOW_COPY_AND_ASSIGN(Queue);
 };
@@ -84,9 +84,10 @@ void* AqlQueue::operator new(size_t size) {

 void AqlQueue::operator delete(void* ptr) { _aligned_free(ptr); }

-AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, ScratchInfo& scratch,
-                   core::HsaEventCallback callback, void* err_data, bool is_kv)
-    : Queue(*agent),
+AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id,
+                   ScratchInfo& scratch, core::HsaEventCallback callback,
+                   void* err_data, bool is_kv)
+    : Queue(),
      Signal(0),
      ring_buf_(NULL),
      ring_buf_alloc_bytes_(0),
@@ -961,113 +962,4 @@ void AqlQueue::InitScratchSRD() {

  return;
 }
-
-WaveStates AqlQueue::GetWaveStates() {
-  WaveStates wave_states;
-
-  // Retrieve the control stack and context save area for the queue.
-  HsaQueueInfo queue_info;
-  HSAKMT_STATUS status = hsaKmtGetQueueInfo(queue_id_, &queue_info);
-
-  if (status != HSAKMT_STATUS_SUCCESS) {
-    return wave_states;
-  }
-
-  // The control stack is processed from start to end.
-  // The save area is processed from end to start.
-  uint32_t* ctl_stack = reinterpret_cast<uint32_t*>(queue_info.ControlStackTop);
-  uint32_t* wave_area = reinterpret_cast<uint32_t*>(uintptr_t(queue_info.UserContextSaveArea) +
-                                                    queue_info.SaveAreaSizeInBytes);
-  uint32_t ctl_stack_ndw = uint32_t(queue_info.ControlStackUsedInBytes / sizeof(uint32_t));
-
-  // Control stack persists resource allocation until changed by a command.
-  uint32_t n_vgprs = 0;
-  uint32_t n_sgprs = 0;
-  uint32_t lds_size_dw = 0;
-
-  // LDS is saved per-workgroup but the stack is parsed per-wavefront.
-  // Track the LDS save area for the current workgroup.
-  uint32_t* lds = nullptr;
-
-  // Parse each write to COMPUTE_RELAUNCH in sequence.
-  // First two dwords are SET_SH_REG leader.
-  for (uint32_t idx = 2; idx < ctl_stack_ndw; ++idx) {
-    uint32_t relaunch = ctl_stack[idx];
-
-#define COMPUTE_RELAUNCH_PAYLOAD_VGPRS(x) (((x) >> 0x0) & 0x3F)
-#define COMPUTE_RELAUNCH_PAYLOAD_SGPRS(x) (((x) >> 0x6) & 0x7)
-#define COMPUTE_RELAUNCH_PAYLOAD_LDS_SIZE(x) (((x) >> 0x9) & 0x1FF)
-#define COMPUTE_RELAUNCH_PAYLOAD_FIRST_WAVE(x) (((x) >> 0x11) & 0x1)
-#define COMPUTE_RELAUNCH_IS_EVENT(x) (((x) >> 0x1E) & 0x1)
-#define COMPUTE_RELAUNCH_IS_STATE(x) (((x) >> 0x1F) & 0x1)
-
-    bool is_event = COMPUTE_RELAUNCH_IS_EVENT(relaunch);
-    bool is_state = COMPUTE_RELAUNCH_IS_STATE(relaunch);
-
-    if (is_state && !is_event) {
-      // Resource allocation state change, update tracked state.
-      n_vgprs = (0x1 + COMPUTE_RELAUNCH_PAYLOAD_VGPRS(relaunch)) * 0x4;
-      n_sgprs = ((0x1 + COMPUTE_RELAUNCH_PAYLOAD_SGPRS(relaunch)) - 0x1 /* no trap SGPRs */) * 0x10;
-      lds_size_dw = COMPUTE_RELAUNCH_PAYLOAD_LDS_SIZE(relaunch) * 0x80;
-    } else if (!is_state && !is_event) {
-      // Reference to one wavefront in the save area.
-      bool first_wave_in_group = COMPUTE_RELAUNCH_PAYLOAD_FIRST_WAVE(relaunch);
-
-      // Save area layout is fixed by context save trap handler and SPI.
-      uint32_t vgprs_offset = 0x0;
-      uint32_t sgprs_offset = vgprs_offset + n_vgprs * 0x40;
-      uint32_t hwregs_offset = sgprs_offset + n_sgprs;
-      uint32_t lds_offset = hwregs_offset + 0x20;
-      uint32_t unused_offset = lds_offset + (first_wave_in_group ? lds_size_dw : 0x0);
-      uint32_t wave_area_size = unused_offset + 0x10;  // trap SGPRs were allocated but not saved
-      uint32_t hwreg_m0_offset = hwregs_offset + 0x0;
-      uint32_t hwreg_pc_lo_offset = hwregs_offset + 0x1;
-      uint32_t hwreg_pc_hi_offset = hwregs_offset + 0x2;
-      uint32_t hwreg_exec_lo_offset = hwregs_offset + 0x3;
-      uint32_t hwreg_exec_hi_offset = hwregs_offset + 0x4;
-      uint32_t hwreg_status_offset = hwregs_offset + 0x5;
-      uint32_t hwreg_trapsts_offset = hwregs_offset + 0x6;
-
-      // Find beginning of wavefront state in the save area.
-      wave_area -= wave_area_size;
-
-      if (first_wave_in_group) {
-        // Track the LDS save area for this workgroup.
-        if (lds_size_dw > 0) {
-          lds = wave_area + lds_offset;
-        } else {
-          lds = nullptr;
-        }
-      }
-
-      WaveState wave_state;
-
-      wave_state.num_sgprs = n_sgprs;
-      wave_state.sgprs = wave_area + sgprs_offset;
-      wave_state.num_vgprs = n_vgprs;
-      wave_state.num_vgpr_lanes = 0x40;
-      wave_state.vgprs = wave_area + vgprs_offset;
-      wave_state.regs.pc = (uint64_t(wave_area[hwreg_pc_lo_offset]) |
-                            (uint64_t(wave_area[hwreg_pc_hi_offset]) << 0x20));
-      wave_state.regs.exec = uint64_t(wave_area[hwreg_exec_lo_offset]) |
-          (uint64_t(wave_area[hwreg_exec_hi_offset]) << 0x20);
-      wave_state.regs.status = wave_area[hwreg_status_offset];
-      wave_state.regs.trapsts = wave_area[hwreg_trapsts_offset];
-      wave_state.regs.m0 = wave_area[hwreg_m0_offset];
-      wave_state.lds_size_dw = lds_size_dw;
-      wave_state.lds = lds;
-
-#define SQ_WAVE_TRAPSTS_XNACK_ERROR(x) (((x) >> 0x1C) & 0x1)
-
-      if (SQ_WAVE_TRAPSTS_XNACK_ERROR(wave_state.regs.trapsts)) {
-        // Correct the PC: context save handler subtracted 0x8.
-        wave_state.regs.pc += 0x8;
-      }
-
-      wave_states.push_back(wave_state);
-    }
-  }
-
-  return wave_states;
-}
 }  // namespace amd
@@ -369,32 +369,4 @@ hsa_status_t CpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
  return HSA_STATUS_ERROR;
 }

-hsa_status_t CpuAgent::HostQueueCreate(hsa_region_t region, uint32_t ring_size,
-                                       hsa_queue_type32_t type, uint32_t features,
-                                       hsa_signal_t doorbell_signal, core::Queue** queue) {
-  core::HostQueue* host_queue =
-      new core::HostQueue(*this, region, ring_size, type, features, doorbell_signal);
-
-  if (!host_queue->IsValid()) {
-    delete host_queue;
-    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-  }
-
-  queues_.emplace_back(host_queue);
-  *queue = host_queue;
-
-  return HSA_STATUS_SUCCESS;
-}
-
-hsa_status_t CpuAgent::QueueDestroy(core::Queue* queue) {
-  auto it = std::find_if(
-      queues_.begin(), queues_.end(),
-      [&](std::unique_ptr<core::Queue>& queue_ptr) { return queue_ptr.get() == queue; });
-
-  assert(it != queues_.end() && "attempt to destroy an untracked queue");
-  queues_.erase(it);
-
-  return HSA_STATUS_SUCCESS;
-}
-
 }  // namespace amd
@@ -1,307 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////
-//
-// The University of Illinois/NCSA
-// Open Source License (NCSA)
-//
-// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
-//
-// Developed by:
-//
-//                 AMD Research and AMD HSA Software Development
-//
-//                 Advanced Micro Devices, Inc.
-//
-//                 www.amd.com
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to
-// deal with the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//
-//  - Redistributions of source code must retain the above copyright notice,
-//    this list of conditions and the following disclaimers.
-//  - Redistributions in binary form must reproduce the above copyright
-//    notice, this list of conditions and the following disclaimers in
-//    the documentation and/or other materials provided with the distribution.
-//  - Neither the names of Advanced Micro Devices, Inc,
-//    nor the names of its contributors may be used to endorse or promote
-//    products derived from this Software without specific prior written
-//    permission.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-// DEALINGS WITH THE SOFTWARE.
-//
-////////////////////////////////////////////////////////////////////////////////
-
-#include "core/inc/amd_debugger.h"
-#include "core/inc/amd_loader_context.hpp"
-#include "core/inc/amd_aql_queue.h"
-
-#include <cstdlib>
-#include <iomanip>
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <sys/wait.h>
-#include <unistd.h>
-
-namespace amd {
-
-void Debugger::HandleFault(const HsaMemoryAccessFault& fault, GpuAgentInt* agent) {
-  std::stringstream err;
-
-  uint64_t fault_page_idx = fault.VirtualAddress >> 0xC;
-  err << "\nMemory access fault by GPU node " << agent->node_id();
-  err << " for address 0x" << std::hex << std::uppercase << fault_page_idx << "xxx (";
-
-  if (fault.Failure.NotPresent == 1) {
-    err << "page not present";
-  } else if (fault.Failure.ReadOnly == 1) {
-    err << "write access to a read-only page";
-  } else if (fault.Failure.NoExecute == 1) {
-    err << "execute access to a non-executable page";
-  } else if (fault.Failure.ECC == 1) {
-    err << "uncorrectable ECC failure";
-  } else {
-    err << "unknown reason";
-  }
-
-  err << ")\n\n";
-
-  if (core::Runtime::runtime_singleton_->flag().debug_fault() != Flag::DEBUG_FAULT_ANALYZE) {
-    if (agent->isa()->GetMajorVersion() >= 9) {
-      err << "For more detail set: HSA_DEBUG_FAULT=\"analyze\"\n\n";
-    }
-
-    std::cerr << err.str();
-    std::abort();
-  }
-
-  WaveStates wave_states = agent->GetWaveStates();
-
-  for (WaveState& wave_state : wave_states) {
-#define SQ_WAVE_TRAPSTS_XNACK_ERROR(x) (((x) >> 0x1C) & 0x1)
-
-    if (SQ_WAVE_TRAPSTS_XNACK_ERROR(wave_state.regs.trapsts)) {
-      err << "Wavefront found in XNACK error state:\n\n";
-      err << "     PC: 0x" << std::setw(0x10) << std::setfill('0') << wave_state.regs.pc << "\n";
-      err << "   EXEC: 0x" << std::setw(0x10) << std::setfill('0') << wave_state.regs.exec << "\n";
-      err << " STATUS: 0x" << std::setw(0x8) << std::setfill('0') << wave_state.regs.status << "\n";
-      err << "TRAPSTS: 0x" << std::setw(0x8) << std::setfill('0') << wave_state.regs.trapsts
-          << "\n";
-      err << "     M0: 0x" << std::setw(0x8) << std::setfill('0') << wave_state.regs.m0 << "\n\n";
-
-      uint32_t n_sgpr_cols = 4;
-      uint32_t n_sgpr_rows = wave_state.num_sgprs / n_sgpr_cols;
-
-      for (uint32_t sgpr_row = 0; sgpr_row < n_sgpr_rows; ++sgpr_row) {
-        err << " ";
-
-        for (uint32_t sgpr_col = 0; sgpr_col < n_sgpr_cols; ++sgpr_col) {
-          uint32_t sgpr_idx = (sgpr_row * n_sgpr_cols) + sgpr_col;
-          uint32_t sgpr_val = wave_state.sgprs[sgpr_idx];
-
-          std::stringstream sgpr_str;
-          sgpr_str << "s" << sgpr_idx;
-
-          err << std::setw(6) << std::setfill(' ') << sgpr_str.str();
-          err << ": 0x" << std::setw(8) << std::setfill('0') << sgpr_val;
-        }
-
-        err << "\n";
-      }
-
-      err << "\n";
-
-      uint32_t n_vgpr_cols = 4;
-      uint32_t n_vgpr_rows = wave_state.num_vgprs / n_vgpr_cols;
-
-      for (uint32_t lane_idx = 0; lane_idx < wave_state.num_vgpr_lanes; ++lane_idx) {
-        err << "Lane 0x" << lane_idx << "\n";
-
-        for (uint32_t vgpr_row = 0; vgpr_row < n_vgpr_rows; ++vgpr_row) {
-          err << " ";
-
-          for (uint32_t vgpr_col = 0; vgpr_col < n_vgpr_cols; ++vgpr_col) {
-            uint32_t vgpr_idx = (vgpr_row * n_vgpr_cols) + vgpr_col;
-            uint32_t vgpr_val = wave_state.vgprs[(vgpr_idx * wave_state.num_vgpr_lanes) + lane_idx];
-
-            std::stringstream vgpr_str;
-            vgpr_str << "v" << vgpr_idx;
-
-            err << std::setw(6) << std::setfill(' ') << vgpr_str.str();
-            err << ": 0x" << std::setw(8) << std::setfill('0') << vgpr_val;
-          }
-
-          err << "\n";
-        }
-      }
-
-      err << "\n";
-
-      if (wave_state.lds) {
-        err << "LDS:\n\n";
-
-        uint32_t n_lds_cols = 4;
-        uint32_t n_lds_rows = wave_state.lds_size_dw / n_lds_cols;
-
-        for (uint32_t lds_row = 0; lds_row < n_lds_rows; ++lds_row) {
-          uint32_t lds_addr = lds_row * n_lds_cols * 4;
-
-          err << "0x" << std::setw(4) << std::setfill('0') << lds_addr << ":";
-
-          for (uint32_t lds_col = 0; lds_col < n_lds_cols; ++lds_col) {
-            uint32_t lds_idx = (lds_row * n_lds_cols) + lds_col;
-            uint32_t lds_val = wave_state.lds[lds_idx];
-
-            err << "  0x" << std::setw(8) << std::setfill('0') << lds_val;
-          }
-
-          err << "\n";
-        }
-
-        err << "\n";
-      }
-
-      // Attempt to match the PC to a loaded code object.
-      amd::hsa::loader::LoadedCodeObject* pc_code_obj = nullptr;
-      uint64_t pc_code_obj_offset = 0;
-
-      auto iter_execs = [&](hsa_executable_t exec) {
-        auto iter_code_objs = [&](hsa_loaded_code_object_t code_obj) {
-          auto iter_segments = [&](amd_loaded_segment_t segment) {
-            auto segment_int = amd::hsa::loader::LoadedSegment::Object(segment);
-
-            uint64_t load_base, load_size;
-            segment_int->GetInfo(AMD_LOADED_SEGMENT_INFO_LOAD_BASE_ADDRESS, &load_base);
-            segment_int->GetInfo(AMD_LOADED_SEGMENT_INFO_SIZE, &load_size);
-
-            if ((wave_state.regs.pc >= load_base) &&
-                (wave_state.regs.pc < (load_base + load_size))) {
-              pc_code_obj = amd::hsa::loader::LoadedCodeObject::Object(code_obj);
-              pc_code_obj_offset = wave_state.regs.pc - load_base;
-            }
-
-            return HSA_STATUS_SUCCESS;
-          };
-
-          amd::hsa::loader::LoadedCodeObject::Object(code_obj)->IterateLoadedSegments(
-              [](amd_loaded_segment_t segment, void* data) {
-                return (*reinterpret_cast<decltype(iter_segments)*>(data))(segment);
-              },
-              &iter_segments);
-
-          return HSA_STATUS_SUCCESS;
-        };
-
-        amd::hsa::loader::Executable::Object(exec)->IterateLoadedCodeObjects(
-            [](hsa_loaded_code_object_t code_obj, void* data) {
-              return (*reinterpret_cast<decltype(iter_code_objs)*>(data))(code_obj);
-            },
-            &iter_code_objs);
-
-        return HSA_STATUS_SUCCESS;
-      };
-
-      core::Runtime::runtime_singleton_->loader()->IterateExecutables(
-          [](hsa_executable_t exec, void* data) {
-            return (*reinterpret_cast<decltype(iter_execs)*>(data))(exec);
-          },
-          &iter_execs);
-
-      if (pc_code_obj) {
-        // Write the code object to a temporary file.
-        uint64_t elf_addr;
-        size_t elf_size;
-        pc_code_obj->GetInfo(AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE, &elf_addr);
-        pc_code_obj->GetInfo(AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE_SIZE, &elf_size);
-
-        char code_obj_path[] = "/tmp/hsartXXXXXX";
-        int code_obj_fd = ::mkstemp(code_obj_path);
-        ::write(code_obj_fd, (const void*)uintptr_t(elf_addr), elf_size);
-        ::close(code_obj_fd);
-
-        // Invoke binutils objdump on the code object.
-        int pipe_fd[2];
-        ::pipe(pipe_fd);
-
-        pid_t pid = ::fork();
-
-        if (pid == 0) {
-          ::dup2(pipe_fd[1], STDOUT_FILENO);
-          ::dup2(pipe_fd[1], STDERR_FILENO);
-          ::close(pipe_fd[0]);
-          ::close(pipe_fd[1]);
-
-          // Disassemble X bytes before/after the PC.
-          uint32_t disasm_context = 0x20;
-
-          std::stringstream arg_start_addr, arg_stop_addr;
-          arg_start_addr << "--start-addr=0x" << std::hex << (pc_code_obj_offset - disasm_context);
-          arg_stop_addr << "--stop-addr=0x" << std::hex << (pc_code_obj_offset + disasm_context);
-
-          std::exit(execlp("objdump", "-d", "-S", "-l", arg_start_addr.str().c_str(),
-                           arg_stop_addr.str().c_str(), code_obj_path, nullptr));
-        }
-
-        // Collect the output of objdump.
-        ::close(pipe_fd[1]);
-
-        std::vector<char> objdump_out_buf;
-        std::vector<char> buf(0x1000);
-        ssize_t n_read_b;
-
-        while ((n_read_b = read(pipe_fd[0], buf.data(), buf.size())) > 0) {
-          objdump_out_buf.insert(objdump_out_buf.end(), &buf[0], &buf[n_read_b]);
-        }
-
-        ::close(pipe_fd[0]);
-
-        int child_status = 0;
-        int ret = ::waitpid(pid, &child_status, 0);
-
-        if (ret != -1 && child_status == 0) {
-          // Attempt to trim the leading output from objdump.
-          std::string objdump_out(objdump_out_buf.begin(), objdump_out_buf.end());
-          size_t trim_start = objdump_out.find(":\n\n") + 3;
-
-          if (trim_start != objdump_out.npos) {
-            objdump_out = objdump_out.substr(trim_start);
-          }
-
-          // Attempt to add a PC indicator inside the disassembly text.
-          std::stringstream pc_offset_find;
-          pc_offset_find << std::hex << pc_code_obj_offset << ":\t";
-          size_t replace_idx = objdump_out.find(pc_offset_find.str());
-
-          if (replace_idx != objdump_out.npos) {
-            std::stringstream pc_offset_replace;
-            pc_offset_replace << std::hex << pc_code_obj_offset << ": >>>>>\t";
-            objdump_out.replace(replace_idx, pc_offset_find.str().size(), pc_offset_replace.str());
-            err << objdump_out << "\n";
-          } else {
-            err << objdump_out;
-            err << "\nPC offset: " << std::hex << pc_code_obj_offset << "\n\n";
-          }
-        } else {
-          err << "(Disassembly unavailable - is amdgcn-capable objdump in PATH?)\n\n";
-        }
-
-        ::unlink(code_obj_path);
-      } else {
-        err << "(Cannot match PC to a loaded code object)\n\n";
-      }
-    }
-  }
-
-  std::cerr << err.str();
-  std::abort();
-}
-}
@@ -73,8 +73,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
      properties_(node_props),
      current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
      blits_(),
-      queue_util_(nullptr),
-      queue_blit_(nullptr),
+      queues_(),
      local_region_(NULL),
      is_kv_device_(false),
      trap_code_buf_(NULL),
@@ -138,7 +137,9 @@ GpuAgent::~GpuAgent() {
    }
  }

-  queues_.clear();
+  for (int i = 0; i < QueueCount; ++i) {
+    delete queues_[i];
+  }

  if (end_ts_base_addr_ != NULL) {
    core::Runtime::runtime_singleton_->FreeMemory(end_ts_base_addr_);
@@ -580,16 +581,16 @@ void GpuAgent::InitDma() {
      // Fall back to blit kernel if SDMA is unavailable.
      if (blits_[BlitHostToDev] == NULL) {
        // Create a dedicated compute queue for host-to-device blits.
-        queue_blit_ = CreateInterceptibleQueue();
-        assert(queue_blit_ != NULL && "Queue creation failed");
+        queues_[QueueBlitOnly] = CreateInterceptibleQueue();
+        assert(queues_[QueueBlitOnly] != NULL && "Queue creation failed");

-        blits_[BlitHostToDev] = CreateBlitKernel(queue_blit_);
+        blits_[BlitHostToDev] = CreateBlitKernel(queues_[QueueBlitOnly]);
        assert(blits_[BlitHostToDev] != NULL && "Blit creation failed");
      }

      if (blits_[BlitDevToHost] == NULL) {
        // Share utility queue with device-to-host blits.
-        blits_[BlitDevToHost] = CreateBlitKernel(queue_util_);
+        blits_[BlitDevToHost] = CreateBlitKernel(queues_[QueueUtility]);
        assert(blits_[BlitDevToHost] != NULL && "Blit creation failed");
      }

@@ -604,14 +605,14 @@ hsa_status_t GpuAgent::PostToolsInit() {
  BindTrapHandler();

  // Defer utility queue creation to allow tools to intercept.
-  queue_util_ = CreateInterceptibleQueue();
+  queues_[QueueUtility] = CreateInterceptibleQueue();

-  if (queue_util_ == NULL) {
+  if (queues_[QueueUtility] == NULL) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

  // Share utility queue with device-to-device blits.
-  blits_[BlitDevToDev] = CreateBlitKernel(queue_util_);
+  blits_[BlitDevToDev] = CreateBlitKernel(queues_[QueueUtility]);

  if (blits_[BlitDevToDev] == NULL) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
@@ -925,7 +926,6 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
                                    event_callback, data, is_kv_device_);
  if (hw_queue && hw_queue->IsValid()) {
    // return queue
-    queues_.emplace_back(hw_queue);
    *queue = hw_queue;
    return HSA_STATUS_SUCCESS;
  }
@@ -935,28 +935,6 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
  return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
 }

-WaveStates GpuAgent::GetWaveStates() {
-  WaveStates wave_states;
-
-  for (auto& queue : queues_) {
-    WaveStates queue_wave_states = queue->GetWaveStates();
-    wave_states.insert(wave_states.end(), queue_wave_states.begin(), queue_wave_states.end());
-  }
-
-  return wave_states;
-}
-
-hsa_status_t GpuAgent::QueueDestroy(core::Queue* queue) {
-  auto it = std::find_if(queues_.begin(), queues_.end(), [&](std::unique_ptr<AqlQueue>& queue_ptr) {
-    return static_cast<core::Queue*>(queue_ptr.get()) == queue;
-  });
-
-  assert(it != queues_.end() && "attempt to destroy an untracked queue");
-  queues_.erase(it);
-
-  return HSA_STATUS_SUCCESS;
-}
-
 void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
  bool need_queue_scratch_base = (isa_->GetMajorVersion() > 8);

@@ -1241,7 +1219,7 @@ void GpuAgent::InvalidateCodeCaches() {
  cache_inv[6] = 0;

  // Submit the command to the utility queue and wait for it to complete.
-  queue_util_->ExecutePM4(cache_inv, sizeof(cache_inv));
+  queues_[QueueUtility]->ExecutePM4(cache_inv, sizeof(cache_inv));
 }

 }  // namespace
@@ -46,9 +46,12 @@
 #include "core/util/utils.h"

 namespace core {
-HostQueue::HostQueue(Agent& agent, hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type,
-                     uint32_t features, hsa_signal_t doorbell_signal)
-    : Queue(agent), size_(ring_size), active_(false) {
+HostQueue::HostQueue(hsa_region_t region, uint32_t ring_size,
+                     hsa_queue_type32_t type, uint32_t features,
+                     hsa_signal_t doorbell_signal)
+    : Queue(),
+      size_(ring_size),
+      active_(false) {
  if (!Shared::IsSharedObjectAllocationValid()) {
    return;
  }
@@ -608,14 +608,17 @@ hsa_status_t hsa_soft_queue_create(hsa_region_t region, uint32_t size,
  const core::Signal* signal = core::Signal::Convert(doorbell_signal);
  IS_VALID(signal);

-  core::Agent* agent = core::Runtime::runtime_singleton_->cpu_agents().front();
-  core::Queue* host_queue = nullptr;
-  hsa_status_t status =
-      agent->HostQueueCreate(region, size, type, features, doorbell_signal, &host_queue);
+  core::HostQueue* host_queue =
+      new core::HostQueue(region, size, type, features, doorbell_signal);

-  *queue = (host_queue ? core::Queue::Convert(host_queue) : nullptr);
+  if (!host_queue->active()) {
+    delete host_queue;
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }

-  return status;
+  *queue = core::Queue::Convert(host_queue);
+
+  return HSA_STATUS_SUCCESS;
 }

 /// @brief Api to destroy a user mode queue
@@ -628,7 +631,8 @@ hsa_status_t hsa_queue_destroy(hsa_queue_t* queue) {
  IS_BAD_PTR(queue);
  core::Queue* cmd_queue = core::Queue::Convert(queue);
  IS_VALID(cmd_queue);
-  return cmd_queue->agent().QueueDestroy(cmd_queue);
+  delete cmd_queue;
+  return HSA_STATUS_SUCCESS;
 }

 /// @brief Api to inactivate a user mode queue
@@ -53,7 +53,6 @@

 #include "core/inc/hsa_ext_interface.h"
 #include "core/inc/amd_cpu_agent.h"
-#include "core/inc/amd_debugger.h"
 #include "core/inc/amd_gpu_agent.h"
 #include "core/inc/amd_memory_region.h"
 #include "core/inc/amd_topology.h"
@@ -917,24 +916,55 @@ void Runtime::BindVmFaultHandler() {
      return;
    }

-    SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_), HSA_SIGNAL_CONDITION_NE, 0,
-                          VMFaultHandler, this);
+    SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_),
+                          HSA_SIGNAL_CONDITION_NE, 0, VMFaultHandler,
+                          reinterpret_cast<void*>(vm_fault_signal_));
  }
 }

 bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
-  Runtime* runtime = reinterpret_cast<Runtime*>(arg);
-  assert(runtime->vm_fault_signal_ != NULL);
+  // |arg| is the VM fault signal that BindVmFaultHandler registered as the
+  // handler argument.
+  auto* fault_signal = static_cast<core::InterruptSignal*>(arg);
+  assert(fault_signal != NULL);
+  if (fault_signal == NULL) {
+    return false;
+  }

-  HsaEvent* vm_fault_event = runtime->vm_fault_signal_->EopEvent();
-  const HsaMemoryAccessFault& fault = vm_fault_event->EventData.EventData.MemoryAccessFault;
+  // With HSA_ENABLE_VM_FAULT_MESSAGE=0 nothing is printed: debug builds trip
+  // the assert, release builds fall straight through to the abort.
+  if (!runtime_singleton_->flag().enable_vm_fault_message()) {
+    assert(false && "GPU memory access fault.");
+    std::abort();
+  }

-  auto agent_it = std::find_if(runtime->gpu_agents_.begin(), runtime->gpu_agents_.end(),
-                               [&](Agent* agent) { return agent->node_id() == fault.NodeId; });
-  assert(agent_it != runtime->gpu_agents_.end());
+  const HsaMemoryAccessFault& fault =
+      fault_signal->EopEvent()->EventData.EventData.MemoryAccessFault;

-  amd::Debugger::HandleFault(fault, static_cast<amd::GpuAgentInt*>(*agent_it));
+  // Report only the first failure bit that is set, checked in this order.
+  const char* reason = "Unknown";
+  if (fault.Failure.NotPresent == 1) {
+    reason = "Page not present or supervisor privilege";
+  } else if (fault.Failure.ReadOnly == 1) {
+    reason = "Write access to a read-only page";
+  } else if (fault.Failure.NoExecute == 1) {
+    reason = "Execute access to a page marked NX";
+  } else if (fault.Failure.GpuAccess == 1) {
+    reason = "Host access only";
+  } else if (fault.Failure.ECC == 1) {
+    reason = "ECC failure (if supported by HW)";
+  }

+  const char* address_note =
+      (fault.Failure.Imprecise == 1) ? "(may not be exact address)" : "";
+  fprintf(stderr,
+          "Memory access fault by GPU node-%u on address %p%s. Reason: %s.\n",
+          fault.NodeId, reinterpret_cast<const void*>(fault.VirtualAddress),
+          address_note, reason);
+
+  // A GPU memory fault is fatal: terminate the process.
+  std::abort();
+  // Not reached (std::abort does not return); kept for the bool return type.
  return false;
 }

@@ -43,8 +43,6 @@
 #ifndef HSA_RUNTIME_CORE_INC_FLAG_H_
 #define HSA_RUNTIME_CORE_INC_FLAG_H_

-#include <algorithm>
-#include <cctype>
 #include <stdint.h>

 #include <string>
@@ -62,15 +60,8 @@ class Flag {
    std::string var = os::GetEnvVar("HSA_CHECK_FLAT_SCRATCH");
    check_flat_scratch_ = (var == "1") ? true : false;

-    var = os::GetEnvVar("HSA_DEBUG_FAULT");
-    std::transform(var.begin(), var.end(), var.begin(),
-                   [](unsigned char c) { return std::tolower(c); });
-
-    if (var == "analyze") {
-      debug_fault_ = DEBUG_FAULT_ANALYZE;
-    } else {
-      debug_fault_ = DEBUG_FAULT_OFF;
-    }
+    var = os::GetEnvVar("HSA_ENABLE_VM_FAULT_MESSAGE");
+    enable_vm_fault_message_ = (var == "0") ? false : true;

    var = os::GetEnvVar("HSA_ENABLE_QUEUE_FAULT_MESSAGE");
    enable_queue_fault_message_ = (var == "0") ? false : true;
@@ -105,15 +96,10 @@ class Flag {
    tools_lib_names_ = os::GetEnvVar("HSA_TOOLS_LIB");
  }

-  enum DebugFaultEnum {
-    DEBUG_FAULT_OFF,
-    DEBUG_FAULT_ANALYZE,
-  };
-
  bool check_flat_scratch() const { return check_flat_scratch_; }

-  DebugFaultEnum debug_fault() const { return debug_fault_; }
-
+  bool enable_vm_fault_message() const { return enable_vm_fault_message_; }
+  
  bool enable_queue_fault_message() const { return enable_queue_fault_message_; }

  bool enable_interrupt() const { return enable_interrupt_; }
@@ -138,7 +124,7 @@ class Flag {

 private:
  bool check_flat_scratch_;
-  DebugFaultEnum debug_fault_;
+  bool enable_vm_fault_message_;
  bool enable_interrupt_;
  bool enable_sdma_;
  bool emulate_aql_;