rocr: Adding support for Stochastic PC Sampling for gfx94x (#47)

Change-Id: Ide4c2e25b88f1f25ea4ce35a619b93963c0355ee
2025-02-22 00:13:08 -05:00
@@ -508,24 +508,20 @@ class GpuAgent : public GpuAgentInt {

  // @brief Binds the second-level trap handler to this node.
  void BindTrapHandler();
-  hsa_status_t UpdateTrapHandlerWithPCS(void* pcs_hosttrap_buffers, void* stochastic_hosttrap_buffers);

  // @brief Override from core::Agent.
  hsa_status_t EnableDmaProfiling(bool enable) override;

  hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb,
                                       void* cb_data) override;
-  hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session) override;
+  hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session);
  hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId,
-                            pcs::PcsRuntime::PcSamplingSession& session) override;
-  hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) override;
-  hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session) override;
-  hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session) override;
-  hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session) override;
-  hsa_status_t PcSamplingFlushHostTrapDeviceBuffers(pcs::PcsRuntime::PcSamplingSession& session);
-
-  static void PcSamplingThreadRun(void* agent);
-  void PcSamplingThread();
+                            pcs::PcsRuntime::PcSamplingSession& session);
+  hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session);
+  hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session);
+  hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session);
+  hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session);
+  hsa_status_t PcSamplingFlushDeviceBuffers(pcs::PcsRuntime::PcSamplingSession& session);

  // @brief Node properties.
  const HsaNodeProperties properties_;
@@ -749,13 +745,13 @@ class GpuAgent : public GpuAgentInt {
    uint8_t reserved1[16];
    /* pc_sample_t buffer0[buf_size]; */
    /* pc_sample_t buffer1[buf_size]; */
-  } pcs_hosttrap_sampling_data_t;
+  } pcs_sampling_data_t;

  typedef struct {
-    /* Hosttrap data - stored on device so that trap_handler code can access efficiently */
-    pcs_hosttrap_sampling_data_t* device_data;
+    /* Sampling data - stored on device for trap handler access */
+    pcs_sampling_data_t* device_data;

-    /* Hosttrap host buffer - stored on host */
+    /* Sampling host buffer - stored on host */
    uint8_t* host_buffer;
    size_t host_buffer_size;
    uint8_t* host_buffer_wrap_pos;
@@ -774,11 +770,16 @@ class GpuAgent : public GpuAgentInt {

    os::Thread thread;
    pcs::PcsRuntime::PcSamplingSession* session;
-  } pcs_hosttrap_t;
-
-  pcs_hosttrap_t pcs_hosttrap_data_;
+  } pcs_data_t;
  /* PC Sampling fields - end */

+  hsa_status_t UpdateTrapHandlerWithPCS(pcs_sampling_data_t* pcs_hosttrap_buffers,
+                                        pcs_sampling_data_t* pcs_stochastic_buffers);
+
+  // @brief Thread function to process PC sampling data collected via host-trap
+  // or Stochastic sampling.
+  void PcSamplingThread(pcs_data_t& pcs_data, const char* thread_name);
+
  // @brief device handle
  amdgpu_device_handle ldrm_dev_;

@@ -793,6 +794,12 @@ class GpuAgent : public GpuAgentInt {

  bool uses_rec_sdma_eng_id_mask_;

+  // structure for host trap sampling
+  pcs_data_t pcs_hosttrap_data_;
+
+  // structure for stochastic sampling
+  pcs_data_t pcs_stochastic_data_;
+
  // @bried XGMI CPU<->GPU
  bool xgmi_cpu_gpu_;
 };
@@ -116,6 +116,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
          [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }),
      trap_handler_tma_region_(NULL),
      pcs_hosttrap_data_(),
+      pcs_stochastic_data_(),
      xgmi_cpu_gpu_(false) {
  const bool is_apu_node = (properties_.NumCPUCores > 0);
  profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE;
@@ -2166,7 +2167,7 @@ void GpuAgent::SyncClocks() {
  assert(err == HSAKMT_STATUS_SUCCESS && "hsaGetClockCounters error");
 }

-hsa_status_t GpuAgent::UpdateTrapHandlerWithPCS(void* pcs_hosttrap_buffers, void* pcs_stochastic_buffers) {
+hsa_status_t GpuAgent::UpdateTrapHandlerWithPCS(pcs_sampling_data_t* pcs_hosttrap_buffers, pcs_sampling_data_t* pcs_stochastic_buffers) {
  // Assemble the trap handler source code.
  void* tma_addr = nullptr;
  uint64_t tma_size = 0;
@@ -2541,7 +2542,11 @@ hsa_status_t GpuAgent::PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& sess
  ret = PcSamplingCreateFromId(0, session);
  if (ret != HSA_STATUS_SUCCESS) return ret;

+  // Obtain the sampling information from the session.
  session.GetHsaKmtSamplingInfo(&sampleInfo);
+
+  // Pass the sampling information to the kernel driver to create PC
+  // sampling session.
  HSAKMT_STATUS retkmt = hsaKmtPcSamplingCreate(node_id(), &sampleInfo, &thunkId);
  if (retkmt != HSAKMT_STATUS_SUCCESS) {
    return (retkmt == HSAKMT_STATUS_KERNEL_ALREADY_OPENED) ? (hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY
@@ -2557,114 +2562,133 @@ hsa_status_t GpuAgent::PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& sess

 hsa_status_t GpuAgent::PcSamplingCreateFromId(HsaPcSamplingTraceId ioctlId,
                                              pcs::PcsRuntime::PcSamplingSession& session) {
-  pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
+  // Determine the sampling method from the session
+  hsa_ven_amd_pcs_method_kind_t sampling_method = session.method();

-  if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
-    // TODO: For now can only have 1 hosttrap session at a time. As a final solution, we want to be
-    // able to support multiple sessions at a time. But this makes the session.HandleSampleData more
-    // complicated if multiple sessions have different buffer sizes.
-    if (ht_data.session) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  pcs_data_t* pcs_data = nullptr;

-    // This is current amd_aql_queue->pm4_ib_size_b_
-    ht_data.cmd_data_sz = 0x1000;
-    ht_data.cmd_data = (uint32_t*)malloc(ht_data.cmd_data_sz);
-    assert(ht_data.cmd_data);
+  if (sampling_method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
+    pcs_data = &pcs_hosttrap_data_;
+  } else if (sampling_method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
+    pcs_data = &pcs_stochastic_data_;
+  } else {
+    // Unsupported sampling method
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }

-    if (HSA::hsa_signal_create(1, 0, NULL, &ht_data.exec_pm4_signal) != HSA_STATUS_SUCCESS)
-      return HSA_STATUS_ERROR;
+  // Ensure only one session is active at a time for the given method
+  if (pcs_data->session)
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;  // TODO: For now, we can only have
+                                               // 1 pc sampling session at a
+                                               // time. As a final solution, we
+                                               // want to be able to support
+                                               // multiple sessions at a time.
+                                               // But this makes the
+                                               // session.HandleSampleData more
+                                               // complicated if multiple
+                                               // sessions have different buffer
+                                               // sizes.

-    ht_data.old_val = (uint64_t*)system_allocator()(sizeof(uint64_t), 0x1000, 0);
-    assert(ht_data.old_val);
+  // This is current amd_aql_queue->pm4_ib_size_b_
+  pcs_data->cmd_data_sz = 0x1000;  // 4KB
+  pcs_data->cmd_data = (uint32_t*)malloc(pcs_data->cmd_data_sz);
+  if (!pcs_data->cmd_data) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;

-    if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, ht_data.old_val))
-      return HSA_STATUS_ERROR;
+  if (HSA::hsa_signal_create(1, 0, NULL, &pcs_data->exec_pm4_signal) != HSA_STATUS_SUCCESS)
+    return HSA_STATUS_ERROR;

-    // Local copy of hosttrap data - we cannot access device memory directly on non-large BAR
-    // systems
-    pcs_hosttrap_sampling_data_t* device_datahost =
-        (pcs_hosttrap_sampling_data_t*)system_allocator()(sizeof(*device_datahost), 0x1000, 0);
-    if (!device_datahost) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  pcs_data->old_val = (uint64_t*)system_allocator()(sizeof(uint64_t), 0x1000, 0);
+  if (!pcs_data->old_val) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;

-    MAKE_SCOPE_GUARD([&]() { system_deallocator()(device_datahost); });
+  if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, pcs_data->old_val))
+    return HSA_STATUS_ERROR;

-    memset(device_datahost, 0, sizeof(*device_datahost));
+  // Local copy of pc sampling data - we cannot access device memory directly on non-large BAR
+  // systems
+  pcs_sampling_data_t* device_datahost =
+      (pcs_sampling_data_t*)system_allocator()(sizeof(pcs_sampling_data_t), 0x1000, 0);
+  if (!device_datahost) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;

-    if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, device_datahost) !=
-        HSA_STATUS_SUCCESS)
-      return HSA_STATUS_ERROR;
+  MAKE_SCOPE_GUARD([&]() { system_deallocator()(device_datahost); });

-    MAKE_NAMED_SCOPE_GUARD(freeHostTrapResources, [&]() {
-      if (ht_data.device_data) {
-        if (ht_data.device_data->done_sig0.handle)
-          HSA::hsa_signal_destroy(ht_data.device_data->done_sig0);
-        if (ht_data.device_data->done_sig1.handle)
-          HSA::hsa_signal_destroy(ht_data.device_data->done_sig1);
+  memset(device_datahost, 0, sizeof(*device_datahost));

-        finegrain_deallocator()(ht_data.device_data);
-      }
-      if (ht_data.host_buffer) system_deallocator()(ht_data.host_buffer);
-    });
+  if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, device_datahost) !=
+      HSA_STATUS_SUCCESS)
+    return HSA_STATUS_ERROR;

-    // Force creating of PC Sampling queue to trigger exception early in case we exceed max availble
-    // CP queues on this agent
-    queues_[QueuePCSampling].touch();
+  MAKE_NAMED_SCOPE_GUARD(freeResources, [&]() {
+    if (pcs_data->device_data) {
+      if (pcs_data->device_data->done_sig0.handle)
+        HSA::hsa_signal_destroy(pcs_data->device_data->done_sig0);
+      if (pcs_data->device_data->done_sig1.handle)
+        HSA::hsa_signal_destroy(pcs_data->device_data->done_sig1);

-    /*
-     * When calling queue->ExecutePM4() Indirect Buffer size which is 0x1000 bytes (1024 DW).
-     * The maximum indirect buffer size we need occurs when we enqueue the
-     * WAIT_REG_MEM, DMA_COPY(s), WRITE_DATA ops:
-     * For WAIT_REG_MEM = 7 DW
-     * For each DMA_COPY = 7 DW
-     * For WRITE_DATA_CMD = 6 DW
-     *
-     * So maximum number of DMA_COPY ops is:
-     * (MAX_IB_SIZE - sizeof(WAIT_REG_MEM) - sizeof(WRITE_DATA_CMD)) / sizeof(DMA_COPY)
-     * (1024 - 7 - 6) / 7 = 144
-     *
-     * Each DMA_COPY op can transfer (1 << 26) bytes, which is 9 GB. trap_buffer_size is a 32-bit
-     * number, so the buffer must be < 4 GB. So we are not limited by Indirect Buffer size.
-     * Set current limit to 256 MB to limit device VRAM usage
-     */
-    const size_t max_trap_buffer_size =
-        core::Runtime::runtime_singleton_->flag().pc_sampling_max_device_buffer_size();
+      finegrain_deallocator()(pcs_data->device_data);
+    }
+    if (pcs_data->host_buffer) system_deallocator()(pcs_data->host_buffer);
+  });

-    /*
-     * We use a double-buffer mechanism where there are 2 trap-buffers and 1 host-buffer
-     * Warning: This currently assumes that client latency is smaller than time to fill 1
-     * trap-buffer If latency is bigger, we have to increate host-buffer
-     *
-     * host-buffer must be >= client-buffer so that we can copy full size of client-buffer each
-     * time. To avoid having to deal with wrap-arounds, host-buffer must be a multiple of
-     * trap-buffers
-     *
-     * if client-buffer size is greater than 2x max_trap_buffer_size:
-     *    We are limited by max_trap_buffer_size.
-     *    trap-buffer = max-trap-buffer-size
-     *    host-buffer = 2*smallest size greater than client-buffer but multiple of 1 trap-buffer
-     * else:
-     *    We reduce the trap-buffers so that:
-     *    trap-buffer = half of user-buffer
-     *    host-buffer = 2*user-buffer
-     *
-     * TODO: We are currently using a temporary host-buffer so that we can increase host-buffer to
-     * factor in client latency. Using a direct-copy to the client buffer would be more efficient.
-     * Revisit this once we have empirical data of latency vs how long it takes to fill 1
-     * trap-buffer.
-     */
+  // Force creation of the PC Sampling queue so that an exception is triggered early if we exceed
+  // the maximum number of available CP queues on this agent
+  queues_[QueuePCSampling].touch();

-    size_t trap_buffer_size = 0;
-    if (session.buffer_size() > 2 * max_trap_buffer_size) {
-      trap_buffer_size = max_trap_buffer_size;
-      ht_data.host_buffer_size = 2 * AlignUp(session.buffer_size(), trap_buffer_size);
+  /*
+   * When calling queue->ExecutePM4(), the Indirect Buffer size is 0x1000 bytes (1024 DW).
+   * The maximum indirect buffer size we need occurs when we enqueue the
+   * WAIT_REG_MEM, DMA_COPY(s), WRITE_DATA ops:
+   * For WAIT_REG_MEM = 7 DW
+   * For each DMA_COPY = 7 DW
+   * For WRITE_DATA_CMD = 6 DW
+   *
+   * So maximum number of DMA_COPY ops is:
+   * (MAX_IB_SIZE - sizeof(WAIT_REG_MEM) - sizeof(WRITE_DATA_CMD)) / sizeof(DMA_COPY)
+   * (1024 - 7 - 6) / 7 = 144
+   *
+   * Each DMA_COPY op can transfer (1 << 26) bytes (64 MB), so 144 ops cover 9 GB. trap_buffer_size
+   * is a 32-bit number, so the buffer must be < 4 GB. So we are not limited by Indirect Buffer size.
+   * Set current limit to 256 MB to limit device VRAM usage
+   */
+  const size_t max_trap_buffer_size =
+      core::Runtime::runtime_singleton_->flag().pc_sampling_max_device_buffer_size();
+
+  /*
+   * We use a double-buffer mechanism where there are 2 trap-buffers and 1 host-buffer
+   * Warning: This currently assumes that client latency is smaller than the time to fill 1
+   * trap-buffer. If latency is bigger, we have to increase the host-buffer.
+   *
+   * host-buffer must be >= client-buffer so that we can copy full size of client-buffer each
+   * time. To avoid having to deal with wrap-arounds, host-buffer must be a multiple of
+   * trap-buffers
+   *
+   * if client-buffer size is greater than 2x max_trap_buffer_size:
+   *    We are limited by max_trap_buffer_size.
+   *    trap-buffer = max-trap-buffer-size
+   *    host-buffer = 2*smallest size greater than client-buffer but multiple of 1 trap-buffer
+   * else:
+   *    We reduce the trap-buffers so that:
+   *    trap-buffer = half of user-buffer
+   *    host-buffer = 2*user-buffer
+   *
+   * TODO: We are currently using a temporary host-buffer so that we can increase host-buffer to
+   * factor in client latency. Using a direct-copy to the client buffer would be more efficient.
+   * Revisit this once we have empirical data of latency vs how long it takes to fill 1
+   * trap-buffer.
+   */
+
+  size_t trap_buffer_size = 0;
+  if (session.buffer_size() > 2 * max_trap_buffer_size) {
+    trap_buffer_size = max_trap_buffer_size;
+    pcs_data->host_buffer_size = 2 * AlignUp(session.buffer_size(), trap_buffer_size);
    } else {
      trap_buffer_size = session.buffer_size() / 2;
-      ht_data.host_buffer_size = 2 * session.buffer_size();
+      pcs_data->host_buffer_size = 2 * session.buffer_size();
    }

-    ht_data.host_buffer = (uint8_t*)system_allocator()(ht_data.host_buffer_size, 0x1000, 0);
-    if (!ht_data.host_buffer) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+    pcs_data->host_buffer = (uint8_t*)system_allocator()(pcs_data->host_buffer_size, 0x1000, 0);
+    if (!pcs_data->host_buffer) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;

-    if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, ht_data.host_buffer) !=
+    if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, pcs_data->host_buffer) !=
        HSA_STATUS_SUCCESS)
      return HSA_STATUS_ERROR;

@@ -2682,101 +2706,162 @@ hsa_status_t GpuAgent::PcSamplingCreateFromId(HsaPcSamplingTraceId ioctlId,
    device_datahost->buf_watermark1 = 0.8 * device_datahost->buf_size;

    // Allocate device memory for 2nd level trap handler TMA
-    size_t deviceAllocSize = sizeof(*ht_data.device_data) + (2 * trap_buffer_size);
-    ht_data.device_data = (pcs_hosttrap_sampling_data_t*)finegrain_allocator()(deviceAllocSize, 0);
-    if (ht_data.device_data == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+    size_t deviceAllocSize = sizeof(*pcs_data->device_data) + (2 * trap_buffer_size);
+    pcs_data->device_data = (pcs_sampling_data_t*)finegrain_allocator()(deviceAllocSize, 0);
+    if (pcs_data->device_data == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;

    // This cpuAgent is the owner of the system_allocator() pool
    auto cpuAgent = GetNearestCpuAgent()->public_handle();
-    hsa_status_t ret = AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, ht_data.device_data);
-    assert(ret == HSA_STATUS_SUCCESS);
+    if (AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, pcs_data->device_data) != HSA_STATUS_SUCCESS)
+      return HSA_STATUS_ERROR;

-    if (DmaCopy(ht_data.device_data, device_datahost, sizeof(*device_datahost)) !=
+    if (DmaCopy(pcs_data->device_data, device_datahost, sizeof(*device_datahost)) !=
        HSA_STATUS_SUCCESS) {
      debug_print("Failed to dmaCopy!\n");
      return HSA_STATUS_ERROR;
    }

    uint8_t* device_buf_ptr =
-        ((uint8_t*)ht_data.device_data) + sizeof(pcs_hosttrap_sampling_data_t);
-    if (DmaFill(device_buf_ptr, 0, deviceAllocSize - sizeof(pcs_hosttrap_sampling_data_t)) !=
+        ((uint8_t*)pcs_data->device_data) + sizeof(pcs_sampling_data_t);
+    if (DmaFill(device_buf_ptr, 0, deviceAllocSize - sizeof(pcs_sampling_data_t)) !=
        HSA_STATUS_SUCCESS) {
      debug_print("Failed to dmaFill!\n");
      return HSA_STATUS_ERROR;
    }

-    ht_data.lost_sample_count = 0;
-    ht_data.host_buffer_wrap_pos = 0;
-    ht_data.host_write_ptr = ht_data.host_buffer;
-    ht_data.host_read_ptr = ht_data.host_write_ptr;
+    pcs_data->lost_sample_count = 0;
+    pcs_data->host_buffer_wrap_pos = 0;
+    pcs_data->host_write_ptr = pcs_data->host_buffer;
+    pcs_data->host_read_ptr = pcs_data->host_write_ptr;

-    ht_data.session = &session;
-    freeHostTrapResources.Dismiss();
+    pcs_data->session = &session;

-    if (UpdateTrapHandlerWithPCS(ht_data.device_data, NULL) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR;
-  }
+    if (UpdateTrapHandlerWithPCS(
+            sampling_method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1 ? pcs_data->device_data : nullptr,
+            sampling_method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1
+                ? pcs_data->device_data
+                : nullptr) != HSA_STATUS_SUCCESS)
+      return HSA_STATUS_ERROR;

-  session.SetThunkId(ioctlId);
-  ht_data.session = &session;
+    session.SetThunkId(ioctlId);

-  return HSA_STATUS_SUCCESS;
+    freeResources.Dismiss();
+
+    return HSA_STATUS_SUCCESS;
 }

 hsa_status_t GpuAgent::PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) {
  if (PcSamplingStop(session) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR;

-  pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
  HSAKMT_STATUS retKmt = hsaKmtPcSamplingDestroy(node_id(), session.ThunkId());
-  ht_data.session = NULL;
+  hsa_ven_amd_pcs_method_kind_t sampling_method = session.method();

-  if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
-    free(ht_data.cmd_data);
-    system_deallocator()(ht_data.old_val);
-    HSA::hsa_signal_destroy(ht_data.exec_pm4_signal);
-    HSA::hsa_signal_destroy(ht_data.device_data->done_sig0);
-    HSA::hsa_signal_destroy(ht_data.device_data->done_sig1);
-    finegrain_deallocator()(ht_data.device_data);
-    system_deallocator()(ht_data.host_buffer);
+  pcs_data_t* pcs_data = nullptr;

-    ht_data.device_data = NULL;
-    ht_data.host_buffer = NULL;
-    ht_data.session = NULL;
-
-    UpdateTrapHandlerWithPCS(NULL, NULL);
+  if (sampling_method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
+    pcs_data = &pcs_hosttrap_data_;
+  } else if (sampling_method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
+    pcs_data = &pcs_stochastic_data_;
+  } else {
+    // Unsupported sampling method
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
+
+  // Mark session as inactive
+  pcs_data->session = nullptr;
+
+  free(pcs_data->cmd_data);
+  system_deallocator()(pcs_data->old_val);
+  HSA::hsa_signal_destroy(pcs_data->exec_pm4_signal);
+  HSA::hsa_signal_destroy(pcs_data->device_data->done_sig0);
+  HSA::hsa_signal_destroy(pcs_data->device_data->done_sig1);
+  finegrain_deallocator()(pcs_data->device_data);
+  system_deallocator()(pcs_data->host_buffer);
+
+  pcs_data->device_data = NULL;
+  pcs_data->host_buffer = NULL;
+  pcs_data->session = NULL;
+
+  // Update the trap handler to clear any associated device data
+  UpdateTrapHandlerWithPCS(nullptr, nullptr);
+
  return (retKmt == HSAKMT_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
 }

 hsa_status_t GpuAgent::PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session) {
  if (session.isActive()) return HSA_STATUS_SUCCESS;

-  pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;

  auto method = session.method();
+
+  pcs_data_t* pcs_data = nullptr;
+  const char* thread_name = nullptr;
  if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
-    if (ht_data.session->isActive()) {
-      debug_warning("Already have a Host trap session in progress!");
-      return (hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY;
-    }
-    ht_data.session->start();
-    // This thread will handle all hosttrap sessions on this agent
-    // In the future, there will be another thread to handle stochastic sessions.
-    ht_data.thread = os::CreateThread(PcSamplingThreadRun, (void*)this);
-    if (!ht_data.thread)
-      throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
-                               "Failed to start PC Sampling thread.");
+    pcs_data = &pcs_hosttrap_data_;
+    thread_name = "PcSamplingHostTrapThread";
+  } else if (method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
+    pcs_data = &pcs_stochastic_data_;
+    thread_name = "PcSamplingStochasticThread";
+  } else {
+    // Unsupported sampling method
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

+  // Check if a session is already active
+  if (pcs_data->session && pcs_data->session->isActive()) {
+    debug_warning("Already have a PC sampling session in progress!");
+    return (hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY;
+  }
+
+  // Assign the new session and mark it as active
+  pcs_data->session = &session;
+  pcs_data->session->start();
+
+  // Creating thread data
+  struct ThreadData {
+    GpuAgent* agent;
+    pcs_data_t* pcs_data;
+    const char* thread_name;
+  };
+
+  auto* thread_data = new ThreadData{this, pcs_data, thread_name};
+
+  // This thread handles the PC sampling session for the selected method on this agent
+  pcs_data->thread = os::CreateThread(
+      [](void* arg) -> void {
+        auto* thread_data = static_cast<ThreadData*>(arg);
+        try {
+          GpuAgent* agent = thread_data->agent;
+          pcs_data_t* pcs_data = thread_data->pcs_data;
+          const char* thread_name = thread_data->thread_name;
+
+          agent->PcSamplingThread(*pcs_data, thread_name);
+        } catch (...) {
+	   fprintf(stdout, "Exception caught in PcSamplingThread. Exiting the thread!");
+        }
+
+        delete thread_data;
+      },
+      thread_data);
+
+  if (!pcs_data->thread) {
+    // if thread creation failed
+    delete thread_data;
+    throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
+                             "Failed to start PC Sampling thread.");
+  }
+
+  // Start the sampling session in the kernel driver
  if (hsaKmtPcSamplingStart(node_id(), session.ThunkId()) == HSAKMT_STATUS_SUCCESS)
    return HSA_STATUS_SUCCESS;

  debug_print("Failed to start PC sampling session with thunkId:%d\n", session.ThunkId());
-  if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
-    ht_data.session->stop();
-    os::WaitForThread(ht_data.thread);
-    os::CloseThread(ht_data.thread);
-    ht_data.thread = NULL;
-  }
+  // Clean up if starting the session failed
+  pcs_data->session->stop();
+  os::WaitForThread(pcs_data->thread);
+  os::CloseThread(pcs_data->thread);
+  pcs_data->thread = nullptr;
+  pcs_data->session = nullptr;

  return HSA_STATUS_ERROR;
 }
@@ -2784,35 +2869,51 @@ hsa_status_t GpuAgent::PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& sessi
 hsa_status_t GpuAgent::PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session) {
  if (!session.isActive()) return HSA_STATUS_SUCCESS;

-  pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
-
+  // Stop the session
  session.stop();

+  // Stop PC sampling in the kernel driver
  HSAKMT_STATUS retKmt = hsaKmtPcSamplingStop(node_id(), session.ThunkId());
  if (retKmt != HSAKMT_STATUS_SUCCESS)
    throw AMD::hsa_exception(HSA_STATUS_ERROR, "Failed to stop PC Sampling session.");

-  if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
-    // Wake up pcs_hosttrap_thread_ if it is waiting for data
-    HSA::hsa_signal_store_screlease(ht_data.device_data->done_sig0, -1);
-    HSA::hsa_signal_store_screlease(ht_data.device_data->done_sig1, -1);
+  // Determine the sampling method and corresponding data
+  pcs_data_t* pcs_data = nullptr;
+  auto method = session.method();

-    os::WaitForThread(ht_data.thread);
-    os::CloseThread(ht_data.thread);
-    ht_data.thread = NULL;
+  if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
+    pcs_data = &pcs_hosttrap_data_;
+  } else if (method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
+    pcs_data = &pcs_stochastic_data_;
+  } else {
+    // Unsupported sampling method
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
+  // Wake up the PC sampling thread for this method if it is waiting for data
+  HSA::hsa_signal_store_screlease(pcs_data->device_data->done_sig0, -1);
+  HSA::hsa_signal_store_screlease(pcs_data->device_data->done_sig1, -1);
+
+  // Wait for the thread to finish and clean up
+  os::WaitForThread(pcs_data->thread);
+  os::CloseThread(pcs_data->thread);
+  pcs_data->thread = nullptr;
+  pcs_data->session = nullptr;

  return HSA_STATUS_SUCCESS;
 }

-hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
+hsa_status_t GpuAgent::PcSamplingFlushDeviceBuffers(
    pcs::PcsRuntime::PcSamplingSession& session) {
-  pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
-  uint32_t& which_buffer = ht_data.which_buffer;
-  uint32_t* cmd_data = ht_data.cmd_data;
-  size_t& cmd_data_sz = ht_data.cmd_data_sz;
-  uint64_t* old_val = ht_data.old_val;
-  hsa_signal_t& exec_pm4_signal = ht_data.exec_pm4_signal;
+  pcs_data_t* pcs_data = nullptr;
+
+  if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
+    pcs_data = &pcs_hosttrap_data_;
+  } else if (session.method() == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
+    pcs_data = &pcs_stochastic_data_;
+  } else {
+    // Unsupported sampling method - nothing to flush
+    return HSA_STATUS_SUCCESS;
+  }

  /*
   * Device-buffer to Host-buffer to User-Buffer copy logic
@@ -2951,19 +3052,33 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(

  uint32_t pred_exec_cmd_sz = 0;

-  uint8_t* host_buffer_begin = ht_data.host_buffer;
-  uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size;
+  uint64_t buf_write_val;
+  uint64_t buf_written_val[2];
+  size_t buf_offset;
+  uint8_t* buffer[2];
+  size_t buf_size;

-  uint64_t buf_write_val = (uint64_t) & (ht_data.device_data->buf_write_val);
-  uint64_t buf_written_val[] = {(uint64_t) & (ht_data.device_data->buf_written_val0),
-                                (uint64_t) & (ht_data.device_data->buf_written_val1)};
+  uint32_t& which_buffer = pcs_data->which_buffer;
+  uint32_t* cmd_data = pcs_data->cmd_data;
+  size_t cmd_data_sz = pcs_data->cmd_data_sz;
+  uint64_t* old_val = pcs_data->old_val;
+  hsa_signal_t& exec_pm4_signal = pcs_data->exec_pm4_signal;

-  size_t const buf_offset = offsetof(pcs_hosttrap_sampling_data_t, reserved1) +
-      sizeof(((pcs_hosttrap_sampling_data_t*)0)->reserved1);
+  uint8_t* host_buffer_begin = pcs_data->host_buffer;
+  size_t& host_buffer_size = pcs_data->host_buffer_size;
+  uint8_t*& host_write_ptr = pcs_data->host_write_ptr;
+  uint8_t* host_buffer_end = host_buffer_begin + host_buffer_size;

-  uint8_t* buffer[] = {(uint8_t*)ht_data.device_data + buf_offset,
-                       (uint8_t*)ht_data.device_data + buf_offset +
-                           ht_data.device_data->buf_size * session.sample_size()};
+  buf_write_val = reinterpret_cast<uint64_t>(&pcs_data->device_data->buf_write_val);
+  buf_written_val[0] = reinterpret_cast<uint64_t>(&pcs_data->device_data->buf_written_val0);
+  buf_written_val[1] = reinterpret_cast<uint64_t>(&pcs_data->device_data->buf_written_val1);
+  buf_size = pcs_data->device_data->buf_size;
+
+  buf_offset =
+      offsetof(pcs_sampling_data_t, reserved1) + sizeof(((pcs_sampling_data_t*)0)->reserved1);
+
+  buffer[0] = reinterpret_cast<uint8_t*>(pcs_data->device_data) + buf_offset;
+  buffer[1] = buffer[0] + buf_size * session.sample_size();

  next_buffer = (which_buffer + 1) % 2;
  reset_write_val = (uint64_t)next_buffer << 63;
@@ -3022,25 +3137,25 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
  /* If the number of entries in old_val is larger than buf_size, then there was a buffer overflow
   * and the 2nd level trap handler code will skip recording samples, causing lost samples
   */
-  if (*old_val > (uint64_t)ht_data.device_data->buf_size) {
-    ht_data.lost_sample_count = *old_val - (uint64_t)ht_data.device_data->buf_size;
-    *old_val = (uint64_t)ht_data.device_data->buf_size;
+  if (*old_val > buf_size) {
+    pcs_data->lost_sample_count = *old_val - buf_size;
+    *old_val = buf_size;
  }

  to_copy = *old_val * session.sample_size();

  /* Make sure there is enough space after host_write_ptr */
-  if (ht_data.host_write_ptr + to_copy >= host_buffer_end) {
+  if (host_write_ptr + to_copy >= host_buffer_end) {
    // Need to wrap around
-    ht_data.host_buffer_wrap_pos = ht_data.host_write_ptr;
-    ht_data.host_write_ptr = host_buffer_begin;
+    pcs_data->host_buffer_wrap_pos = host_write_ptr;
+    host_write_ptr = host_buffer_begin;
  }

  i = 0;
  memset(cmd_data, 0, cmd_data_sz);

  if (properties_.NumXcc > 1) {
-    const uint32_t n = ceil(to_copy / (32 * 1024 * 1024));
+    const uint64_t n = ceil(to_copy / (32 * 1024 * 1024));
    pred_exec_cmd_sz = 2;
    cmd_data[i++] = PM4_HDR(PM4_HDR_IT_OPCODE_PRED_EXEC, pred_exec_cmd_sz, isa_->GetMajorVersion());
    cmd_data[i++] =
@@ -3073,7 +3188,8 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
  unsigned int num_copy_command = 0;
  uint8_t* buffer_temp = buffer[which_buffer];

-  for (copy_bytes = CP_DMA_DATA_TRANSFER_CNT_MAX; 0 < to_copy; to_copy -= copy_bytes) {
+  for (copy_bytes = std::min(to_copy, (uint32_t)CP_DMA_DATA_TRANSFER_CNT_MAX); 0 < to_copy;
+       to_copy -= copy_bytes) {
    num_copy_command++;

    /* DMA_DATA PACKETS, copy buffer using CPDMA */
@@ -3082,9 +3198,8 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
                                     PM4_DMA_DATA_SRC_SEL_SRC_ADDR_USING_L2);
    cmd_data[i++] = PM4_DMA_DATA_DW2_SRC_ADDR_LO((uint64_t)buffer_temp);
    cmd_data[i++] = PM4_DMA_DATA_DW3_SRC_ADDR_HI(((uint64_t)buffer_temp) >> 32);
-    cmd_data[i++] = PM4_DMA_DATA_DW4_DST_ADDR_LO((uint64_t)ht_data.host_write_ptr);
-    cmd_data[i++] = PM4_DMA_DATA_DW5_DST_ADDR_HI(((uint64_t)ht_data.host_write_ptr) >> 32);
-
+    cmd_data[i++] = PM4_DMA_DATA_DW4_DST_ADDR_LO((uint64_t)host_write_ptr);
+    cmd_data[i++] = PM4_DMA_DATA_DW5_DST_ADDR_HI(((uint64_t)host_write_ptr) >> 32);
    if (copy_bytes >= to_copy) {
      copy_bytes = to_copy;
      cmd_data[i++] =
@@ -3093,7 +3208,7 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
      cmd_data[i++] = PM4_DMA_DATA_DW6(PM4_DMA_DATA_BYTE_COUNT(copy_bytes) | PM4_DMA_DATA_DIS_WC);
    }
    buffer_temp += copy_bytes;
-    ht_data.host_write_ptr += copy_bytes;
+    host_write_ptr += copy_bytes;
  }

  /* WRITE_DATA, Reset buf_written_val */
@@ -3117,167 +3232,180 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
    if (val == 0) break;
  } while (true);

+  // save the position of next buffer
  which_buffer = next_buffer;

  return HSA_STATUS_SUCCESS;
 }

-void GpuAgent::PcSamplingThread() {
+void GpuAgent::PcSamplingThread(pcs_data_t& pcs_data, const char* thread_name) {
  // TODO: Implement lost sample count
  // TODO: Implement latency

-  pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
-  pcs::PcsRuntime::PcSamplingSession& session = *ht_data.session;
-  uint32_t& which_buffer = ht_data.which_buffer;
+  try {
+    pcs::PcsRuntime::PcSamplingSession& session = *pcs_data.session;
+    uint32_t& which_buffer = pcs_data.which_buffer;

-  uint8_t* host_buffer_begin = ht_data.host_buffer;
-  uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size;
+    uint8_t* host_buffer_begin = pcs_data.host_buffer;
+    uint8_t* host_buffer_end = pcs_data.host_buffer + pcs_data.host_buffer_size;

-  hsa_signal_t done_sig[] = {ht_data.device_data->done_sig0, ht_data.device_data->done_sig1};
+    hsa_signal_t done_sig[] = {pcs_data.device_data->done_sig0, pcs_data.device_data->done_sig1};

-  while (ht_data.session->isActive()) {
-    do {
-      hsa_signal_value_t val = HSA::hsa_signal_wait_scacquire(
-          done_sig[which_buffer], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
-      if (val == -1) goto thread_exit;
-      if (val == 0) break;
-    } while (true);
-    HSA::hsa_signal_store_screlease(done_sig[which_buffer], 1);
+    while (pcs_data.session->isActive()) {
+      // Wait for the signal to process the buffer
+      do {
+        hsa_signal_value_t val = HSA::hsa_signal_wait_scacquire(
+            done_sig[which_buffer], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
+        if (val == -1) goto thread_exit;
+        if (val == 0) break;
+      } while (true);
+      HSA::hsa_signal_store_screlease(done_sig[which_buffer], 1);

-    std::lock_guard<std::mutex> lock(ht_data.host_buffer_mutex);
-    if (PcSamplingFlushHostTrapDeviceBuffers(session) != HSA_STATUS_SUCCESS)
-      goto thread_exit;
+      // Lock buffer to ensure thread-safe access
+      std::lock_guard<std::mutex> lock(pcs_data.host_buffer_mutex);
+      // Flush device buffers
+      if (PcSamplingFlushDeviceBuffers(session) != HSA_STATUS_SUCCESS)
+	    goto thread_exit;

-    size_t bytes_before_wrap;
-    size_t bytes_after_wrap;
+      size_t bytes_before_wrap;
+      size_t bytes_after_wrap;

-    assert(ht_data.host_read_ptr >= host_buffer_begin && ht_data.host_read_ptr < host_buffer_end);
-    assert(ht_data.host_write_ptr >= host_buffer_begin && ht_data.host_write_ptr < host_buffer_end);
-    assert(ht_data.host_buffer_wrap_pos ? (ht_data.host_read_ptr > ht_data.host_write_ptr)
-                                        : (ht_data.host_read_ptr <= ht_data.host_write_ptr));
+      assert(pcs_data.host_read_ptr >= host_buffer_begin && pcs_data.host_read_ptr < host_buffer_end);
+      assert(pcs_data.host_write_ptr >= host_buffer_begin && pcs_data.host_write_ptr < host_buffer_end);
+      assert(pcs_data.host_buffer_wrap_pos ? (pcs_data.host_read_ptr > pcs_data.host_write_ptr)
+                                           : (pcs_data.host_read_ptr <= pcs_data.host_write_ptr));

-    if (ht_data.host_buffer_wrap_pos) {
-      assert(ht_data.host_buffer_wrap_pos <= host_buffer_end &&
-             ht_data.host_buffer_wrap_pos > host_buffer_begin);
-      assert(ht_data.host_read_ptr <= ht_data.host_buffer_wrap_pos);
+      if (pcs_data.host_buffer_wrap_pos) {
+        assert(pcs_data.host_buffer_wrap_pos <= host_buffer_end &&
+               pcs_data.host_buffer_wrap_pos > host_buffer_begin);
+        assert(pcs_data.host_read_ptr <= pcs_data.host_buffer_wrap_pos);

-      // Wrapped around
-      bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr;
-      bytes_after_wrap = ht_data.host_write_ptr - host_buffer_begin;
+        // Wrapped around
+        bytes_before_wrap = pcs_data.host_buffer_wrap_pos - pcs_data.host_read_ptr;
+        bytes_after_wrap = pcs_data.host_write_ptr - host_buffer_begin;

-      while (bytes_before_wrap >= session.buffer_size()) {
-        session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0,
-                                 ht_data.lost_sample_count);
-        ht_data.host_read_ptr += session.buffer_size();
-        bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr;
-        ht_data.lost_sample_count = 0;
-      }
+        while (bytes_before_wrap >= session.buffer_size()) {
+          session.HandleSampleData(pcs_data.host_read_ptr, session.buffer_size(), nullptr, 0,
+                                   pcs_data.lost_sample_count);
+          pcs_data.host_read_ptr += session.buffer_size();
+          bytes_before_wrap = pcs_data.host_buffer_wrap_pos - pcs_data.host_read_ptr;
+          pcs_data.lost_sample_count = 0;
+        }

-      if (bytes_before_wrap + bytes_after_wrap >= session.buffer_size()) {
-        session.HandleSampleData(ht_data.host_read_ptr, bytes_before_wrap, host_buffer_begin,
-                                 (session.buffer_size() - bytes_before_wrap), 0);
-        ht_data.host_read_ptr = host_buffer_begin + (session.buffer_size() - bytes_before_wrap);
-        bytes_before_wrap = 0;
-        ht_data.host_buffer_wrap_pos = 0;
-        bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
-        ht_data.lost_sample_count = 0;
-      }
+        if (bytes_before_wrap + bytes_after_wrap >= session.buffer_size()) {
+          session.HandleSampleData(pcs_data.host_read_ptr, bytes_before_wrap, host_buffer_begin,
+                                   (session.buffer_size() - bytes_before_wrap), 0);
+          pcs_data.host_read_ptr = host_buffer_begin + (session.buffer_size() - bytes_before_wrap);
+          bytes_before_wrap = 0;
+          pcs_data.host_buffer_wrap_pos = 0;
+          bytes_after_wrap = pcs_data.host_write_ptr - pcs_data.host_read_ptr;
+          pcs_data.lost_sample_count = 0;
+        }

-      while (bytes_after_wrap >= session.buffer_size()) {
-        session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0,
-                                 ht_data.lost_sample_count);
-        ht_data.host_read_ptr += session.buffer_size();
-        bytes_before_wrap = 0;
-        bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
-        ht_data.lost_sample_count = 0;
-      }
-    } else {
-      bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
+        while (bytes_after_wrap >= session.buffer_size()) {
+          session.HandleSampleData(pcs_data.host_read_ptr, session.buffer_size(), nullptr, 0,
+                                   pcs_data.lost_sample_count);
+          pcs_data.host_read_ptr += session.buffer_size();
+          bytes_before_wrap = 0;
+          bytes_after_wrap = pcs_data.host_write_ptr - pcs_data.host_read_ptr;
+          pcs_data.lost_sample_count = 0;
+        }
+      } else {
+        // Handle non-wrapped buffer
+        bytes_before_wrap = pcs_data.host_write_ptr - pcs_data.host_read_ptr;

-      while (bytes_before_wrap >= session.buffer_size()) {
-        assert(ht_data.host_read_ptr >= host_buffer_begin &&
-               ht_data.host_read_ptr + session.buffer_size() < host_buffer_end);
-        session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0,
-                                 ht_data.lost_sample_count);
-        ht_data.host_read_ptr += session.buffer_size();
-        bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
-        ht_data.lost_sample_count = 0;
+        while (bytes_before_wrap >= session.buffer_size()) {
+          assert(pcs_data.host_read_ptr >= host_buffer_begin &&
+                 pcs_data.host_read_ptr + session.buffer_size() <= host_buffer_end);
+          session.HandleSampleData(pcs_data.host_read_ptr, session.buffer_size(), nullptr, 0,
+                                   pcs_data.lost_sample_count);
+          pcs_data.host_read_ptr += session.buffer_size();
+          bytes_before_wrap = pcs_data.host_write_ptr - pcs_data.host_read_ptr;
+          pcs_data.lost_sample_count = 0;
+        }
      }
    }
-  }
 thread_exit:
-  debug_print("PcSamplingThread::Exiting\n");
+  debug_print("%s::Exiting\n", thread_name);
+} catch (const std::exception& e) {
+  debug_print("Exception in %s: %s\n", thread_name, e.what());
+} catch (...) {
+  debug_print("Unknown exception in %s\n", thread_name);
 }
-
-void GpuAgent::PcSamplingThreadRun(void* _agent) {
-  GpuAgent* agent = (GpuAgent*)_agent;
-  agent->PcSamplingThread();
-  debug_print("PcSamplingThread exiting...");
 }

 hsa_status_t GpuAgent::PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session) {
-  pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
+  pcs_data_t* pcs_data = nullptr;

-  uint8_t* host_buffer_begin = ht_data.host_buffer;
-  uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size;
+  if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
+    pcs_data = &pcs_hosttrap_data_;
+  } else if (session.method() == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
+    pcs_data = &pcs_stochastic_data_;
+  } else {
+    return HSA_STATUS_SUCCESS;  // Unsupported sampling method
+  }
+
+  uint8_t* host_buffer_begin = pcs_data->host_buffer;
+  uint8_t* host_buffer_end = pcs_data->host_buffer + pcs_data->host_buffer_size;

  size_t bytes_before_wrap;
  size_t bytes_after_wrap;

-  std::lock_guard<std::mutex> lock(ht_data.host_buffer_mutex);
-  if (PcSamplingFlushHostTrapDeviceBuffers(session) != HSA_STATUS_SUCCESS)
-    return HSA_STATUS_ERROR;
+  std::lock_guard<std::mutex> lock(pcs_data->host_buffer_mutex);
+  // Flush device buffers
+  if (PcSamplingFlushDeviceBuffers(session) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR;

-  assert(ht_data.host_read_ptr >= host_buffer_begin && ht_data.host_read_ptr < host_buffer_end);
-  assert(ht_data.host_write_ptr >= host_buffer_begin && ht_data.host_write_ptr < host_buffer_end);
-  assert(ht_data.host_buffer_wrap_pos ? (ht_data.host_read_ptr > ht_data.host_write_ptr)
-                                      : (ht_data.host_read_ptr <= ht_data.host_write_ptr));
+  assert(pcs_data->host_read_ptr >= host_buffer_begin && pcs_data->host_read_ptr < host_buffer_end);
+  assert(pcs_data->host_write_ptr >= host_buffer_begin &&
+         pcs_data->host_write_ptr < host_buffer_end);
+  assert(pcs_data->host_buffer_wrap_pos ? (pcs_data->host_read_ptr > pcs_data->host_write_ptr)
+                                        : (pcs_data->host_read_ptr <= pcs_data->host_write_ptr));

-  if (ht_data.host_buffer_wrap_pos) {
-    assert(ht_data.host_buffer_wrap_pos <= host_buffer_end &&
-           ht_data.host_buffer_wrap_pos > host_buffer_begin);
-    assert(ht_data.host_read_ptr <= ht_data.host_buffer_wrap_pos);
+  if (pcs_data->host_buffer_wrap_pos) {
+    assert(pcs_data->host_buffer_wrap_pos <= host_buffer_end &&
+           pcs_data->host_buffer_wrap_pos > host_buffer_begin);
+    assert(pcs_data->host_read_ptr <= pcs_data->host_buffer_wrap_pos);

-    // Wrapped around
-    bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr;
-    bytes_after_wrap = ht_data.host_write_ptr - host_buffer_begin;
+    // Handle wrapped-around buffer
+    bytes_before_wrap = pcs_data->host_buffer_wrap_pos - pcs_data->host_read_ptr;
+    bytes_after_wrap = pcs_data->host_write_ptr - host_buffer_begin;

    while (bytes_before_wrap > 0) {
      size_t bytes_to_copy = std::min(bytes_before_wrap, session.buffer_size());

-      session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0,
-                               ht_data.lost_sample_count);
-      ht_data.host_read_ptr += bytes_to_copy;
-      bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr;
-      ht_data.lost_sample_count = 0;
+      session.HandleSampleData(pcs_data->host_read_ptr, bytes_to_copy, nullptr, 0,
+                               pcs_data->lost_sample_count);
+      pcs_data->host_read_ptr += bytes_to_copy;
+      bytes_before_wrap = pcs_data->host_buffer_wrap_pos - pcs_data->host_read_ptr;
+      pcs_data->lost_sample_count = 0;
    }

-    assert(ht_data.host_read_ptr == ht_data.host_buffer_wrap_pos);
-    ht_data.host_buffer_wrap_pos = 0;
-    ht_data.host_read_ptr = host_buffer_begin;
+    assert(pcs_data->host_read_ptr == pcs_data->host_buffer_wrap_pos);
+    pcs_data->host_buffer_wrap_pos = 0;
+    pcs_data->host_read_ptr = host_buffer_begin;

    while (bytes_after_wrap > 0) {
      size_t bytes_to_copy = std::min(bytes_after_wrap, session.buffer_size());

-      session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0,
-                               ht_data.lost_sample_count);
-      ht_data.host_read_ptr += bytes_to_copy;
-      bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
-      ht_data.lost_sample_count = 0;
+      session.HandleSampleData(pcs_data->host_read_ptr, bytes_to_copy, nullptr, 0,
+                               pcs_data->lost_sample_count);
+      pcs_data->host_read_ptr += bytes_to_copy;
+      bytes_after_wrap = pcs_data->host_write_ptr - pcs_data->host_read_ptr;
+      pcs_data->lost_sample_count = 0;
    }
  } else {
-    bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
+    bytes_before_wrap = pcs_data->host_write_ptr - pcs_data->host_read_ptr;

-    while (bytes_before_wrap) {
+    while (bytes_before_wrap > 0) {
      size_t bytes_to_copy = std::min(bytes_before_wrap, session.buffer_size());
-      assert(ht_data.host_read_ptr >= host_buffer_begin &&
-             ht_data.host_read_ptr + bytes_to_copy <= host_buffer_end);
+      assert(pcs_data->host_read_ptr >= host_buffer_begin &&
+             pcs_data->host_read_ptr + bytes_to_copy <= host_buffer_end);

-      session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0,
-                               ht_data.lost_sample_count);
-      ht_data.host_read_ptr += bytes_to_copy;
-      bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
-      ht_data.lost_sample_count = 0;
+      session.HandleSampleData(pcs_data->host_read_ptr, bytes_to_copy, nullptr, 0,
+                               pcs_data->lost_sample_count);
+      pcs_data->host_read_ptr += bytes_to_copy;
+      bytes_before_wrap = pcs_data->host_write_ptr - pcs_data->host_read_ptr;
+      pcs_data->lost_sample_count = 0;
    }
  }
  return HSA_STATUS_SUCCESS;
@@ -53,6 +53,7 @@
 .set SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT      , 11
 .set SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT       , 28
 .set SQ_WAVE_TRAPSTS_MATH_EXCP               , 0x7F
+.set SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT     , 26
 .set SQ_WAVE_MODE_EXCP_EN_SHIFT              , 12
 .set SQ_WAVE_MODE_EXCP_EN_SIZE               , 8
 .set TRAP_ID_ABORT                           , 2
@@ -95,12 +96,23 @@
 // TTMP_REG1 means ttmp6 register if gfx>=942 and means ttmp13 register if gfx<942
 // TTMP_REG2 means ttmp11 register if gfx>=942 and means ttmp6 register if gfx<942

-.if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4
+.if .amdgcn.gfx_generation_number == 9
  .set TTMP11_TTMPS_SETUP_SHIFT              , 31

-  // Bit to indicate that this is a hosttrap trap instead of stochastic trap
-  // Currently not used
-  .set TTMP13_PCS_IS_STOCHASTIC              , 24
+.if (.amdgcn.gfx_generation_minor >= 4)
+  .set TTMP11_WAVE_IN_WG_MASK 		     , 0x3F
+
+  // Bit to indicate that this is a stochastic trap
+  .set TTMP13_PCS_IS_STOCHASTIC              , 21
+
+  // Bit to indicate that this is a host trap
+  .set TTMP13_PCS_IS_HOSTTRAP                , 22
+
+.else
+
+  // Bit to indicate that this is a host trap
+  .set TTMP11_PCS_IS_HOSTTRAP                , 22
+.endif
 .endif

 .if (.amdgcn.gfx_generation_number == 9)
@@ -205,27 +217,45 @@
 //   ttmp15 = TMA[63:32]
 // gfx9:
 //   ttmp1 = 0[2:0], PCRewind[3:0], HostTrap[0], TrapId[7:0], PC[47:32]
-// all gfx9 (except gfx942):
+// For all gfx9 (except gfx940, gfx941, gfx942):
 //   ttmp6 = 0[6:0], DispatchPktIndx[24:0]
 //   ttmp11 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveInWg[5:0]
-//            Note: Once stochastic sampling is implemented, L2 Trap Handler will use Bit 23
-//            (TTMP11_PCS_IS_STOCHASTIC) to differentiate between stochastic and hosttrap
-// gfx942:
+//
+// For gfx940/gfx941/gfx942:
 //   ttmp11 = 0[0], DispatchPktIndx[24:0], WaveIdInWg[5:0]
-//   ttmp13 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[22:0]
+//   ttmp13:
+//       Bits 31:26 : SQ_WAVE_IB_STS[20:15] (1TH)
+//            25:24 : 0 on 2TH entry. Used by 1st level TH but also
+//                    free to be used in the 2nd level TH
+//            23    : Debug Enabled (1TH)
+//            22:0  : values are unspecified on 2TH entry. Free.
+//
 // gfx10:
 //   ttmp1 = 0[0], PCRewind[5:0], HostTrap[0], TrapId[7:0], PC[47:32]
+//
 // gfx10/gfx11:
 //   ttmp6 = 0[6:0], DispatchPktIndx[24:0]
+//
 // gfx1010:
 //   ttmp11 = SQ_WAVE_IB_STS[25], SQ_WAVE_IB_STS[21:15], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0]
+//
 // gfx1030/gfx1100:
 //   ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0]
+//
+// ttmp[14:15] points to TMA2; Available: ttmp[2:3], ttmp[4:5]
+//
+// ttmp7 : gfx9, gfx1010, gfx1030, gfx11 - 31:0 : PC[31:0]  (2TH, DBG);
+//       : gfx940 - free;
+//       : gfx12 - ttmp7 - 31:16 : workgroup_z[15:0]  (SPI) and 15:0 : workgroup_y[15:0]  (SPI)

 trap_entry:
-  // Branch if not a trap (an exception instead).
-  s_bfe_u32            			ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE
-  s_cbranch_scc0       			.no_skip_debugtrap
+  // Extract trap_id from ttmp1 into ttmp2
+  s_bfe_u32                             ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE
+  s_cbranch_scc0                        .not_s_trap                      // If trap_id == 0, it's not an s_trap nor host trap
+
+  // Check whether it was a host trap.
+  s_bitcmp1_b32       			ttmp1, SQ_WAVE_PC_HI_HT_SHIFT
+  s_cbranch_scc0      			.not_host_trap

 .if (.amdgcn.gfx_generation_number == 9) // PC_SAMPLING_GFX9
  // ttmp[14:15] is TMA2; Available: ttmp[2:3], ttmp[4:5], ttmp7, TTMP_REG1
@@ -236,29 +266,42 @@ trap_entry:
  //   [0x08] out_buf_t* stochastic_trap_buffers;
  //
  // --- Start profile trap handlers GFX9 --- //
-  //  if (host_trap) {
-  //    if (stochastic)       // Not implemented yet
-  //        ttmp11.bit23 = 1; // Not implemented yet
-  //    profiling_trap_handler(tma->host_trap_buffers);
-  //  }
+  // If the wave entered the trap handler: 
+  // If on gfx9:
+  // - Check SQ_WAVE_PC_HI_HT_SHIFT bit on TTMP1 register to
+  //   identify if it was a host trap.
+  // If a host trap is detected:
+  // - Mark TTMP13(gfx94x) or TTMP11(gfx9) hosttrap bit
+  // - Load host_trap_buffers
+  // - Branch to the profile trap handler logic.
+  //
+  // If on gfx9.4+:
+  // - Check TRAPSTS bit 26 (SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT) to
+  //   identify stochastic traps.
+  // If a stochastic trap is detected:
+  // - Set bit 21 in TTMP13 to indicate a stochastic trap.
+  // - Branch to the profile trap handler logic.

-  s_bitcmp1_b32       			ttmp1, SQ_WAVE_PC_HI_HT_SHIFT
-  s_cbranch_scc0      			.not_host_trap_gfx9
-  s_load_dwordx2      			ttmp[14:15], ttmp[14:15], 0 glc 	// ttmp[14:15]=&host_trap_buffers
-  // TODO: When implementing stochastic sampling, need to set TTMP11_PCS_IS_STOCHASTIC
-  // or TTMP13_PCS_IS_STOCHASTIC to differentiate between hosttrap and stochastic sampling
+  s_load_dwordx2  			ttmp[2:3], ttmp[14:15], 0 glc   // ttmp[2:3]=tma->host_trap_buffers
+.if .amdgcn.gfx_generation_minor >= 4
+  s_bitset0_b32                         ttmp13, TTMP13_PCS_IS_STOCHASTIC
+  s_bitset1_b32                         ttmp13, TTMP13_PCS_IS_HOSTTRAP   // set bit 22 in TTMP13
+.else
+  s_bitset1_b32                         ttmp11, TTMP11_PCS_IS_HOSTTRAP    // Set bit 22 in TTMP11
+.endif
  s_waitcnt           			lgkmcnt(0)
-  s_branch            			.profile_trap_handlers_gfx9		// Off to the profile handlers
+  s_mov_b64      			ttmp[14:15], ttmp[2:3]		//now ttmp[14:15] = host_trap_buffers
+  s_branch            			.profile_trap_handlers_gfx9	// Off to the profile handlers
+.else
+  // Ignore host traps.  They should be masked by the driver anyway.
+  s_branch .not_s_trap
+.endif

-.not_host_trap_gfx9:
-.endif // PC_SAMPLING_GFX9
-  // If caused by s_trap then advance PC.
-  s_bitcmp1_b32        			ttmp1, SQ_WAVE_PC_HI_HT_SHIFT
-  s_cbranch_scc1       			.not_s_trap
+.not_host_trap:
+  // It's an s_trap; advance the PC
  s_add_u32            			ttmp0, ttmp0, 0x4
  s_addc_u32           			ttmp1, ttmp1, 0x0

-.not_s_trap:
  // If llvm.debugtrap and debugger is not attached.
  s_cmp_eq_u32         			ttmp2, TRAP_ID_DEBUGTRAP
  s_cbranch_scc0       			.no_skip_debugtrap
@@ -272,6 +315,24 @@ trap_entry:
  // Ignore llvm.debugtrap.
  s_branch             			.exit_trap

+.not_s_trap:
+.if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4
+  //Check for stochastic trap on gfx9.4+
+  s_getreg_b32    			ttmp7, hwreg(HW_REG_TRAPSTS)           	 // On gfx94x, TRAPSTS bit 26 ...
+  s_bitcmp1_b32   			ttmp7, SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT   // is stochastic_sample_trap
+  s_cbranch_scc0  			.no_skip_debugtrap
+
+  // Handle stochastic trap
+  s_load_dwordx2  			ttmp[2:3], ttmp[14:15], 0x8 glc // ttmp[2:3]=tma->stochastic_trap_buffers
+  s_bitset0_b32                         ttmp13, TTMP13_PCS_IS_HOSTTRAP
+  s_bitset1_b32   			ttmp13, TTMP13_PCS_IS_STOCHASTIC  // set bit 21 in TTMP13
+  s_waitcnt       			lgkmcnt(0)
+  s_mov_b64      			ttmp[14:15], ttmp[2:3]
+  s_branch        			.profile_trap_handlers_gfx9      // Off to the profile handlers
+.else
+  s_branch                              .no_skip_debugtrap
+.endif // PC_SAMPLING_GFX9
+
 .if (.amdgcn.gfx_generation_number == 9) // PC_SAMPLING_GFX9
  // tma->host_trap_buffers Offsets:
  //    [0x00]	uint64_t buf_write_val;
@@ -348,6 +409,26 @@ trap_entry:
  s_addc_u32            		ttmp5, ttmp15, ttmp5            // buffer0 or buffer1
  s_mov_b32             		ttmp7, ttmp2

+ .if .amdgcn.gfx_generation_number == 9
+
+ .if .amdgcn.gfx_generation_minor >= 4
+  // Check if it's a stochastic trap
+  s_bitcmp1_b32  			ttmp13, TTMP13_PCS_IS_STOCHASTIC
+  s_cbranch_scc1 			.fill_sample_stochastic
+  // Check if it's a host trap
+  s_bitcmp1_b32  			ttmp13, TTMP13_PCS_IS_HOSTTRAP
+  s_cbranch_scc1 			.fill_sample_hosttrap
+.else
+ // Check if it's a host trap
+  s_bitcmp1_b32                         ttmp11, TTMP11_PCS_IS_HOSTTRAP
+  s_cbranch_scc1                        .fill_sample_hosttrap
+
+.endif
+.endif
+  // If neither bit is set, this is unexpected.
+  // This branch is not expected to be taken.
+  s_branch 				.no_skip_debugtrap
+
  // ttmp7 contains local_entry, ttmp[4:5] contains "&bufferX",
  // ttmp[14:15] holds 'tma->host_trap_buffers' pointer
  // ttmp[2:3] and ttmp13 are available for gathering perf sample info
@@ -381,7 +462,7 @@ trap_entry:
  //    buf->timestamp = s_memrealtime;
  //    buf->correlation_id = get_correlation_id();
  // }
-
+.fill_sample_hosttrap:
  s_mul_i32             		ttmp2, ttmp7, 0x40              // offset into buffer for 64B objects
  s_mul_hi_u32          		ttmp3, ttmp7, 0x40              // ttmp[2:3] will contain byte ...
  s_add_u32             		ttmp2, ttmp2, ttmp4
@@ -401,19 +482,56 @@ trap_entry:
 .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4)
  s_getreg_b32          		ttmp4, hwreg(HW_REG_XCC_ID)     //store XCC_ID
  s_lshl_b32            		ttmp4, ttmp4, 8
-  s_and_b32             		ttmp5, ttmp11, 0x3f
+  s_and_b32             		ttmp5, ttmp11, TTMP11_WAVE_IN_WG_MASK
  s_or_b32              		ttmp4, ttmp4, ttmp5
  s_store_dword         		ttmp4, ttmp[2:3], 0x1c          // store wave_in_wg
 .else
  s_and_b32             		ttmp4, ttmp11, 0x3f
  s_store_dword         		ttmp4, ttmp[2:3], 0x1c         	// store wave_in_wg
 .endif
-  // Get HW_ID using S_GETREG_B32 with size=32 (F8 in upper bits), offset=0, and HW_ID = 4 (0x4)
  s_getreg_b32          		ttmp4, hwreg(HW_REG_HW_ID)
  s_store_dword         		ttmp4, ttmp[2:3], 0x20          // store HW_ID

-  // ttmp[2:3] = &buffer[local_entry]; ttmp[4:5], ttmp7, and ttmp13 are free
-  // ttmp[14:15] = tma->host_trap_buffers and is live out; ttmp6.b31 is buf_to_use, 0 or 1
+  s_branch                              .get_correlation_id
+
+.if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4
+.fill_sample_stochastic:
+  s_mul_i32       			ttmp2, ttmp7, 0x40              // offset into buffer for 64B objects
+  s_mul_hi_u32                          ttmp3, ttmp7, 0x40
+  s_add_u32       			ttmp2, ttmp2, ttmp4
+  s_addc_u32      			ttmp3, ttmp3, ttmp5             // ttmp[2:3]=&buffer[local_entry]
+  s_memrealtime   			ttmp[4:5]
+  s_waitcnt       			lgkmcnt(0)                      // Wait for timestamp
+  s_store_dwordx2 			ttmp[4:5], ttmp[2:3] 0x30       // Store timestamp
+
+  s_getreg_b32    			ttmp4, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA)
+  s_getreg_b32   	 	  	ttmp5, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA1)
+  s_store_dwordx2 			ttmp[4:5], ttmp[2:3], 0x24            // store snapshot data
+  s_getreg_b32          		ttmp4, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_LO)
+  s_getreg_b32          		ttmp5, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_HI)
+  s_store_dwordx2 			ttmp[4:5], ttmp[2:3] 0x00       // store snapshot PC
+
+  s_mov_b32       			ttmp6, exec_lo
+  s_store_dword   			ttmp6, ttmp[2:3], 0x8           // store EXEC_LO
+  s_mov_b32       			ttmp6, exec_hi
+  s_store_dword   			ttmp6, ttmp[2:3], 0xc           // store EXEC_HI
+
+  s_store_dwordx2 			ttmp[8:9], ttmp[2:3], 0x10      // store wg_id_x and wg_id_y
+  s_store_dword   			ttmp10, ttmp[2:3], 0x18         // store wg_id_z
+  s_getreg_b32    			ttmp4, hwreg(HW_REG_XCC_ID)
+  s_lshl_b32      			ttmp4, ttmp4, 8
+  s_and_b32       			ttmp5, ttmp11, TTMP11_WAVE_IN_WG_MASK
+  s_or_b32        			ttmp4, ttmp4, ttmp5
+  s_store_dword   			ttmp4, ttmp[2:3], 0x1c          // store chiplet_and_wave_id
+  s_getreg_b32    			ttmp4, hwreg(HW_REG_HW_ID)
+  s_store_dword   			ttmp4, ttmp[2:3], 0x20          // store HW_ID
+  // ttmp[2:3]=&buffer[local_entry]; ttmp[4:5], ttmp[6:7] are free
+  // ttmp[14:15]=ptr to 'tma' and is live out; ttmp11.b31 is buf_to_use, 0 or 1
+  s_branch        			.get_correlation_id
+
+.endif
+
+.get_correlation_id:

  // get_correlation_id() -- begin //
  // Returns a value to use as a correlation ID.
@@ -437,6 +555,7 @@ trap_entry:
  // ttmp[4:5], ttmp7, and ttmp13 are free
  // ttmp[14:15] = tma->host_trap_buffers and is live out
  // ttmp6.b31 is buf_to_use, 0 or 1 and is live out
+
  s_mov_b64             		ttmp[4:5], exec                 // back up EXEC mask
  s_mov_b32             		exec_lo, 0x80000000             // prepare EXEC for doorbell spin
  s_sendmsg             		sendmsg(MSG_GET_DOORBELL)       // message 10, puts doorbell in EXEC
@@ -519,7 +638,6 @@ trap_entry:
  s_getreg_b32          		ttmp3, hwreg(HW_REG_MODE, SQ_WAVE_MODE_EXCP_EN_SHIFT, SQ_WAVE_MODE_EXCP_EN_SIZE) // ttmp3[7:0] = MODE.EXCP_EN
  // Set bits corresponding to TRAPSTS.MEM_VIOL, TRAPSTS.ILLEGAL_INST and TRAPSTS.XNACK_ERROR
  s_or_b32              		ttmp3, ttmp3, (1 << SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT | 1 << SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT | 1 << SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT)
-  s_getreg_b32          		ttmp2, hwreg(HW_REG_TRAPSTS)
  s_and_b32             		ttmp2, ttmp2, ttmp3
  // SCC will be 1 if either a maskable instruction was set, or one of MEM_VIOL, ILL_INST, XNACK_ERROR
  s_cbranch_scc1        		.no_skip_debugtrap		// if any of those are set, handle exceptions
@@ -539,6 +657,7 @@ trap_entry:
 .no_skip_debugtrap:
  // Save trap id and halt status in ttmp6.
  s_andn2_b32          			ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK)
+  s_bfe_u32                             ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE
  s_min_u32            			ttmp2, ttmp2, 0xF
  s_lshl_b32           			ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT
  s_or_b32             			ttmp6, ttmp6, ttmp2