PC Sampling: Allocate resources to retrieve data from trap handler

Allocate the device and host buffers required to interact with the 2nd level trap handler.

Change-Id: If99de5aacf956ca57ecafc7b04b797be9c9decaa
2023-08-17 15:04:58 +00:00
@@ -46,6 +46,7 @@
 #define HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_

 #include <vector>
+#include <list>
 #include <map>

 #include "hsakmt/hsakmt.h"
@@ -720,6 +721,13 @@ class GpuAgent : public GpuAgentInt {
  } pcs_hosttrap_sampling_data_t;

  typedef struct {
+    /* Hosttrap data - stored on device so that trap_handler code can access it efficiently */
+    pcs_hosttrap_sampling_data_t* device_data;
+
+    /* Hosttrap host buffer - stored on host */
+    uint8_t* host_buffer;
+    size_t host_buffer_size;
+
    os::Thread thread;
    pcs::PcsRuntime::PcSamplingSession* session;
  } pcs_hosttrap_t;
@@ -2375,12 +2375,13 @@ hsa_status_t GpuAgent::PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configura
 }

 hsa_status_t GpuAgent::PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session) {
+  hsa_status_t ret;
  HsaPcSamplingInfo sampleInfo = {};
  HsaPcSamplingTraceId thunkId;

  // IOCTL id does not exist at the moment, so passing 0 is OK,
  // since it will be overridden later in this function.
-  hsa_status_t ret = PcSamplingCreateFromId(0, session);
+  ret = PcSamplingCreateFromId(0, session);
  if (ret != HSA_STATUS_SUCCESS) return ret;

  session.GetHsaKmtSamplingInfo(&sampleInfo);
@@ -2401,8 +2402,141 @@ hsa_status_t GpuAgent::PcSamplingCreateFromId(HsaPcSamplingTraceId ioctlId,
                                              pcs::PcsRuntime::PcSamplingSession& session) {
  pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;

-  if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1 && ht_data.session)
-    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
+    // TODO: For now can only have 1 hosttrap session at a time. As a final solution, we want to be
+    // able to support multiple sessions at a time. But this makes the session->HandleSampleData
+    // more complicated if multiple sessions have different buffer sizes.
+    if (ht_data.session) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+
+    // Local copy of hosttrap data - we cannot access device memory directly on non-large BAR
+    // systems
+    pcs_hosttrap_sampling_data_t* device_datahost =
+        (pcs_hosttrap_sampling_data_t*)system_allocator()(sizeof(*device_datahost), 0x1000, 0);
+    if (!device_datahost) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+
+    MAKE_SCOPE_GUARD([&]() { system_deallocator()(device_datahost); });
+
+    memset(device_datahost, 0, sizeof(*device_datahost));
+
+    if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, device_datahost) !=
+        HSA_STATUS_SUCCESS)
+      return HSA_STATUS_ERROR;
+
+    MAKE_NAMED_SCOPE_GUARD(freeHostTrapResources, [&]() {
+      if (ht_data.device_data) {
+        if (ht_data.device_data->done_sig0.handle)
+          HSA::hsa_signal_destroy(ht_data.device_data->done_sig0);
+        if (ht_data.device_data->done_sig1.handle)
+          HSA::hsa_signal_destroy(ht_data.device_data->done_sig1);
+
+        finegrain_deallocator()(ht_data.device_data);
+      }
+      if (ht_data.host_buffer) system_deallocator()(ht_data.host_buffer);
+    });
+
+    // Force creation of the PC Sampling queue so that an exception is triggered early if we exceed
+    // the maximum number of available CP queues on this agent
+    queues_[QueuePCSampling].touch();
+
+    /*
+     * When calling queue->ExecutePM4(), the Indirect Buffer size is 0x1000 bytes (1024 DW).
+     * The maximum indirect buffer size we need occurs when we enqueue the
+     * WAIT_REG_MEM, DMA_COPY(s), WRITE_DATA ops:
+     * For WAIT_REG_MEM = 7 DW
+     * For each DMA_COPY = 7 DW
+     * For WRITE_DATA_CMD = 6 DW
+     *
+     * So maximum number of DMA_COPY ops is:
+     * (MAX_IB_SIZE - sizeof(WAIT_REG_MEM) - sizeof(WRITE_DATA_CMD)) / sizeof(DMA_COPY)
+     * (1024 - 7 - 6) / 7 = 144
+     *
+     * Each DMA_COPY op can transfer (1 << 26) bytes (64 MB), so 144 ops can transfer 9 GB.
+     * trap_buffer_size is a 32-bit number, so the buffer must be < 4 GB. So we are not limited
+     * by Indirect Buffer size. Set current limit to 256 MB to limit device VRAM usage
+     */
+    const size_t max_trap_buffer_size =
+        core::Runtime::runtime_singleton_->flag().pc_sampling_max_device_buffer_size();
+
+    /*
+     * We use a double-buffer mechanism where there are 2 trap-buffers and 1 host-buffer
+     * Warning: This currently assumes that client latency is smaller than the time to fill 1
+     * trap-buffer. If latency is bigger, we have to increase the host-buffer size
+     *
+     * host-buffer must be >= client-buffer so that we can copy full size of client-buffer each
+     * time. To avoid having to deal with wrap-arounds, host-buffer must be a multiple of
+     * trap-buffers
+     *
+     * if client-buffer size is greater than 2x max_trap_buffer_size:
+     *    We are limited by max_trap_buffer_size.
+     *    trap-buffer = max-trap-buffer-size
+     *    host-buffer = 2*smallest size greater than client-buffer but multiple of 1 trap-buffer
+     * else:
+     *    We reduce the trap-buffers so that:
+     *    trap-buffer = half of client-buffer
+     *    host-buffer = 2*client-buffer
+     *
+     * TODO: We are currently using a temporary host-buffer so that we can increase host-buffer to
+     * factor in client latency. Using a direct-copy to the client buffer would be more efficient.
+     * Revisit this once we have empirical data of latency vs how long it takes to fill 1
+     * trap-buffer.
+     */
+
+    size_t trap_buffer_size = 0;
+    if (session.buffer_size() > 2 * max_trap_buffer_size) {
+      trap_buffer_size = max_trap_buffer_size;
+      ht_data.host_buffer_size = 2 * AlignUp(session.buffer_size(), trap_buffer_size);
+    } else {
+      trap_buffer_size = session.buffer_size() / 2;
+      ht_data.host_buffer_size = 2 * session.buffer_size();
+    }
+
+    ht_data.host_buffer = (uint8_t*)system_allocator()(ht_data.host_buffer_size, 0x1000, 0);
+    if (!ht_data.host_buffer) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+
+    if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, ht_data.host_buffer) !=
+        HSA_STATUS_SUCCESS)
+      return HSA_STATUS_ERROR;
+
+    device_datahost->buf_size = trap_buffer_size / session.sample_size();
+
+    if (HSA::hsa_signal_create(1, 0, NULL, &device_datahost->done_sig0) != HSA_STATUS_SUCCESS)
+      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+
+    if (HSA::hsa_signal_create(1, 0, NULL, &device_datahost->done_sig1) != HSA_STATUS_SUCCESS)
+      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+
+    // TODO: Once we have things working, measure the latency after the 2nd level trap handler
+    // decrements the signals and set the watermarks accordingly
+    device_datahost->buf_watermark0 = 0.8 * device_datahost->buf_size;
+    device_datahost->buf_watermark1 = 0.8 * device_datahost->buf_size;
+
+    // Allocate device memory for 2nd level trap handler TMA
+    size_t deviceAllocSize = sizeof(*ht_data.device_data) + (2 * trap_buffer_size);
+    ht_data.device_data = (pcs_hosttrap_sampling_data_t*)finegrain_allocator()(deviceAllocSize, 0);
+    if (ht_data.device_data == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+
+    // This cpuAgent is the owner of the system_allocator() pool
+    auto cpuAgent = GetNearestCpuAgent()->public_handle();
+    hsa_status_t ret = AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, ht_data.device_data);
+    assert(ret == HSA_STATUS_SUCCESS);
+
+    if (DmaCopy(ht_data.device_data, device_datahost, sizeof(*device_datahost)) !=
+        HSA_STATUS_SUCCESS) {
+      debug_print("Failed to dmaCopy!\n");
+      return HSA_STATUS_ERROR;
+    }
+
+    uint8_t* device_buf_ptr =
+        ((uint8_t*)ht_data.device_data) + sizeof(pcs_hosttrap_sampling_data_t);
+    if (DmaFill(device_buf_ptr, 0, deviceAllocSize - sizeof(pcs_hosttrap_sampling_data_t)) !=
+        HSA_STATUS_SUCCESS) {
+      debug_print("Failed to dmaFill!\n");
+      return HSA_STATUS_ERROR;
+    }
+
+    ht_data.session = &session;
+    freeHostTrapResources.Dismiss();
+  }

  session.SetThunkId(ioctlId);
  ht_data.session = &session;
@@ -2417,6 +2551,16 @@ hsa_status_t GpuAgent::PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& ses
  HSAKMT_STATUS retKmt = hsaKmtPcSamplingDestroy(node_id(), session.ThunkId());
  ht_data.session = NULL;

+  if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
+    HSA::hsa_signal_destroy(ht_data.device_data->done_sig0);
+    HSA::hsa_signal_destroy(ht_data.device_data->done_sig1);
+    finegrain_deallocator()(ht_data.device_data);
+    system_deallocator()(ht_data.host_buffer);
+
+    ht_data.device_data = NULL;
+    ht_data.host_buffer = NULL;
+    ht_data.session = NULL;
+  }
  return (retKmt == HSAKMT_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
 }

@@ -2466,6 +2610,10 @@ hsa_status_t GpuAgent::PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& sessio
    throw AMD::hsa_exception(HSA_STATUS_ERROR, "Failed to stop PC Sampling session.");

  if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
+    // Wake up pcs_hosttrap_thread_ if it is waiting for data
+    HSA::hsa_signal_store_screlease(ht_data.device_data->done_sig0, -1);
+    HSA::hsa_signal_store_screlease(ht_data.device_data->done_sig1, -1);
+
    os::WaitForThread(ht_data.thread);
    os::CloseThread(ht_data.thread);
    ht_data.thread = NULL;
@@ -2475,11 +2623,24 @@ hsa_status_t GpuAgent::PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& sessio
 }

 void GpuAgent::PcSamplingThread() {
+  uint64_t which_buffer = 0;
  pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
+  hsa_signal_t done_sig[] = {ht_data.device_data->done_sig0, ht_data.device_data->done_sig1};
+
  while (ht_data.session->isActive()) {
+    do {
+      hsa_signal_value_t val = HSA::hsa_signal_wait_scacquire(
+          done_sig[which_buffer], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
+          HSA_WAIT_STATE_ACTIVE);  // TODO: change to HSA_WAIT_STATE_BLOCKED later
+
+      if (val == -1) goto thread_exit;
+      if (val == 0) break;
+    } while (true);
+
    // Implement code to read data from 2nd level trap handler here
    sleep(1);
  }
+thread_exit:
  debug_print("PcSamplingThread::Exiting\n");
 }

@@ -67,6 +67,7 @@ class Flag {
  // Lift limit for 2.10 release RCCL workaround. This limit is not used when asynchronous scratch
  // reclaim is supported
  const size_t DEFAULT_SCRATCH_SINGLE_LIMIT = 146800640;  // small_limit >> 2;
+  const size_t DEFAULT_PCS_MAX_DEVICE_BUFFER_SIZE = 256 * 1024 * 1024;

  explicit Flag() { Refresh(); }

@@ -230,6 +231,13 @@ class Flag {

    var = os::GetEnvVar("HSA_ENABLE_IPC_MODE_LEGACY");
    enable_ipc_mode_legacy_ = (var == "1") ? true : false;
+    if (os::IsEnvVarSet("HSA_PCS_MAX_DEVICE_BUFFER_SIZE")) {
+      var = os::GetEnvVar("HSA_PCS_MAX_DEVICE_BUFFER_SIZE");
+      char* end;
+      pc_sampling_max_device_buffer_size_ = strtoul(var.c_str(), &end, 10);
+    } else {
+      pc_sampling_max_device_buffer_size_ = DEFAULT_PCS_MAX_DEVICE_BUFFER_SIZE;
+    }

    // Temporary environment variable to disable CPU affinity override
    // Will either rename to HSA_OVERRIDE_CPU_AFFINITY later or remove completely.
@@ -341,6 +349,8 @@ class Flag {

  bool enable_ipc_mode_legacy() const { return enable_ipc_mode_legacy_; }

+  size_t pc_sampling_max_device_buffer_size() const { return pc_sampling_max_device_buffer_size_; }
+
 private:
  bool check_flat_scratch_;
  bool enable_vm_fault_message_;
@@ -396,6 +406,8 @@ class Flag {

  SRAMECC_ENABLE sramecc_enable_;

+  size_t pc_sampling_max_device_buffer_size_;
+
  // Map GPU index post RVD to its default cu mask.
  std::map<uint32_t, std::vector<uint32_t>> cu_mask_;