diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index c75cb7a8b8..4aa65edaee 100644 --- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -508,24 +508,20 @@ class GpuAgent : public GpuAgentInt { // @brief Binds the second-level trap handler to this node. void BindTrapHandler(); - hsa_status_t UpdateTrapHandlerWithPCS(void* pcs_hosttrap_buffers, void* stochastic_hosttrap_buffers); // @brief Override from core::Agent. hsa_status_t EnableDmaProfiling(bool enable) override; hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb, void* cb_data) override; - hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session) override; + hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session); hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId, - pcs::PcsRuntime::PcSamplingSession& session) override; - hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) override; - hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session) override; - hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session) override; - hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session) override; - hsa_status_t PcSamplingFlushHostTrapDeviceBuffers(pcs::PcsRuntime::PcSamplingSession& session); - - static void PcSamplingThreadRun(void* agent); - void PcSamplingThread(); + pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingFlushDeviceBuffers(pcs::PcsRuntime::PcSamplingSession& session); // @brief Node properties. 
const HsaNodeProperties properties_; @@ -749,13 +745,13 @@ class GpuAgent : public GpuAgentInt { uint8_t reserved1[16]; /* pc_sample_t buffer0[buf_size]; */ /* pc_sample_t buffer1[buf_size]; */ - } pcs_hosttrap_sampling_data_t; + } pcs_sampling_data_t; typedef struct { - /* Hosttrap data - stored on device so that trap_handler code can access efficiently */ - pcs_hosttrap_sampling_data_t* device_data; + /* Sampling data - stored on device for trap handler access */ + pcs_sampling_data_t* device_data; - /* Hosttrap host buffer - stored on host */ + /* Sampling host buffer - stored on host */ uint8_t* host_buffer; size_t host_buffer_size; uint8_t* host_buffer_wrap_pos; @@ -774,11 +770,16 @@ class GpuAgent : public GpuAgentInt { os::Thread thread; pcs::PcsRuntime::PcSamplingSession* session; - } pcs_hosttrap_t; - - pcs_hosttrap_t pcs_hosttrap_data_; + } pcs_data_t; /* PC Sampling fields - end */ + hsa_status_t UpdateTrapHandlerWithPCS(pcs_sampling_data_t* pcs_hosttrap_buffers, + pcs_sampling_data_t* pcs_stochastic_buffers); + + // @brief Thread function to process PC sampling data collected via host-trap + // or Stochastic sampling. 
+ void PcSamplingThread(pcs_data_t& pcs_data, const char* thread_name); + // @brief device handle amdgpu_device_handle ldrm_dev_; @@ -793,6 +794,12 @@ class GpuAgent : public GpuAgentInt { bool uses_rec_sdma_eng_id_mask_; + // structure for host trap sampling + pcs_data_t pcs_hosttrap_data_; + + // structure for stochastic sampling + pcs_data_t pcs_stochastic_data_; + // @bried XGMI CPU<->GPU bool xgmi_cpu_gpu_; }; diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index d8c5c820e8..25fd06909c 100644 --- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -116,6 +116,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }), trap_handler_tma_region_(NULL), pcs_hosttrap_data_(), + pcs_stochastic_data_(), xgmi_cpu_gpu_(false) { const bool is_apu_node = (properties_.NumCPUCores > 0); profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE; @@ -2166,7 +2167,7 @@ void GpuAgent::SyncClocks() { assert(err == HSAKMT_STATUS_SUCCESS && "hsaGetClockCounters error"); } -hsa_status_t GpuAgent::UpdateTrapHandlerWithPCS(void* pcs_hosttrap_buffers, void* pcs_stochastic_buffers) { +hsa_status_t GpuAgent::UpdateTrapHandlerWithPCS(pcs_sampling_data_t* pcs_hosttrap_buffers, pcs_sampling_data_t* pcs_stochastic_buffers) { // Assemble the trap handler source code. void* tma_addr = nullptr; uint64_t tma_size = 0; @@ -2541,7 +2542,11 @@ hsa_status_t GpuAgent::PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& sess ret = PcSamplingCreateFromId(0, session); if (ret != HSA_STATUS_SUCCESS) return ret; + // Obtain the sampling information from the session. session.GetHsaKmtSamplingInfo(&sampleInfo); + + // Pass the sampling information to the kernel driver to create PC + // sampling session. 
HSAKMT_STATUS retkmt = hsaKmtPcSamplingCreate(node_id(), &sampleInfo, &thunkId); if (retkmt != HSAKMT_STATUS_SUCCESS) { return (retkmt == HSAKMT_STATUS_KERNEL_ALREADY_OPENED) ? (hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY @@ -2557,114 +2562,133 @@ hsa_status_t GpuAgent::PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& sess hsa_status_t GpuAgent::PcSamplingCreateFromId(HsaPcSamplingTraceId ioctlId, pcs::PcsRuntime::PcSamplingSession& session) { - pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + // Determine the sampling method from the session + hsa_ven_amd_pcs_method_kind_t sampling_method = session.method(); - if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { - // TODO: For now can only have 1 hosttrap session at a time. As a final solution, we want to be - // able to support multiple sessions at a time. But this makes the session.HandleSampleData more - // complicated if multiple sessions have different buffer sizes. - if (ht_data.session) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + pcs_data_t* pcs_data = nullptr; - // This is current amd_aql_queue->pm4_ib_size_b_ - ht_data.cmd_data_sz = 0x1000; - ht_data.cmd_data = (uint32_t*)malloc(ht_data.cmd_data_sz); - assert(ht_data.cmd_data); + if (sampling_method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + pcs_data = &pcs_hosttrap_data_; + } else if (sampling_method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) { + pcs_data = &pcs_stochastic_data_; + } else { + // Unsupported sampling method + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } - if (HSA::hsa_signal_create(1, 0, NULL, &ht_data.exec_pm4_signal) != HSA_STATUS_SUCCESS) - return HSA_STATUS_ERROR; + // Ensure only one session is active at a time for the given method + if (pcs_data->session) + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; // TODO: For now, we can only have + // 1 pc sampling session at a + // time. As a final solution, we + // want to be able to support + // multiple sessions at a time. 
+ // But this makes the + // session.HandleSampleData more + // complicated if multiple + // sessions have different buffer + // sizes. - ht_data.old_val = (uint64_t*)system_allocator()(sizeof(uint64_t), 0x1000, 0); - assert(ht_data.old_val); + // This is current amd_aql_queue->pm4_ib_size_b_ + pcs_data->cmd_data_sz = 0x1000; // 4KB + pcs_data->cmd_data = (uint32_t*)malloc(pcs_data->cmd_data_sz); + if (!pcs_data->cmd_data) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, ht_data.old_val)) - return HSA_STATUS_ERROR; + if (HSA::hsa_signal_create(1, 0, NULL, &pcs_data->exec_pm4_signal) != HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; - // Local copy of hosttrap data - we cannot access device memory directly on non-large BAR - // systems - pcs_hosttrap_sampling_data_t* device_datahost = - (pcs_hosttrap_sampling_data_t*)system_allocator()(sizeof(*device_datahost), 0x1000, 0); - if (!device_datahost) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + pcs_data->old_val = (uint64_t*)system_allocator()(sizeof(uint64_t), 0x1000, 0); + if (!pcs_data->old_val) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - MAKE_SCOPE_GUARD([&]() { system_deallocator()(device_datahost); }); + if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, pcs_data->old_val)) + return HSA_STATUS_ERROR; - memset(device_datahost, 0, sizeof(*device_datahost)); + // Local copy of pc sampling data - we cannot access device memory directly on non-large BAR + // systems + pcs_sampling_data_t* device_datahost = + (pcs_sampling_data_t*)system_allocator()(sizeof(pcs_sampling_data_t), 0x1000, 0); + if (!device_datahost) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, device_datahost) != - HSA_STATUS_SUCCESS) - return HSA_STATUS_ERROR; + MAKE_SCOPE_GUARD([&]() { system_deallocator()(device_datahost); }); - MAKE_NAMED_SCOPE_GUARD(freeHostTrapResources, [&]() { - if (ht_data.device_data) { - if 
(ht_data.device_data->done_sig0.handle) - HSA::hsa_signal_destroy(ht_data.device_data->done_sig0); - if (ht_data.device_data->done_sig1.handle) - HSA::hsa_signal_destroy(ht_data.device_data->done_sig1); + memset(device_datahost, 0, sizeof(*device_datahost)); - finegrain_deallocator()(ht_data.device_data); - } - if (ht_data.host_buffer) system_deallocator()(ht_data.host_buffer); - }); + if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, device_datahost) != + HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; - // Force creating of PC Sampling queue to trigger exception early in case we exceed max availble - // CP queues on this agent - queues_[QueuePCSampling].touch(); + MAKE_NAMED_SCOPE_GUARD(freeResources, [&]() { + if (pcs_data->device_data) { + if (pcs_data->device_data->done_sig0.handle) + HSA::hsa_signal_destroy(pcs_data->device_data->done_sig0); + if (pcs_data->device_data->done_sig1.handle) + HSA::hsa_signal_destroy(pcs_data->device_data->done_sig1); - /* - * When calling queue->ExecutePM4() Indirect Buffer size which is 0x1000 bytes (1024 DW). - * The maximum indirect buffer size we need occurs when we enqueue the - * WAIT_REG_MEM, DMA_COPY(s), WRITE_DATA ops: - * For WAIT_REG_MEM = 7 DW - * For each DMA_COPY = 7 DW - * For WRITE_DATA_CMD = 6 DW - * - * So maximum number of DMA_COPY ops is: - * (MAX_IB_SIZE - sizeof(WAIT_REG_MEM) - sizeof(WRITE_DATA_CMD)) / sizeof(DMA_COPY) - * (1024 - 7 - 6) / 7 = 144 - * - * Each DMA_COPY op can transfer (1 << 26) bytes, which is 9 GB. trap_buffer_size is a 32-bit - * number, so the buffer must be < 4 GB. So we are not limited by Indirect Buffer size. 
- * Set current limit to 256 MB to limit device VRAM usage - */ - const size_t max_trap_buffer_size = - core::Runtime::runtime_singleton_->flag().pc_sampling_max_device_buffer_size(); + finegrain_deallocator()(pcs_data->device_data); + } + if (pcs_data->host_buffer) system_deallocator()(pcs_data->host_buffer); + }); - /* - * We use a double-buffer mechanism where there are 2 trap-buffers and 1 host-buffer - * Warning: This currently assumes that client latency is smaller than time to fill 1 - * trap-buffer If latency is bigger, we have to increate host-buffer - * - * host-buffer must be >= client-buffer so that we can copy full size of client-buffer each - * time. To avoid having to deal with wrap-arounds, host-buffer must be a multiple of - * trap-buffers - * - * if client-buffer size is greater than 2x max_trap_buffer_size: - * We are limited by max_trap_buffer_size. - * trap-buffer = max-trap-buffer-size - * host-buffer = 2*smallest size greater than client-buffer but multiple of 1 trap-buffer - * else: - * We reduce the trap-buffers so that: - * trap-buffer = half of user-buffer - * host-buffer = 2*user-buffer - * - * TODO: We are currently using a temporary host-buffer so that we can increase host-buffer to - * factor in client latency. Using a direct-copy to the client buffer would be more efficient. - * Revisit this once we have empirical data of latency vs how long it takes to fill 1 - * trap-buffer. - */ + // Force creating of PC Sampling queue to trigger exception early in case we exceed max available + // CP queues on this agent + queues_[QueuePCSampling].touch(); - size_t trap_buffer_size = 0; - if (session.buffer_size() > 2 * max_trap_buffer_size) { - trap_buffer_size = max_trap_buffer_size; - ht_data.host_buffer_size = 2 * AlignUp(session.buffer_size(), trap_buffer_size); + /* + * When calling queue->ExecutePM4() Indirect Buffer size which is 0x1000 bytes (1024 DW). 
+ * The maximum indirect buffer size we need occurs when we enqueue the + * WAIT_REG_MEM, DMA_COPY(s), WRITE_DATA ops: + * For WAIT_REG_MEM = 7 DW + * For each DMA_COPY = 7 DW + * For WRITE_DATA_CMD = 6 DW + * + * So maximum number of DMA_COPY ops is: + * (MAX_IB_SIZE - sizeof(WAIT_REG_MEM) - sizeof(WRITE_DATA_CMD)) / sizeof(DMA_COPY) + * (1024 - 7 - 6) / 7 = 144 + * + * Each DMA_COPY op can transfer (1 << 26) bytes, so 144 ops cover 9 GB. trap_buffer_size is a 32-bit + * number, so the buffer must be < 4 GB. So we are not limited by Indirect Buffer size. + * Set current limit to 256 MB to limit device VRAM usage + */ + const size_t max_trap_buffer_size = + core::Runtime::runtime_singleton_->flag().pc_sampling_max_device_buffer_size(); + + /* + * We use a double-buffer mechanism where there are 2 trap-buffers and 1 host-buffer + * Warning: This currently assumes that client latency is smaller than time to fill 1 + * trap-buffer. If latency is bigger, we have to increase host-buffer + * + * host-buffer must be >= client-buffer so that we can copy full size of client-buffer each + * time. To avoid having to deal with wrap-arounds, host-buffer must be a multiple of + * trap-buffers + * + * if client-buffer size is greater than 2x max_trap_buffer_size: + * We are limited by max_trap_buffer_size. + * trap-buffer = max-trap-buffer-size + * host-buffer = 2*smallest size greater than client-buffer but multiple of 1 trap-buffer + * else: + * We reduce the trap-buffers so that: + * trap-buffer = half of user-buffer + * host-buffer = 2*user-buffer + * + * TODO: We are currently using a temporary host-buffer so that we can increase host-buffer to + * factor in client latency. Using a direct-copy to the client buffer would be more efficient. + * Revisit this once we have empirical data of latency vs how long it takes to fill 1 + * trap-buffer. 
+ */ + + size_t trap_buffer_size = 0; + if (session.buffer_size() > 2 * max_trap_buffer_size) { + trap_buffer_size = max_trap_buffer_size; + pcs_data->host_buffer_size = 2 * AlignUp(session.buffer_size(), trap_buffer_size); } else { trap_buffer_size = session.buffer_size() / 2; - ht_data.host_buffer_size = 2 * session.buffer_size(); + pcs_data->host_buffer_size = 2 * session.buffer_size(); } - ht_data.host_buffer = (uint8_t*)system_allocator()(ht_data.host_buffer_size, 0x1000, 0); - if (!ht_data.host_buffer) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + pcs_data->host_buffer = (uint8_t*)system_allocator()(pcs_data->host_buffer_size, 0x1000, 0); + if (!pcs_data->host_buffer) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, ht_data.host_buffer) != + if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, pcs_data->host_buffer) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR; @@ -2682,101 +2706,162 @@ hsa_status_t GpuAgent::PcSamplingCreateFromId(HsaPcSamplingTraceId ioctlId, device_datahost->buf_watermark1 = 0.8 * device_datahost->buf_size; // Allocate device memory for 2nd level trap handler TMA - size_t deviceAllocSize = sizeof(*ht_data.device_data) + (2 * trap_buffer_size); - ht_data.device_data = (pcs_hosttrap_sampling_data_t*)finegrain_allocator()(deviceAllocSize, 0); - if (ht_data.device_data == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + size_t deviceAllocSize = sizeof(*pcs_data->device_data) + (2 * trap_buffer_size); + pcs_data->device_data = (pcs_sampling_data_t*)finegrain_allocator()(deviceAllocSize, 0); + if (pcs_data->device_data == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; // This cpuAgent is the owner of the system_allocator() pool auto cpuAgent = GetNearestCpuAgent()->public_handle(); - hsa_status_t ret = AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, ht_data.device_data); - assert(ret == HSA_STATUS_SUCCESS); + if (AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, 
pcs_data->device_data) != HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; - if (DmaCopy(ht_data.device_data, device_datahost, sizeof(*device_datahost)) != + if (DmaCopy(pcs_data->device_data, device_datahost, sizeof(*device_datahost)) != HSA_STATUS_SUCCESS) { debug_print("Failed to dmaCopy!\n"); return HSA_STATUS_ERROR; } uint8_t* device_buf_ptr = - ((uint8_t*)ht_data.device_data) + sizeof(pcs_hosttrap_sampling_data_t); - if (DmaFill(device_buf_ptr, 0, deviceAllocSize - sizeof(pcs_hosttrap_sampling_data_t)) != + ((uint8_t*)pcs_data->device_data) + sizeof(pcs_sampling_data_t); + if (DmaFill(device_buf_ptr, 0, deviceAllocSize - sizeof(pcs_sampling_data_t)) != HSA_STATUS_SUCCESS) { debug_print("Failed to dmaFill!\n"); return HSA_STATUS_ERROR; } - ht_data.lost_sample_count = 0; - ht_data.host_buffer_wrap_pos = 0; - ht_data.host_write_ptr = ht_data.host_buffer; - ht_data.host_read_ptr = ht_data.host_write_ptr; + pcs_data->lost_sample_count = 0; + pcs_data->host_buffer_wrap_pos = 0; + pcs_data->host_write_ptr = pcs_data->host_buffer; + pcs_data->host_read_ptr = pcs_data->host_write_ptr; - ht_data.session = &session; - freeHostTrapResources.Dismiss(); + pcs_data->session = &session; - if (UpdateTrapHandlerWithPCS(ht_data.device_data, NULL) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR; - } + if (UpdateTrapHandlerWithPCS( + sampling_method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1 ? pcs_data->device_data : nullptr, + sampling_method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1 + ? 
pcs_data->device_data + : nullptr) != HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; - session.SetThunkId(ioctlId); - ht_data.session = &session; + session.SetThunkId(ioctlId); - return HSA_STATUS_SUCCESS; + freeResources.Dismiss(); + + return HSA_STATUS_SUCCESS; } hsa_status_t GpuAgent::PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) { if (PcSamplingStop(session) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR; - pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; HSAKMT_STATUS retKmt = hsaKmtPcSamplingDestroy(node_id(), session.ThunkId()); - ht_data.session = NULL; + hsa_ven_amd_pcs_method_kind_t sampling_method = session.method(); - if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { - free(ht_data.cmd_data); - system_deallocator()(ht_data.old_val); - HSA::hsa_signal_destroy(ht_data.exec_pm4_signal); - HSA::hsa_signal_destroy(ht_data.device_data->done_sig0); - HSA::hsa_signal_destroy(ht_data.device_data->done_sig1); - finegrain_deallocator()(ht_data.device_data); - system_deallocator()(ht_data.host_buffer); + pcs_data_t* pcs_data = nullptr; - ht_data.device_data = NULL; - ht_data.host_buffer = NULL; - ht_data.session = NULL; - - UpdateTrapHandlerWithPCS(NULL, NULL); + if (sampling_method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + pcs_data = &pcs_hosttrap_data_; + } else if (sampling_method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) { + pcs_data = &pcs_stochastic_data_; + } else { + // Unsupported sampling method + return HSA_STATUS_ERROR_INVALID_ARGUMENT; } + + // Mark session as inactive + pcs_data->session = nullptr; + + free(pcs_data->cmd_data); + system_deallocator()(pcs_data->old_val); + HSA::hsa_signal_destroy(pcs_data->exec_pm4_signal); + HSA::hsa_signal_destroy(pcs_data->device_data->done_sig0); + HSA::hsa_signal_destroy(pcs_data->device_data->done_sig1); + finegrain_deallocator()(pcs_data->device_data); + system_deallocator()(pcs_data->host_buffer); + + pcs_data->device_data = NULL; + pcs_data->host_buffer = NULL; + pcs_data->session = 
NULL; + + // Update the trap handler to clear any associated device data + UpdateTrapHandlerWithPCS(nullptr, nullptr); + return (retKmt == HSAKMT_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; } hsa_status_t GpuAgent::PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session) { if (session.isActive()) return HSA_STATUS_SUCCESS; - pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; auto method = session.method(); + + pcs_data_t* pcs_data = nullptr; + const char* thread_name = nullptr; if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { - if (ht_data.session->isActive()) { - debug_warning("Already have a Host trap session in progress!"); - return (hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY; - } - ht_data.session->start(); - // This thread will handle all hosttrap sessions on this agent - // In the future, there will be another thread to handle stochastic sessions. - ht_data.thread = os::CreateThread(PcSamplingThreadRun, (void*)this); - if (!ht_data.thread) - throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, - "Failed to start PC Sampling thread."); + pcs_data = &pcs_hosttrap_data_; + thread_name = "PcSamplingHostTrapThread"; + } else if (method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) { + pcs_data = &pcs_stochastic_data_; + thread_name = "PcSamplingStochasticThread"; + } else { + // Unsupported sampling method + return HSA_STATUS_ERROR_INVALID_ARGUMENT; } + // Check if a session is already active + if (pcs_data->session && pcs_data->session->isActive()) { + debug_warning("Already have a PC sampling session in progress!"); + return (hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY; + } + + // Assign the new session and mark it as active + pcs_data->session = &session; + pcs_data->session->start(); + + // Creating thread data + struct ThreadData { + GpuAgent* agent; + pcs_data_t* pcs_data; + const char* thread_name; + }; + + auto* thread_data = new ThreadData{this, pcs_data, thread_name}; + + // This thread will handle all PC Sampling sessions on this agent 
+ pcs_data->thread = os::CreateThread( + [](void* arg) -> void { + auto* thread_data = static_cast<ThreadData*>(arg); + try { + GpuAgent* agent = thread_data->agent; + pcs_data_t* pcs_data = thread_data->pcs_data; + const char* thread_name = thread_data->thread_name; + + agent->PcSamplingThread(*pcs_data, thread_name); + } catch (...) { + fprintf(stdout, "Exception caught in PcSamplingThread. Exiting the thread!"); + } + + delete thread_data; + }, + thread_data); + + if (!pcs_data->thread) { + // if thread creation failed + delete thread_data; + throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, + "Failed to start PC Sampling thread."); + } + + // Start the sampling session in the kernel driver if (hsaKmtPcSamplingStart(node_id(), session.ThunkId()) == HSAKMT_STATUS_SUCCESS) return HSA_STATUS_SUCCESS; debug_print("Failed to start PC sampling session with thunkId:%d\n", session.ThunkId()); - if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { - ht_data.session->stop(); - os::WaitForThread(ht_data.thread); - os::CloseThread(ht_data.thread); - ht_data.thread = NULL; - } + // Clean up if starting the session failed + pcs_data->session->stop(); + os::WaitForThread(pcs_data->thread); + os::CloseThread(pcs_data->thread); + pcs_data->thread = nullptr; + pcs_data->session = nullptr; return HSA_STATUS_ERROR; } @@ -2784,35 +2869,51 @@ hsa_status_t GpuAgent::PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& sessi hsa_status_t GpuAgent::PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session) { if (!session.isActive()) return HSA_STATUS_SUCCESS; - pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; - + // Stop the session session.stop(); + // Stop PC sampling in the kernel driver HSAKMT_STATUS retKmt = hsaKmtPcSamplingStop(node_id(), session.ThunkId()); if (retKmt != HSAKMT_STATUS_SUCCESS) throw AMD::hsa_exception(HSA_STATUS_ERROR, "Failed to stop PC Sampling session."); - if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { - // Wake up pcs_hosttrap_thread_ if it is 
waiting for data - HSA::hsa_signal_store_screlease(ht_data.device_data->done_sig0, -1); - HSA::hsa_signal_store_screlease(ht_data.device_data->done_sig1, -1); + // Determine the sampling method and corresponding data + pcs_data_t* pcs_data = nullptr; + auto method = session.method(); - os::WaitForThread(ht_data.thread); - os::CloseThread(ht_data.thread); - ht_data.thread = NULL; + if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + pcs_data = &pcs_hosttrap_data_; + } else if (method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) { + pcs_data = &pcs_stochastic_data_; + } else { + // Unsupported sampling method + return HSA_STATUS_ERROR_INVALID_ARGUMENT; } + // Wake up the PC sampling thread if it is waiting for data + HSA::hsa_signal_store_screlease(pcs_data->device_data->done_sig0, -1); + HSA::hsa_signal_store_screlease(pcs_data->device_data->done_sig1, -1); + + // Wait for the thread to finish and clean up + os::WaitForThread(pcs_data->thread); + os::CloseThread(pcs_data->thread); + pcs_data->thread = nullptr; + pcs_data->session = nullptr; return HSA_STATUS_SUCCESS; } -hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers( +hsa_status_t GpuAgent::PcSamplingFlushDeviceBuffers( pcs::PcsRuntime::PcSamplingSession& session) { - pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; - uint32_t& which_buffer = ht_data.which_buffer; - uint32_t* cmd_data = ht_data.cmd_data; - size_t& cmd_data_sz = ht_data.cmd_data_sz; - uint64_t* old_val = ht_data.old_val; - hsa_signal_t& exec_pm4_signal = ht_data.exec_pm4_signal; + pcs_data_t* pcs_data = nullptr; + + if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + pcs_data = &pcs_hosttrap_data_; + } else if (session.method() == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) { + pcs_data = &pcs_stochastic_data_; + } else { + // Unsupported sampling method: nothing to flush + return HSA_STATUS_SUCCESS; + } /* * Device-buffer to Host-buffer to User-Buffer copy logic @@ -2951,19 +3052,33 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers( uint32_t 
pred_exec_cmd_sz = 0; - uint8_t* host_buffer_begin = ht_data.host_buffer; - uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size; + uint64_t buf_write_val; + uint64_t buf_written_val[2]; + size_t buf_offset; + uint8_t* buffer[2]; + size_t buf_size; - uint64_t buf_write_val = (uint64_t) & (ht_data.device_data->buf_write_val); - uint64_t buf_written_val[] = {(uint64_t) & (ht_data.device_data->buf_written_val0), - (uint64_t) & (ht_data.device_data->buf_written_val1)}; + uint32_t& which_buffer = pcs_data->which_buffer; + uint32_t* cmd_data = pcs_data->cmd_data; + size_t cmd_data_sz = pcs_data->cmd_data_sz; + uint64_t* old_val = pcs_data->old_val; + hsa_signal_t& exec_pm4_signal = pcs_data->exec_pm4_signal; - size_t const buf_offset = offsetof(pcs_hosttrap_sampling_data_t, reserved1) + - sizeof(((pcs_hosttrap_sampling_data_t*)0)->reserved1); + uint8_t* host_buffer_begin = pcs_data->host_buffer; + size_t& host_buffer_size = pcs_data->host_buffer_size; + uint8_t*& host_write_ptr = pcs_data->host_write_ptr; + uint8_t* host_buffer_end = host_buffer_begin + host_buffer_size; - uint8_t* buffer[] = {(uint8_t*)ht_data.device_data + buf_offset, - (uint8_t*)ht_data.device_data + buf_offset + - ht_data.device_data->buf_size * session.sample_size()}; + buf_write_val = reinterpret_cast<uint64_t>(&pcs_data->device_data->buf_write_val); + buf_written_val[0] = reinterpret_cast<uint64_t>(&pcs_data->device_data->buf_written_val0); + buf_written_val[1] = reinterpret_cast<uint64_t>(&pcs_data->device_data->buf_written_val1); + buf_size = pcs_data->device_data->buf_size; + + buf_offset = + offsetof(pcs_sampling_data_t, reserved1) + sizeof(((pcs_sampling_data_t*)0)->reserved1); + + buffer[0] = reinterpret_cast<uint8_t*>(pcs_data->device_data) + buf_offset; + buffer[1] = buffer[0] + buf_size * session.sample_size(); next_buffer = (which_buffer + 1) % 2; reset_write_val = (uint64_t)next_buffer << 63; @@ -3022,25 +3137,25 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers( /* If the number of entries 
in old_val is larger than buf_size, then there was a buffer overflow * and the 2nd level trap handler code will skip recording samples, causing lost samples */ - if (*old_val > (uint64_t)ht_data.device_data->buf_size) { - ht_data.lost_sample_count = *old_val - (uint64_t)ht_data.device_data->buf_size; - *old_val = (uint64_t)ht_data.device_data->buf_size; + if (*old_val > buf_size) { + pcs_data->lost_sample_count = *old_val - buf_size; + *old_val = buf_size; } to_copy = *old_val * session.sample_size(); /* Make sure there is enough space after host_write_ptr */ - if (ht_data.host_write_ptr + to_copy >= host_buffer_end) { + if (host_write_ptr + to_copy >= host_buffer_end) { // Need to wrap around - ht_data.host_buffer_wrap_pos = ht_data.host_write_ptr; - ht_data.host_write_ptr = host_buffer_begin; + pcs_data->host_buffer_wrap_pos = host_write_ptr; + host_write_ptr = host_buffer_begin; } i = 0; memset(cmd_data, 0, cmd_data_sz); if (properties_.NumXcc > 1) { - const uint32_t n = ceil(to_copy / (32 * 1024 * 1024)); + const uint64_t n = ceil(to_copy / (32 * 1024 * 1024)); pred_exec_cmd_sz = 2; cmd_data[i++] = PM4_HDR(PM4_HDR_IT_OPCODE_PRED_EXEC, pred_exec_cmd_sz, isa_->GetMajorVersion()); cmd_data[i++] = @@ -3073,7 +3188,8 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers( unsigned int num_copy_command = 0; uint8_t* buffer_temp = buffer[which_buffer]; - for (copy_bytes = CP_DMA_DATA_TRANSFER_CNT_MAX; 0 < to_copy; to_copy -= copy_bytes) { + for (copy_bytes = std::min(to_copy, (uint32_t)CP_DMA_DATA_TRANSFER_CNT_MAX); 0 < to_copy; + to_copy -= copy_bytes) { num_copy_command++; /* DMA_DATA PACKETS, copy buffer using CPDMA */ @@ -3082,9 +3198,8 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers( PM4_DMA_DATA_SRC_SEL_SRC_ADDR_USING_L2); cmd_data[i++] = PM4_DMA_DATA_DW2_SRC_ADDR_LO((uint64_t)buffer_temp); cmd_data[i++] = PM4_DMA_DATA_DW3_SRC_ADDR_HI(((uint64_t)buffer_temp) >> 32); - cmd_data[i++] = 
PM4_DMA_DATA_DW4_DST_ADDR_LO((uint64_t)ht_data.host_write_ptr); - cmd_data[i++] = PM4_DMA_DATA_DW5_DST_ADDR_HI(((uint64_t)ht_data.host_write_ptr) >> 32); - + cmd_data[i++] = PM4_DMA_DATA_DW4_DST_ADDR_LO((uint64_t)host_write_ptr); + cmd_data[i++] = PM4_DMA_DATA_DW5_DST_ADDR_HI(((uint64_t)host_write_ptr) >> 32); if (copy_bytes >= to_copy) { copy_bytes = to_copy; cmd_data[i++] = @@ -3093,7 +3208,7 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers( cmd_data[i++] = PM4_DMA_DATA_DW6(PM4_DMA_DATA_BYTE_COUNT(copy_bytes) | PM4_DMA_DATA_DIS_WC); } buffer_temp += copy_bytes; - ht_data.host_write_ptr += copy_bytes; + host_write_ptr += copy_bytes; } /* WRITE_DATA, Reset buf_written_val */ @@ -3117,167 +3232,180 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers( if (val == 0) break; } while (true); + // save the position of next buffer which_buffer = next_buffer; return HSA_STATUS_SUCCESS; } -void GpuAgent::PcSamplingThread() { +void GpuAgent::PcSamplingThread(pcs_data_t& pcs_data, const char* thread_name) { // TODO: Implement lost sample count // TODO: Implement latency - pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; - pcs::PcsRuntime::PcSamplingSession& session = *ht_data.session; - uint32_t& which_buffer = ht_data.which_buffer; + try { + pcs::PcsRuntime::PcSamplingSession& session = *pcs_data.session; + uint32_t& which_buffer = pcs_data.which_buffer; - uint8_t* host_buffer_begin = ht_data.host_buffer; - uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size; + uint8_t* host_buffer_begin = pcs_data.host_buffer; + uint8_t* host_buffer_end = pcs_data.host_buffer + pcs_data.host_buffer_size; - hsa_signal_t done_sig[] = {ht_data.device_data->done_sig0, ht_data.device_data->done_sig1}; + hsa_signal_t done_sig[] = {pcs_data.device_data->done_sig0, pcs_data.device_data->done_sig1}; - while (ht_data.session->isActive()) { - do { - hsa_signal_value_t val = HSA::hsa_signal_wait_scacquire( - done_sig[which_buffer], HSA_SIGNAL_CONDITION_LT, 1, 
UINT64_MAX, HSA_WAIT_STATE_BLOCKED); - if (val == -1) goto thread_exit; - if (val == 0) break; - } while (true); - HSA::hsa_signal_store_screlease(done_sig[which_buffer], 1); + while (pcs_data.session->isActive()) { + // Wait for the signal to process the buffer + do { + hsa_signal_value_t val = HSA::hsa_signal_wait_scacquire( + done_sig[which_buffer], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + if (val == -1) goto thread_exit; + if (val == 0) break; + } while (true); + HSA::hsa_signal_store_screlease(done_sig[which_buffer], 1); - std::lock_guard lock(ht_data.host_buffer_mutex); - if (PcSamplingFlushHostTrapDeviceBuffers(session) != HSA_STATUS_SUCCESS) - goto thread_exit; + // Lock buffer to ensure thread-safe access + std::lock_guard lock(pcs_data.host_buffer_mutex); + // Flush device buffers + if (PcSamplingFlushDeviceBuffers(session) != HSA_STATUS_SUCCESS) + goto thread_exit; - size_t bytes_before_wrap; - size_t bytes_after_wrap; + size_t bytes_before_wrap; + size_t bytes_after_wrap; - assert(ht_data.host_read_ptr >= host_buffer_begin && ht_data.host_read_ptr < host_buffer_end); - assert(ht_data.host_write_ptr >= host_buffer_begin && ht_data.host_write_ptr < host_buffer_end); - assert(ht_data.host_buffer_wrap_pos ? (ht_data.host_read_ptr > ht_data.host_write_ptr) - : (ht_data.host_read_ptr <= ht_data.host_write_ptr)); + assert(pcs_data.host_read_ptr >= host_buffer_begin && pcs_data.host_read_ptr < host_buffer_end); + assert(pcs_data.host_write_ptr >= host_buffer_begin && pcs_data.host_write_ptr < host_buffer_end); + assert(pcs_data.host_buffer_wrap_pos ? 
(pcs_data.host_read_ptr > pcs_data.host_write_ptr) + : (pcs_data.host_read_ptr <= pcs_data.host_write_ptr)); - if (ht_data.host_buffer_wrap_pos) { - assert(ht_data.host_buffer_wrap_pos <= host_buffer_end && - ht_data.host_buffer_wrap_pos > host_buffer_begin); - assert(ht_data.host_read_ptr <= ht_data.host_buffer_wrap_pos); + if (pcs_data.host_buffer_wrap_pos) { + assert(pcs_data.host_buffer_wrap_pos <= host_buffer_end && + pcs_data.host_buffer_wrap_pos > host_buffer_begin); + assert(pcs_data.host_read_ptr <= pcs_data.host_buffer_wrap_pos); - // Wrapped around - bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr; - bytes_after_wrap = ht_data.host_write_ptr - host_buffer_begin; + // Wrapped around + bytes_before_wrap = pcs_data.host_buffer_wrap_pos - pcs_data.host_read_ptr; + bytes_after_wrap = pcs_data.host_write_ptr - host_buffer_begin; - while (bytes_before_wrap >= session.buffer_size()) { - session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0, - ht_data.lost_sample_count); - ht_data.host_read_ptr += session.buffer_size(); - bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr; - ht_data.lost_sample_count = 0; - } + while (bytes_before_wrap >= session.buffer_size()) { + session.HandleSampleData(pcs_data.host_read_ptr, session.buffer_size(), nullptr, 0, + pcs_data.lost_sample_count); + pcs_data.host_read_ptr += session.buffer_size(); + bytes_before_wrap = pcs_data.host_buffer_wrap_pos - pcs_data.host_read_ptr; + pcs_data.lost_sample_count = 0; + } - if (bytes_before_wrap + bytes_after_wrap >= session.buffer_size()) { - session.HandleSampleData(ht_data.host_read_ptr, bytes_before_wrap, host_buffer_begin, - (session.buffer_size() - bytes_before_wrap), 0); - ht_data.host_read_ptr = host_buffer_begin + (session.buffer_size() - bytes_before_wrap); - bytes_before_wrap = 0; - ht_data.host_buffer_wrap_pos = 0; - bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; - ht_data.lost_sample_count 
= 0; - } + if (bytes_before_wrap + bytes_after_wrap >= session.buffer_size()) { + session.HandleSampleData(pcs_data.host_read_ptr, bytes_before_wrap, host_buffer_begin, + (session.buffer_size() - bytes_before_wrap), 0); + pcs_data.host_read_ptr = host_buffer_begin + (session.buffer_size() - bytes_before_wrap); + bytes_before_wrap = 0; + pcs_data.host_buffer_wrap_pos = 0; + bytes_after_wrap = pcs_data.host_write_ptr - pcs_data.host_read_ptr; + pcs_data.lost_sample_count = 0; + } - while (bytes_after_wrap >= session.buffer_size()) { - session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0, - ht_data.lost_sample_count); - ht_data.host_read_ptr += session.buffer_size(); - bytes_before_wrap = 0; - bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; - ht_data.lost_sample_count = 0; - } - } else { - bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; + while (bytes_after_wrap >= session.buffer_size()) { + session.HandleSampleData(pcs_data.host_read_ptr, session.buffer_size(), nullptr, 0, + pcs_data.lost_sample_count); + pcs_data.host_read_ptr += session.buffer_size(); + bytes_before_wrap = 0; + bytes_after_wrap = pcs_data.host_write_ptr - pcs_data.host_read_ptr; + pcs_data.lost_sample_count = 0; + } + } else { + // Handle non-wrapped buffer + bytes_before_wrap = pcs_data.host_write_ptr - pcs_data.host_read_ptr; - while (bytes_before_wrap >= session.buffer_size()) { - assert(ht_data.host_read_ptr >= host_buffer_begin && - ht_data.host_read_ptr + session.buffer_size() < host_buffer_end); - session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0, - ht_data.lost_sample_count); - ht_data.host_read_ptr += session.buffer_size(); - bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; - ht_data.lost_sample_count = 0; + while (bytes_before_wrap >= session.buffer_size()) { + assert(pcs_data.host_read_ptr >= host_buffer_begin && + pcs_data.host_read_ptr + session.buffer_size() <= 
host_buffer_end); + session.HandleSampleData(pcs_data.host_read_ptr, session.buffer_size(), nullptr, 0, + pcs_data.lost_sample_count); + pcs_data.host_read_ptr += session.buffer_size(); + bytes_before_wrap = pcs_data.host_write_ptr - pcs_data.host_read_ptr; + pcs_data.lost_sample_count = 0; + } } } - } thread_exit: - debug_print("PcSamplingThread::Exiting\n"); + debug_print("%s::Exiting\n", thread_name); +} catch (const std::exception& e) { + debug_print("Exception in %s: %s\n", thread_name, e.what()); +} catch (...) { + debug_print("Unknown exception in %s\n", thread_name); } - -void GpuAgent::PcSamplingThreadRun(void* _agent) { - GpuAgent* agent = (GpuAgent*)_agent; - agent->PcSamplingThread(); - debug_print("PcSamplingThread exiting..."); } hsa_status_t GpuAgent::PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session) { - pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + pcs_data_t* pcs_data = nullptr; - uint8_t* host_buffer_begin = ht_data.host_buffer; - uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size; + if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + pcs_data = &pcs_hosttrap_data_; + } else if (session.method() == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) { + pcs_data = &pcs_stochastic_data_; + } else { + return HSA_STATUS_SUCCESS; // Unsupported sampling method + } + + uint8_t* host_buffer_begin = pcs_data->host_buffer; + uint8_t* host_buffer_end = pcs_data->host_buffer + pcs_data->host_buffer_size; size_t bytes_before_wrap; size_t bytes_after_wrap; - std::lock_guard lock(ht_data.host_buffer_mutex); - if (PcSamplingFlushHostTrapDeviceBuffers(session) != HSA_STATUS_SUCCESS) - return HSA_STATUS_ERROR; + std::lock_guard lock(pcs_data->host_buffer_mutex); + // Flush device buffers + if (PcSamplingFlushDeviceBuffers(session) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR; - assert(ht_data.host_read_ptr >= host_buffer_begin && ht_data.host_read_ptr < host_buffer_end); - assert(ht_data.host_write_ptr >= host_buffer_begin && 
ht_data.host_write_ptr < host_buffer_end); - assert(ht_data.host_buffer_wrap_pos ? (ht_data.host_read_ptr > ht_data.host_write_ptr) - : (ht_data.host_read_ptr <= ht_data.host_write_ptr)); + assert(pcs_data->host_read_ptr >= host_buffer_begin && pcs_data->host_read_ptr < host_buffer_end); + assert(pcs_data->host_write_ptr >= host_buffer_begin && + pcs_data->host_write_ptr < host_buffer_end); + assert(pcs_data->host_buffer_wrap_pos ? (pcs_data->host_read_ptr > pcs_data->host_write_ptr) + : (pcs_data->host_read_ptr <= pcs_data->host_write_ptr)); - if (ht_data.host_buffer_wrap_pos) { - assert(ht_data.host_buffer_wrap_pos <= host_buffer_end && - ht_data.host_buffer_wrap_pos > host_buffer_begin); - assert(ht_data.host_read_ptr <= ht_data.host_buffer_wrap_pos); + if (pcs_data->host_buffer_wrap_pos) { + assert(pcs_data->host_buffer_wrap_pos <= host_buffer_end && + pcs_data->host_buffer_wrap_pos > host_buffer_begin); + assert(pcs_data->host_read_ptr <= pcs_data->host_buffer_wrap_pos); - // Wrapped around - bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr; - bytes_after_wrap = ht_data.host_write_ptr - host_buffer_begin; + // Handle wrapped-around buffer + bytes_before_wrap = pcs_data->host_buffer_wrap_pos - pcs_data->host_read_ptr; + bytes_after_wrap = pcs_data->host_write_ptr - host_buffer_begin; while (bytes_before_wrap > 0) { size_t bytes_to_copy = std::min(bytes_before_wrap, session.buffer_size()); - session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0, - ht_data.lost_sample_count); - ht_data.host_read_ptr += bytes_to_copy; - bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr; - ht_data.lost_sample_count = 0; + session.HandleSampleData(pcs_data->host_read_ptr, bytes_to_copy, nullptr, 0, + pcs_data->lost_sample_count); + pcs_data->host_read_ptr += bytes_to_copy; + bytes_before_wrap = pcs_data->host_buffer_wrap_pos - pcs_data->host_read_ptr; + pcs_data->lost_sample_count = 0; } - assert(ht_data.host_read_ptr 
== ht_data.host_buffer_wrap_pos); - ht_data.host_buffer_wrap_pos = 0; - ht_data.host_read_ptr = host_buffer_begin; + assert(pcs_data->host_read_ptr == pcs_data->host_buffer_wrap_pos); + pcs_data->host_buffer_wrap_pos = 0; + pcs_data->host_read_ptr = host_buffer_begin; while (bytes_after_wrap > 0) { size_t bytes_to_copy = std::min(bytes_after_wrap, session.buffer_size()); - session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0, - ht_data.lost_sample_count); - ht_data.host_read_ptr += bytes_to_copy; - bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; - ht_data.lost_sample_count = 0; + session.HandleSampleData(pcs_data->host_read_ptr, bytes_to_copy, nullptr, 0, + pcs_data->lost_sample_count); + pcs_data->host_read_ptr += bytes_to_copy; + bytes_after_wrap = pcs_data->host_write_ptr - pcs_data->host_read_ptr; + pcs_data->lost_sample_count = 0; } } else { - bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; + bytes_before_wrap = pcs_data->host_write_ptr - pcs_data->host_read_ptr; - while (bytes_before_wrap) { + while (bytes_before_wrap > 0) { size_t bytes_to_copy = std::min(bytes_before_wrap, session.buffer_size()); - assert(ht_data.host_read_ptr >= host_buffer_begin && - ht_data.host_read_ptr + bytes_to_copy <= host_buffer_end); + assert(pcs_data->host_read_ptr >= host_buffer_begin && + pcs_data->host_read_ptr + bytes_to_copy <= host_buffer_end); - session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0, - ht_data.lost_sample_count); - ht_data.host_read_ptr += bytes_to_copy; - bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; - ht_data.lost_sample_count = 0; + session.HandleSampleData(pcs_data->host_read_ptr, bytes_to_copy, nullptr, 0, + pcs_data->lost_sample_count); + pcs_data->host_read_ptr += bytes_to_copy; + bytes_before_wrap = pcs_data->host_write_ptr - pcs_data->host_read_ptr; + pcs_data->lost_sample_count = 0; } } return HSA_STATUS_SUCCESS; diff --git 
a/runtime/hsa-runtime/core/runtime/trap_handler/trap_handler.s b/runtime/hsa-runtime/core/runtime/trap_handler/trap_handler.s index 70ffd6dd03..66bc82ec21 100644 --- a/runtime/hsa-runtime/core/runtime/trap_handler/trap_handler.s +++ b/runtime/hsa-runtime/core/runtime/trap_handler/trap_handler.s @@ -53,6 +53,7 @@ .set SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT , 11 .set SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT , 28 .set SQ_WAVE_TRAPSTS_MATH_EXCP , 0x7F +.set SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT , 26 .set SQ_WAVE_MODE_EXCP_EN_SHIFT , 12 .set SQ_WAVE_MODE_EXCP_EN_SIZE , 8 .set TRAP_ID_ABORT , 2 @@ -95,12 +96,23 @@ // TTMP_REG1 means ttmp6 register if gfx>=942 and means ttmp13 register if gfx<942 // TTMP_REG2 means ttmp11 register if gfx>=942 and means ttmp6 register if gfx<942 -.if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4 +.if .amdgcn.gfx_generation_number == 9 .set TTMP11_TTMPS_SETUP_SHIFT , 31 - // Bit to indicate that this is a hosttrap trap instead of stochastic trap - // Currently not used - .set TTMP13_PCS_IS_STOCHASTIC , 24 +.if (.amdgcn.gfx_generation_minor >= 4) + .set TTMP11_WAVE_IN_WG_MASK , 0x3F + + // Bit to indicate that this is a stochastic trap + .set TTMP13_PCS_IS_STOCHASTIC , 21 + + // Bit to indicate that this is a host trap + .set TTMP13_PCS_IS_HOSTTRAP , 22 + +.else + + // Bit to indicate that this is a host trap + .set TTMP11_PCS_IS_HOSTTRAP , 22 +.endif .endif .if (.amdgcn.gfx_generation_number == 9) @@ -205,27 +217,45 @@ // ttmp15 = TMA[63:32] // gfx9: // ttmp1 = 0[2:0], PCRewind[3:0], HostTrap[0], TrapId[7:0], PC[47:32] -// all gfx9 (except gfx942): +// For all gfx9 (except gfx940, gfx941, gfx942): // ttmp6 = 0[6:0], DispatchPktIndx[24:0] // ttmp11 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveInWg[5:0] -// Note: Once stochastic sampling is implemented, L2 Trap Handler will use Bit 23 -// (TTMP11_PCS_IS_STOCHASTIC) to differentiate between stochastic and hosttrap -// gfx942: +// +// For 
gfx940/gfx941/gfx942: // ttmp11 = 0[0], DispatchPktIndx[24:0], WaveIdInWg[5:0] -// ttmp13 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[22:0] +// ttmp13: +// Bits 31:26 : SQ_WAVE_IB_STS[20:15] (1TH) +// 25:24 : 0 on 2TH entry. Used by 1st level TH but also +// free to be used in the 2nd level TH +// 23 : Debug Enabled (1TH) +// 22:0 : values are unspecified on 2TH entry. Free. +// // gfx10: // ttmp1 = 0[0], PCRewind[5:0], HostTrap[0], TrapId[7:0], PC[47:32] +// // gfx10/gfx11: // ttmp6 = 0[6:0], DispatchPktIndx[24:0] +// // gfx1010: // ttmp11 = SQ_WAVE_IB_STS[25], SQ_WAVE_IB_STS[21:15], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] +// // gfx1030/gfx1100: // ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] +// +// ttmp[14:15] points to TMA2; Available: ttmp[2:3], ttmp[4:5] +// +// ttmp7 : gfx9, gfx1010, gfx1030, gfx11 - 31:0 : PC[31:0] (2TH, DBG); +// : gfx940 - free; +// : gfx12 - ttmp7 - 31:16 : workgroup_z[15:0] (SPI) and 15:0 : workgroup_y[15:0] (SPI) trap_entry: - // Branch if not a trap (an exception instead). - s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE - s_cbranch_scc0 .no_skip_debugtrap + // Extract trap_id from ttmp1 into ttmp2 + s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE + s_cbranch_scc0 .not_s_trap // If trap_id == 0, it's not an s_trap nor host trap + + // Check if it was a host trap. 
+ s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT + s_cbranch_scc0 .not_host_trap .if (.amdgcn.gfx_generation_number == 9) // PC_SAMPLING_GFX9 // ttmp[14:15] is TMA2; Available: ttmp[2:3], ttmp[4:5], ttmp7, TTMP_REG1 @@ -236,29 +266,42 @@ trap_entry: // [0x08] out_buf_t* stochastic_trap_buffers; // // --- Start profile trap handlers GFX9 --- // - // if (host_trap) { - // if (stochastic) // Not implemented yet - // ttmp11.bit23 = 1; // Not implemented yet - // profiling_trap_handler(tma->host_trap_buffers); - // } + // If the wave entered the trap handler: + // If on gfx9: + // - Check SQ_WAVE_PC_HI_HT_SHIFT bit on TTMP1 register to + // identify if it was a host trap. + // If a host trap is detected: + // - Mark TTMP13(gfx94x) or TTMP11(gfx9) hosttrap bit + // - Load host_trap_buffers + // - Branch to the profile trap handler logic. + // + // If on gfx9.4+: + // - Check TRAPSTS bit 26 (SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT) to + // identify stochastic traps. + // If a stochastic trap is detected: + // - Set bit 21 in TTMP13 to indicate a stochastic trap. + // - Branch to the profile trap handler logic. 
- s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT - s_cbranch_scc0 .not_host_trap_gfx9 - s_load_dwordx2 ttmp[14:15], ttmp[14:15], 0 glc // ttmp[14:15]=&host_trap_buffers - // TODO: When implementing stochastic sampling, need to set TTMP11_PCS_IS_STOCHASTIC - // or TTMP13_PCS_IS_STOCHASTIC to differentiate between hosttrap and stochastic sampling + s_load_dwordx2 ttmp[2:3], ttmp[14:15], 0 glc // ttmp[14:15]=*host_trap_buffers +.if .amdgcn.gfx_generation_minor >= 4 + s_bitset0_b32 ttmp13, TTMP13_PCS_IS_STOCHASTIC + s_bitset1_b32 ttmp13, TTMP13_PCS_IS_HOSTTRAP // set bit 22 in TTMP13 +.else + s_bitset1_b32 ttmp11, TTMP11_PCS_IS_HOSTTRAP // Set bit 22 in TTMP11 +.endif s_waitcnt lgkmcnt(0) - s_branch .profile_trap_handlers_gfx9 // Off to the profile handlers + s_mov_b64 ttmp[14:15], ttmp[2:3] //now ttmp[14:15] = host_trap_buffers + s_branch .profile_trap_handlers_gfx9 // Off to the profile handlers +.else + // Ignore host traps. They should be masked by the driver anyway. + s_branch .not_s_trap +.endif -.not_host_trap_gfx9: -.endif // PC_SAMPLING_GFX9 - // If caused by s_trap then advance PC. - s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT - s_cbranch_scc1 .not_s_trap +.not_host_trap: + // It's an s_trap; advance the PC s_add_u32 ttmp0, ttmp0, 0x4 s_addc_u32 ttmp1, ttmp1, 0x0 -.not_s_trap: // If llvm.debugtrap and debugger is not attached. s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP s_cbranch_scc0 .no_skip_debugtrap @@ -272,6 +315,24 @@ trap_entry: // Ignore llvm.debugtrap. s_branch .exit_trap +.not_s_trap: +.if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4 + //Check for stochastic trap on gfx9.4+ + s_getreg_b32 ttmp7, hwreg(HW_REG_TRAPSTS) // On gfx94x, TRAPSTS bit 26 ... 
+ s_bitcmp1_b32 ttmp7, SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT // is stochastic_sample_trap + s_cbranch_scc0 .no_skip_debugtrap + + // Handle stochastic trap + s_load_dwordx2 ttmp[2:3], ttmp[14:15], 0x8 glc // ttmp[14:15]=*stoch_trap_buf + s_bitset0_b32 ttmp13, TTMP13_PCS_IS_HOSTTRAP + s_bitset1_b32 ttmp13, TTMP13_PCS_IS_STOCHASTIC // set bit 21 in TTMP13 + s_waitcnt lgkmcnt(0) + s_mov_b64 ttmp[14:15], ttmp[2:3] + s_branch .profile_trap_handlers_gfx9 // Off to the profile handlers +.else + s_branch .no_skip_debugtrap +.endif // PC_SAMPLING_GFX9 + .if (.amdgcn.gfx_generation_number == 9) // PC_SAMPLING_GFX9 // tma->host_trap_buffers Offsets: // [0x00] uint64_t buf_write_val; @@ -348,6 +409,26 @@ trap_entry: s_addc_u32 ttmp5, ttmp15, ttmp5 // buffer0 or buffer1 s_mov_b32 ttmp7, ttmp2 + .if .amdgcn.gfx_generation_number == 9 + + .if .amdgcn.gfx_generation_minor >= 4 + // Check if it's a stochastic trap + s_bitcmp1_b32 ttmp13, TTMP13_PCS_IS_STOCHASTIC + s_cbranch_scc1 .fill_sample_stochastic + // Check if it's a host trap + s_bitcmp1_b32 ttmp13, TTMP13_PCS_IS_HOSTTRAP + s_cbranch_scc1 .fill_sample_hosttrap +.else + // Check if it's a host trap + s_bitcmp1_b32 ttmp11, TTMP11_PCS_IS_HOSTTRAP + s_cbranch_scc1 .fill_sample_hosttrap + +.endif +.endif + // If neither bit is set, this is unexpected. + // This branch is not expected to be taken. + s_branch .no_skip_debugtrap + // ttmp7 contains local_entry, ttmp[4:5] contains "&bufferX", // ttmp[14:15] holds 'tma->host_trap_buffers' pointer // ttmp[2:3] and ttmp13 are available for gathering perf sample info @@ -381,7 +462,7 @@ trap_entry: // buf->timestamp = s_memrealtime; // buf->correlation_id = get_correlation_id(); // } - +.fill_sample_hosttrap: s_mul_i32 ttmp2, ttmp7, 0x40 // offset into buffer for 64B objects s_mul_hi_u32 ttmp3, ttmp7, 0x40 // ttmp[2:3] will contain byte ... 
+ s_add_u32 ttmp2, ttmp2, ttmp4 @@ -401,19 +482,56 @@ trap_entry: .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4) s_getreg_b32 ttmp4, hwreg(HW_REG_XCC_ID) //store XCC_ID s_lshl_b32 ttmp4, ttmp4, 8 - s_and_b32 ttmp5, ttmp11, 0x3f + s_and_b32 ttmp5, ttmp11, TTMP11_WAVE_IN_WG_MASK s_or_b32 ttmp4, ttmp4, ttmp5 s_store_dword ttmp4, ttmp[2:3], 0x1c // store wave_in_wg .else s_and_b32 ttmp4, ttmp11, 0x3f s_store_dword ttmp4, ttmp[2:3], 0x1c // store wave_in_wg .endif - // Get HW_ID using S_GETREG_B32 with size=32 (F8 in upper bits), offset=0, and HW_ID = 4 (0x4) s_getreg_b32 ttmp4, hwreg(HW_REG_HW_ID) s_store_dword ttmp4, ttmp[2:3], 0x20 // store HW_ID - // ttmp[2:3] = &buffer[local_entry]; ttmp[4:5], ttmp7, and ttmp13 are free - // ttmp[14:15] = tma->host_trap_buffers and is live out; ttmp6.b31 is buf_to_use, 0 or 1 + s_branch .get_correlation_id + +.if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4 +.fill_sample_stochastic: + s_mul_i32 ttmp2, ttmp7, 0x40 // offset into buffer for 64B objects + s_mul_hi_u32 ttmp3, ttmp7, 0x40 + s_add_u32 ttmp2, ttmp2, ttmp4 + s_addc_u32 ttmp3, ttmp3, ttmp5 // ttmp[2:3]=&buffer[local_entry] + s_memrealtime ttmp[4:5] + s_waitcnt lgkmcnt(0) // Wait for timestamp + s_store_dwordx2 ttmp[4:5], ttmp[2:3] 0x30 // Store timestamp + + s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA) + s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA1) + s_store_dwordx2 ttmp[4:5], ttmp[2:3], 0x24 // store snapshot data + s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_LO) + s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_HI) + s_store_dwordx2 ttmp[4:5], ttmp[2:3] 0x00 // store snapshot PC + + s_mov_b32 ttmp6, exec_lo + s_store_dword ttmp6, ttmp[2:3], 0x8 // store EXEC_LO + s_mov_b32 ttmp6, exec_hi + s_store_dword ttmp6, ttmp[2:3], 0xc // store EXEC_HI + + s_store_dwordx2 ttmp[8:9], ttmp[2:3], 0x10 // store wg_id_x and wg_id_y + s_store_dword ttmp10, ttmp[2:3], 0x18 // store wg_id_z + 
s_getreg_b32 ttmp4, hwreg(HW_REG_XCC_ID) + s_lshl_b32 ttmp4, ttmp4, 8 + s_and_b32 ttmp5, ttmp11, TTMP11_WAVE_IN_WG_MASK + s_or_b32 ttmp4, ttmp4, ttmp5 + s_store_dword ttmp4, ttmp[2:3], 0x1c // store chiplet_and_wave_id + s_getreg_b32 ttmp4, hwreg(HW_REG_HW_ID) + s_store_dword ttmp4, ttmp[2:3], 0x20 // store HW_ID + // ttmp[2:3]=&buffer[local_entry]; ttmp[4:5], ttmp[6:7] are free + // ttmp[14:15]=ptr to ‘tma’ and is live out; ttmp11.b31 is buf_to_use, 0 or 1 + s_branch .get_correlation_id + +.endif + +.get_correlation_id: // get_correlation_id() -- begin // // Returns a value to use as a correlation ID. @@ -437,6 +555,7 @@ trap_entry: // ttmp[4:5], ttmp7, and ttmp13 are free // ttmp[14:15] = tma->host_trap_buffers and is live out // ttmp6.b31 is buf_to_use, 0 or 1 and is live out + s_mov_b64 ttmp[4:5], exec // back up EXEC mask s_mov_b32 exec_lo, 0x80000000 // prepare EXEC for doorbell spin s_sendmsg sendmsg(MSG_GET_DOORBELL) // message 10, puts doorbell in EXEC @@ -519,7 +638,6 @@ trap_entry: s_getreg_b32 ttmp3, hwreg(HW_REG_MODE, SQ_WAVE_MODE_EXCP_EN_SHIFT, SQ_WAVE_MODE_EXCP_EN_SIZE) // ttmp3[7:0] = MODE.EXCP_EN // Set bits corresponding to TRAPSTS.MEM_VIOL, TRAPSTS.ILLEGAL_INST and TRAPSTS.XNACK_ERROR s_or_b32 ttmp3, ttmp3, (1 << SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT | 1 << SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT | 1 << SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT) - s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) s_and_b32 ttmp2, ttmp2, ttmp3 // SCC will be 1 if either a maskable instruction was set, or one of MEM_VIOL, ILL_INST, XNACK_ERROR s_cbranch_scc1 .no_skip_debugtrap // if any of those are set, handle exceptions @@ -539,6 +657,7 @@ trap_entry: .no_skip_debugtrap: // Save trap id and halt status in ttmp6. s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK) + s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE s_min_u32 ttmp2, ttmp2, 0xF s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT s_or_b32 ttmp6, ttmp6, ttmp2