rocr: Adding support for Stochastic PC Sampling for gfx94x (#47)
Change-Id: Ide4c2e25b88f1f25ea4ce35a619b93963c0355ee
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
a9f6bc8d0e
Коммит
322a794cf6
@@ -508,24 +508,20 @@ class GpuAgent : public GpuAgentInt {
|
||||
|
||||
// @brief Binds the second-level trap handler to this node.
|
||||
void BindTrapHandler();
|
||||
hsa_status_t UpdateTrapHandlerWithPCS(void* pcs_hosttrap_buffers, void* stochastic_hosttrap_buffers);
|
||||
|
||||
// @brief Override from core::Agent.
|
||||
hsa_status_t EnableDmaProfiling(bool enable) override;
|
||||
|
||||
hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb,
|
||||
void* cb_data) override;
|
||||
hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session) override;
|
||||
hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session);
|
||||
hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId,
|
||||
pcs::PcsRuntime::PcSamplingSession& session) override;
|
||||
hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) override;
|
||||
hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session) override;
|
||||
hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session) override;
|
||||
hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session) override;
|
||||
hsa_status_t PcSamplingFlushHostTrapDeviceBuffers(pcs::PcsRuntime::PcSamplingSession& session);
|
||||
|
||||
static void PcSamplingThreadRun(void* agent);
|
||||
void PcSamplingThread();
|
||||
pcs::PcsRuntime::PcSamplingSession& session);
|
||||
hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session);
|
||||
hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session);
|
||||
hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session);
|
||||
hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session);
|
||||
hsa_status_t PcSamplingFlushDeviceBuffers(pcs::PcsRuntime::PcSamplingSession& session);
|
||||
|
||||
// @brief Node properties.
|
||||
const HsaNodeProperties properties_;
|
||||
@@ -749,13 +745,13 @@ class GpuAgent : public GpuAgentInt {
|
||||
uint8_t reserved1[16];
|
||||
/* pc_sample_t buffer0[buf_size]; */
|
||||
/* pc_sample_t buffer1[buf_size]; */
|
||||
} pcs_hosttrap_sampling_data_t;
|
||||
} pcs_sampling_data_t;
|
||||
|
||||
typedef struct {
|
||||
/* Hosttrap data - stored on device so that trap_handler code can access efficiently */
|
||||
pcs_hosttrap_sampling_data_t* device_data;
|
||||
/* Sampling data - stored on device for trap handler access */
|
||||
pcs_sampling_data_t* device_data;
|
||||
|
||||
/* Hosttrap host buffer - stored on host */
|
||||
/* Sampling host buffer - stored on host */
|
||||
uint8_t* host_buffer;
|
||||
size_t host_buffer_size;
|
||||
uint8_t* host_buffer_wrap_pos;
|
||||
@@ -774,11 +770,16 @@ class GpuAgent : public GpuAgentInt {
|
||||
|
||||
os::Thread thread;
|
||||
pcs::PcsRuntime::PcSamplingSession* session;
|
||||
} pcs_hosttrap_t;
|
||||
|
||||
pcs_hosttrap_t pcs_hosttrap_data_;
|
||||
} pcs_data_t;
|
||||
/* PC Sampling fields - end */
|
||||
|
||||
hsa_status_t UpdateTrapHandlerWithPCS(pcs_sampling_data_t* pcs_hosttrap_buffers,
|
||||
pcs_sampling_data_t* pcs_stochastic_buffers);
|
||||
|
||||
// @brief Thread function to process PC sampling data collected via host-trap
|
||||
// or Stochastic sampling.
|
||||
void PcSamplingThread(pcs_data_t& pcs_data, const char* thread_name);
|
||||
|
||||
// @brief device handle
|
||||
amdgpu_device_handle ldrm_dev_;
|
||||
|
||||
@@ -793,6 +794,12 @@ class GpuAgent : public GpuAgentInt {
|
||||
|
||||
bool uses_rec_sdma_eng_id_mask_;
|
||||
|
||||
// structure for host trap sampling
|
||||
pcs_data_t pcs_hosttrap_data_;
|
||||
|
||||
// structure for stochastic sampling
|
||||
pcs_data_t pcs_stochastic_data_;
|
||||
|
||||
// @brief XGMI CPU<->GPU
|
||||
bool xgmi_cpu_gpu_;
|
||||
};
|
||||
|
||||
@@ -116,6 +116,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
|
||||
[this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }),
|
||||
trap_handler_tma_region_(NULL),
|
||||
pcs_hosttrap_data_(),
|
||||
pcs_stochastic_data_(),
|
||||
xgmi_cpu_gpu_(false) {
|
||||
const bool is_apu_node = (properties_.NumCPUCores > 0);
|
||||
profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE;
|
||||
@@ -2166,7 +2167,7 @@ void GpuAgent::SyncClocks() {
|
||||
assert(err == HSAKMT_STATUS_SUCCESS && "hsaGetClockCounters error");
|
||||
}
|
||||
|
||||
hsa_status_t GpuAgent::UpdateTrapHandlerWithPCS(void* pcs_hosttrap_buffers, void* pcs_stochastic_buffers) {
|
||||
hsa_status_t GpuAgent::UpdateTrapHandlerWithPCS(pcs_sampling_data_t* pcs_hosttrap_buffers, pcs_sampling_data_t* pcs_stochastic_buffers) {
|
||||
// Assemble the trap handler source code.
|
||||
void* tma_addr = nullptr;
|
||||
uint64_t tma_size = 0;
|
||||
@@ -2541,7 +2542,11 @@ hsa_status_t GpuAgent::PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& sess
|
||||
ret = PcSamplingCreateFromId(0, session);
|
||||
if (ret != HSA_STATUS_SUCCESS) return ret;
|
||||
|
||||
// Obtain the sampling information from the session.
|
||||
session.GetHsaKmtSamplingInfo(&sampleInfo);
|
||||
|
||||
// Pass the sampling information to the kernel driver to create PC
|
||||
// sampling session.
|
||||
HSAKMT_STATUS retkmt = hsaKmtPcSamplingCreate(node_id(), &sampleInfo, &thunkId);
|
||||
if (retkmt != HSAKMT_STATUS_SUCCESS) {
|
||||
return (retkmt == HSAKMT_STATUS_KERNEL_ALREADY_OPENED) ? (hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY
|
||||
@@ -2557,114 +2562,133 @@ hsa_status_t GpuAgent::PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& sess
|
||||
|
||||
hsa_status_t GpuAgent::PcSamplingCreateFromId(HsaPcSamplingTraceId ioctlId,
|
||||
pcs::PcsRuntime::PcSamplingSession& session) {
|
||||
pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
|
||||
// Determine the sampling method from the session
|
||||
hsa_ven_amd_pcs_method_kind_t sampling_method = session.method();
|
||||
|
||||
if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
|
||||
// TODO: For now can only have 1 hosttrap session at a time. As a final solution, we want to be
|
||||
// able to support multiple sessions at a time. But this makes the session.HandleSampleData more
|
||||
// complicated if multiple sessions have different buffer sizes.
|
||||
if (ht_data.session) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
pcs_data_t* pcs_data = nullptr;
|
||||
|
||||
// This is current amd_aql_queue->pm4_ib_size_b_
|
||||
ht_data.cmd_data_sz = 0x1000;
|
||||
ht_data.cmd_data = (uint32_t*)malloc(ht_data.cmd_data_sz);
|
||||
assert(ht_data.cmd_data);
|
||||
if (sampling_method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
|
||||
pcs_data = &pcs_hosttrap_data_;
|
||||
} else if (sampling_method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
|
||||
pcs_data = &pcs_stochastic_data_;
|
||||
} else {
|
||||
// Unsupported sampling method
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
|
||||
if (HSA::hsa_signal_create(1, 0, NULL, &ht_data.exec_pm4_signal) != HSA_STATUS_SUCCESS)
|
||||
return HSA_STATUS_ERROR;
|
||||
// Ensure only one session is active at a time for the given method
|
||||
if (pcs_data->session)
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES; // TODO: For now, we can only have
|
||||
// 1 pc sampling session at a
|
||||
// time. As a final solution, we
|
||||
// want to be able to support
|
||||
// multiple sessions at a time.
|
||||
// But this makes the
|
||||
// session.HandleSampleData more
|
||||
// complicated if multiple
|
||||
// sessions have different buffer
|
||||
// sizes.
|
||||
|
||||
ht_data.old_val = (uint64_t*)system_allocator()(sizeof(uint64_t), 0x1000, 0);
|
||||
assert(ht_data.old_val);
|
||||
// This is current amd_aql_queue->pm4_ib_size_b_
|
||||
pcs_data->cmd_data_sz = 0x1000; // 4KB
|
||||
pcs_data->cmd_data = (uint32_t*)malloc(pcs_data->cmd_data_sz);
|
||||
if (!pcs_data->cmd_data) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
|
||||
if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, ht_data.old_val))
|
||||
return HSA_STATUS_ERROR;
|
||||
if (HSA::hsa_signal_create(1, 0, NULL, &pcs_data->exec_pm4_signal) != HSA_STATUS_SUCCESS)
|
||||
return HSA_STATUS_ERROR;
|
||||
|
||||
// Local copy of hosttrap data - we cannot access device memory directly on non-large BAR
|
||||
// systems
|
||||
pcs_hosttrap_sampling_data_t* device_datahost =
|
||||
(pcs_hosttrap_sampling_data_t*)system_allocator()(sizeof(*device_datahost), 0x1000, 0);
|
||||
if (!device_datahost) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
pcs_data->old_val = (uint64_t*)system_allocator()(sizeof(uint64_t), 0x1000, 0);
|
||||
if (!pcs_data->old_val) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
|
||||
MAKE_SCOPE_GUARD([&]() { system_deallocator()(device_datahost); });
|
||||
if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, pcs_data->old_val))
|
||||
return HSA_STATUS_ERROR;
|
||||
|
||||
memset(device_datahost, 0, sizeof(*device_datahost));
|
||||
// Local copy of pc sampling data - we cannot access device memory directly on non-large BAR
|
||||
// systems
|
||||
pcs_sampling_data_t* device_datahost =
|
||||
(pcs_sampling_data_t*)system_allocator()(sizeof(pcs_sampling_data_t), 0x1000, 0);
|
||||
if (!device_datahost) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
|
||||
if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, device_datahost) !=
|
||||
HSA_STATUS_SUCCESS)
|
||||
return HSA_STATUS_ERROR;
|
||||
MAKE_SCOPE_GUARD([&]() { system_deallocator()(device_datahost); });
|
||||
|
||||
MAKE_NAMED_SCOPE_GUARD(freeHostTrapResources, [&]() {
|
||||
if (ht_data.device_data) {
|
||||
if (ht_data.device_data->done_sig0.handle)
|
||||
HSA::hsa_signal_destroy(ht_data.device_data->done_sig0);
|
||||
if (ht_data.device_data->done_sig1.handle)
|
||||
HSA::hsa_signal_destroy(ht_data.device_data->done_sig1);
|
||||
memset(device_datahost, 0, sizeof(*device_datahost));
|
||||
|
||||
finegrain_deallocator()(ht_data.device_data);
|
||||
}
|
||||
if (ht_data.host_buffer) system_deallocator()(ht_data.host_buffer);
|
||||
});
|
||||
if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, device_datahost) !=
|
||||
HSA_STATUS_SUCCESS)
|
||||
return HSA_STATUS_ERROR;
|
||||
|
||||
// Force creation of the PC Sampling queue to trigger an exception early in case we exceed max available
|
||||
// CP queues on this agent
|
||||
queues_[QueuePCSampling].touch();
|
||||
MAKE_NAMED_SCOPE_GUARD(freeResources, [&]() {
|
||||
if (pcs_data->device_data) {
|
||||
if (pcs_data->device_data->done_sig0.handle)
|
||||
HSA::hsa_signal_destroy(pcs_data->device_data->done_sig0);
|
||||
if (pcs_data->device_data->done_sig1.handle)
|
||||
HSA::hsa_signal_destroy(pcs_data->device_data->done_sig1);
|
||||
|
||||
/*
|
||||
* When calling queue->ExecutePM4(), the Indirect Buffer size is 0x1000 bytes (1024 DW).
|
||||
* The maximum indirect buffer size we need occurs when we enqueue the
|
||||
* WAIT_REG_MEM, DMA_COPY(s), WRITE_DATA ops:
|
||||
* For WAIT_REG_MEM = 7 DW
|
||||
* For each DMA_COPY = 7 DW
|
||||
* For WRITE_DATA_CMD = 6 DW
|
||||
*
|
||||
* So maximum number of DMA_COPY ops is:
|
||||
* (MAX_IB_SIZE - sizeof(WAIT_REG_MEM) - sizeof(WRITE_DATA_CMD)) / sizeof(DMA_COPY)
|
||||
* (1024 - 7 - 6) / 7 = 144
|
||||
*
|
||||
* Each DMA_COPY op can transfer (1 << 26) bytes (64 MB), so 144 ops cover 9 GB. trap_buffer_size is a 32-bit
|
||||
* number, so the buffer must be < 4 GB. So we are not limited by Indirect Buffer size.
|
||||
* Set current limit to 256 MB to limit device VRAM usage
|
||||
*/
|
||||
const size_t max_trap_buffer_size =
|
||||
core::Runtime::runtime_singleton_->flag().pc_sampling_max_device_buffer_size();
|
||||
finegrain_deallocator()(pcs_data->device_data);
|
||||
}
|
||||
if (pcs_data->host_buffer) system_deallocator()(pcs_data->host_buffer);
|
||||
});
|
||||
|
||||
/*
|
||||
* We use a double-buffer mechanism where there are 2 trap-buffers and 1 host-buffer
|
||||
* Warning: This currently assumes that client latency is smaller than time to fill 1
|
||||
* trap-buffer. If latency is bigger, we have to increase host-buffer
|
||||
*
|
||||
* host-buffer must be >= client-buffer so that we can copy full size of client-buffer each
|
||||
* time. To avoid having to deal with wrap-arounds, host-buffer must be a multiple of
|
||||
* trap-buffers
|
||||
*
|
||||
* if client-buffer size is greater than 2x max_trap_buffer_size:
|
||||
* We are limited by max_trap_buffer_size.
|
||||
* trap-buffer = max-trap-buffer-size
|
||||
* host-buffer = 2*smallest size greater than client-buffer but multiple of 1 trap-buffer
|
||||
* else:
|
||||
* We reduce the trap-buffers so that:
|
||||
* trap-buffer = half of user-buffer
|
||||
* host-buffer = 2*user-buffer
|
||||
*
|
||||
* TODO: We are currently using a temporary host-buffer so that we can increase host-buffer to
|
||||
* factor in client latency. Using a direct-copy to the client buffer would be more efficient.
|
||||
* Revisit this once we have empirical data of latency vs how long it takes to fill 1
|
||||
* trap-buffer.
|
||||
*/
|
||||
// Force creation of the PC Sampling queue to trigger an exception early in case we exceed max available
|
||||
// CP queues on this agent
|
||||
queues_[QueuePCSampling].touch();
|
||||
|
||||
size_t trap_buffer_size = 0;
|
||||
if (session.buffer_size() > 2 * max_trap_buffer_size) {
|
||||
trap_buffer_size = max_trap_buffer_size;
|
||||
ht_data.host_buffer_size = 2 * AlignUp(session.buffer_size(), trap_buffer_size);
|
||||
/*
|
||||
* When calling queue->ExecutePM4(), the Indirect Buffer size is 0x1000 bytes (1024 DW).
|
||||
* The maximum indirect buffer size we need occurs when we enqueue the
|
||||
* WAIT_REG_MEM, DMA_COPY(s), WRITE_DATA ops:
|
||||
* For WAIT_REG_MEM = 7 DW
|
||||
* For each DMA_COPY = 7 DW
|
||||
* For WRITE_DATA_CMD = 6 DW
|
||||
*
|
||||
* So maximum number of DMA_COPY ops is:
|
||||
* (MAX_IB_SIZE - sizeof(WAIT_REG_MEM) - sizeof(WRITE_DATA_CMD)) / sizeof(DMA_COPY)
|
||||
* (1024 - 7 - 6) / 7 = 144
|
||||
*
|
||||
* Each DMA_COPY op can transfer (1 << 26) bytes (64 MB), so 144 ops cover 9 GB. trap_buffer_size is a 32-bit
|
||||
* number, so the buffer must be < 4 GB. So we are not limited by Indirect Buffer size.
|
||||
* Set current limit to 256 MB to limit device VRAM usage
|
||||
*/
|
||||
const size_t max_trap_buffer_size =
|
||||
core::Runtime::runtime_singleton_->flag().pc_sampling_max_device_buffer_size();
|
||||
|
||||
/*
|
||||
* We use a double-buffer mechanism where there are 2 trap-buffers and 1 host-buffer
|
||||
* Warning: This currently assumes that client latency is smaller than time to fill 1
|
||||
* trap-buffer. If latency is bigger, we have to increase host-buffer
|
||||
*
|
||||
* host-buffer must be >= client-buffer so that we can copy full size of client-buffer each
|
||||
* time. To avoid having to deal with wrap-arounds, host-buffer must be a multiple of
|
||||
* trap-buffers
|
||||
*
|
||||
* if client-buffer size is greater than 2x max_trap_buffer_size:
|
||||
* We are limited by max_trap_buffer_size.
|
||||
* trap-buffer = max-trap-buffer-size
|
||||
* host-buffer = 2*smallest size greater than client-buffer but multiple of 1 trap-buffer
|
||||
* else:
|
||||
* We reduce the trap-buffers so that:
|
||||
* trap-buffer = half of user-buffer
|
||||
* host-buffer = 2*user-buffer
|
||||
*
|
||||
* TODO: We are currently using a temporary host-buffer so that we can increase host-buffer to
|
||||
* factor in client latency. Using a direct-copy to the client buffer would be more efficient.
|
||||
* Revisit this once we have empirical data of latency vs how long it takes to fill 1
|
||||
* trap-buffer.
|
||||
*/
|
||||
|
||||
size_t trap_buffer_size = 0;
|
||||
if (session.buffer_size() > 2 * max_trap_buffer_size) {
|
||||
trap_buffer_size = max_trap_buffer_size;
|
||||
pcs_data->host_buffer_size = 2 * AlignUp(session.buffer_size(), trap_buffer_size);
|
||||
} else {
|
||||
trap_buffer_size = session.buffer_size() / 2;
|
||||
ht_data.host_buffer_size = 2 * session.buffer_size();
|
||||
pcs_data->host_buffer_size = 2 * session.buffer_size();
|
||||
}
|
||||
|
||||
ht_data.host_buffer = (uint8_t*)system_allocator()(ht_data.host_buffer_size, 0x1000, 0);
|
||||
if (!ht_data.host_buffer) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
pcs_data->host_buffer = (uint8_t*)system_allocator()(pcs_data->host_buffer_size, 0x1000, 0);
|
||||
if (!pcs_data->host_buffer) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
|
||||
if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, ht_data.host_buffer) !=
|
||||
if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, pcs_data->host_buffer) !=
|
||||
HSA_STATUS_SUCCESS)
|
||||
return HSA_STATUS_ERROR;
|
||||
|
||||
@@ -2682,101 +2706,162 @@ hsa_status_t GpuAgent::PcSamplingCreateFromId(HsaPcSamplingTraceId ioctlId,
|
||||
device_datahost->buf_watermark1 = 0.8 * device_datahost->buf_size;
|
||||
|
||||
// Allocate device memory for 2nd level trap handler TMA
|
||||
size_t deviceAllocSize = sizeof(*ht_data.device_data) + (2 * trap_buffer_size);
|
||||
ht_data.device_data = (pcs_hosttrap_sampling_data_t*)finegrain_allocator()(deviceAllocSize, 0);
|
||||
if (ht_data.device_data == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
size_t deviceAllocSize = sizeof(*pcs_data->device_data) + (2 * trap_buffer_size);
|
||||
pcs_data->device_data = (pcs_sampling_data_t*)finegrain_allocator()(deviceAllocSize, 0);
|
||||
if (pcs_data->device_data == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
|
||||
// This cpuAgent is the owner of the system_allocator() pool
|
||||
auto cpuAgent = GetNearestCpuAgent()->public_handle();
|
||||
hsa_status_t ret = AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, ht_data.device_data);
|
||||
assert(ret == HSA_STATUS_SUCCESS);
|
||||
if (AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, pcs_data->device_data) != HSA_STATUS_SUCCESS)
|
||||
return HSA_STATUS_ERROR;
|
||||
|
||||
if (DmaCopy(ht_data.device_data, device_datahost, sizeof(*device_datahost)) !=
|
||||
if (DmaCopy(pcs_data->device_data, device_datahost, sizeof(*device_datahost)) !=
|
||||
HSA_STATUS_SUCCESS) {
|
||||
debug_print("Failed to dmaCopy!\n");
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
uint8_t* device_buf_ptr =
|
||||
((uint8_t*)ht_data.device_data) + sizeof(pcs_hosttrap_sampling_data_t);
|
||||
if (DmaFill(device_buf_ptr, 0, deviceAllocSize - sizeof(pcs_hosttrap_sampling_data_t)) !=
|
||||
((uint8_t*)pcs_data->device_data) + sizeof(pcs_sampling_data_t);
|
||||
if (DmaFill(device_buf_ptr, 0, deviceAllocSize - sizeof(pcs_sampling_data_t)) !=
|
||||
HSA_STATUS_SUCCESS) {
|
||||
debug_print("Failed to dmaFill!\n");
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
ht_data.lost_sample_count = 0;
|
||||
ht_data.host_buffer_wrap_pos = 0;
|
||||
ht_data.host_write_ptr = ht_data.host_buffer;
|
||||
ht_data.host_read_ptr = ht_data.host_write_ptr;
|
||||
pcs_data->lost_sample_count = 0;
|
||||
pcs_data->host_buffer_wrap_pos = 0;
|
||||
pcs_data->host_write_ptr = pcs_data->host_buffer;
|
||||
pcs_data->host_read_ptr = pcs_data->host_write_ptr;
|
||||
|
||||
ht_data.session = &session;
|
||||
freeHostTrapResources.Dismiss();
|
||||
pcs_data->session = &session;
|
||||
|
||||
if (UpdateTrapHandlerWithPCS(ht_data.device_data, NULL) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR;
|
||||
}
|
||||
if (UpdateTrapHandlerWithPCS(
|
||||
sampling_method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1 ? pcs_data->device_data : nullptr,
|
||||
sampling_method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1
|
||||
? pcs_data->device_data
|
||||
: nullptr) != HSA_STATUS_SUCCESS)
|
||||
return HSA_STATUS_ERROR;
|
||||
|
||||
session.SetThunkId(ioctlId);
|
||||
ht_data.session = &session;
|
||||
session.SetThunkId(ioctlId);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
freeResources.Dismiss();
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t GpuAgent::PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) {
|
||||
if (PcSamplingStop(session) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR;
|
||||
|
||||
pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
|
||||
HSAKMT_STATUS retKmt = hsaKmtPcSamplingDestroy(node_id(), session.ThunkId());
|
||||
ht_data.session = NULL;
|
||||
hsa_ven_amd_pcs_method_kind_t sampling_method = session.method();
|
||||
|
||||
if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
|
||||
free(ht_data.cmd_data);
|
||||
system_deallocator()(ht_data.old_val);
|
||||
HSA::hsa_signal_destroy(ht_data.exec_pm4_signal);
|
||||
HSA::hsa_signal_destroy(ht_data.device_data->done_sig0);
|
||||
HSA::hsa_signal_destroy(ht_data.device_data->done_sig1);
|
||||
finegrain_deallocator()(ht_data.device_data);
|
||||
system_deallocator()(ht_data.host_buffer);
|
||||
pcs_data_t* pcs_data = nullptr;
|
||||
|
||||
ht_data.device_data = NULL;
|
||||
ht_data.host_buffer = NULL;
|
||||
ht_data.session = NULL;
|
||||
|
||||
UpdateTrapHandlerWithPCS(NULL, NULL);
|
||||
if (sampling_method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
|
||||
pcs_data = &pcs_hosttrap_data_;
|
||||
} else if (sampling_method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
|
||||
pcs_data = &pcs_stochastic_data_;
|
||||
} else {
|
||||
// Unsupported sampling method
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
|
||||
// Mark session as inactive
|
||||
pcs_data->session = nullptr;
|
||||
|
||||
free(pcs_data->cmd_data);
|
||||
system_deallocator()(pcs_data->old_val);
|
||||
HSA::hsa_signal_destroy(pcs_data->exec_pm4_signal);
|
||||
HSA::hsa_signal_destroy(pcs_data->device_data->done_sig0);
|
||||
HSA::hsa_signal_destroy(pcs_data->device_data->done_sig1);
|
||||
finegrain_deallocator()(pcs_data->device_data);
|
||||
system_deallocator()(pcs_data->host_buffer);
|
||||
|
||||
pcs_data->device_data = NULL;
|
||||
pcs_data->host_buffer = NULL;
|
||||
pcs_data->session = NULL;
|
||||
|
||||
// Update the trap handler to clear any associated device data
|
||||
UpdateTrapHandlerWithPCS(nullptr, nullptr);
|
||||
|
||||
return (retKmt == HSAKMT_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
hsa_status_t GpuAgent::PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session) {
|
||||
if (session.isActive()) return HSA_STATUS_SUCCESS;
|
||||
|
||||
pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
|
||||
|
||||
auto method = session.method();
|
||||
|
||||
pcs_data_t* pcs_data = nullptr;
|
||||
const char* thread_name = nullptr;
|
||||
if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
|
||||
if (ht_data.session->isActive()) {
|
||||
debug_warning("Already have a Host trap session in progress!");
|
||||
return (hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY;
|
||||
}
|
||||
ht_data.session->start();
|
||||
// This thread will handle all hosttrap sessions on this agent
|
||||
// In the future, there will be another thread to handle stochastic sessions.
|
||||
ht_data.thread = os::CreateThread(PcSamplingThreadRun, (void*)this);
|
||||
if (!ht_data.thread)
|
||||
throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
|
||||
"Failed to start PC Sampling thread.");
|
||||
pcs_data = &pcs_hosttrap_data_;
|
||||
thread_name = "PcSamplingHostTrapThread";
|
||||
} else if (method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
|
||||
pcs_data = &pcs_stochastic_data_;
|
||||
thread_name = "PcSamplingStochasticThread";
|
||||
} else {
|
||||
// Unsupported sampling method
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
|
||||
// Check if a session is already active
|
||||
if (pcs_data->session && pcs_data->session->isActive()) {
|
||||
debug_warning("Already have a PC sampling session in progress!");
|
||||
return (hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY;
|
||||
}
|
||||
|
||||
// Assign the new session and mark it as active
|
||||
pcs_data->session = &session;
|
||||
pcs_data->session->start();
|
||||
|
||||
// Creating thread data
|
||||
struct ThreadData {
|
||||
GpuAgent* agent;
|
||||
pcs_data_t* pcs_data;
|
||||
const char* thread_name;
|
||||
};
|
||||
|
||||
auto* thread_data = new ThreadData{this, pcs_data, thread_name};
|
||||
|
||||
// This thread will handle all PC Sampling sessions on this agent
|
||||
pcs_data->thread = os::CreateThread(
|
||||
[](void* arg) -> void {
|
||||
auto* thread_data = static_cast<ThreadData*>(arg);
|
||||
try {
|
||||
GpuAgent* agent = thread_data->agent;
|
||||
pcs_data_t* pcs_data = thread_data->pcs_data;
|
||||
const char* thread_name = thread_data->thread_name;
|
||||
|
||||
agent->PcSamplingThread(*pcs_data, thread_name);
|
||||
} catch (...) {
|
||||
fprintf(stdout, "Exception caught in PcSamplingThread. Exiting the thread!");
|
||||
}
|
||||
|
||||
delete thread_data;
|
||||
},
|
||||
thread_data);
|
||||
|
||||
if (!pcs_data->thread) {
|
||||
// if thread creation failed
|
||||
delete thread_data;
|
||||
throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
|
||||
"Failed to start PC Sampling thread.");
|
||||
}
|
||||
|
||||
// Start the sampling session in the kernel driver
|
||||
if (hsaKmtPcSamplingStart(node_id(), session.ThunkId()) == HSAKMT_STATUS_SUCCESS)
|
||||
return HSA_STATUS_SUCCESS;
|
||||
|
||||
debug_print("Failed to start PC sampling session with thunkId:%d\n", session.ThunkId());
|
||||
if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
|
||||
ht_data.session->stop();
|
||||
os::WaitForThread(ht_data.thread);
|
||||
os::CloseThread(ht_data.thread);
|
||||
ht_data.thread = NULL;
|
||||
}
|
||||
// Clean up if starting the session failed
|
||||
pcs_data->session->stop();
|
||||
os::WaitForThread(pcs_data->thread);
|
||||
os::CloseThread(pcs_data->thread);
|
||||
pcs_data->thread = nullptr;
|
||||
pcs_data->session = nullptr;
|
||||
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
@@ -2784,35 +2869,51 @@ hsa_status_t GpuAgent::PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& sessi
|
||||
hsa_status_t GpuAgent::PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session) {
|
||||
if (!session.isActive()) return HSA_STATUS_SUCCESS;
|
||||
|
||||
pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
|
||||
|
||||
// Stop the session
|
||||
session.stop();
|
||||
|
||||
// Stop PC sampling in the kernel driver
|
||||
HSAKMT_STATUS retKmt = hsaKmtPcSamplingStop(node_id(), session.ThunkId());
|
||||
if (retKmt != HSAKMT_STATUS_SUCCESS)
|
||||
throw AMD::hsa_exception(HSA_STATUS_ERROR, "Failed to stop PC Sampling session.");
|
||||
|
||||
if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
|
||||
// Wake up pcs_hosttrap_thread_ if it is waiting for data
|
||||
HSA::hsa_signal_store_screlease(ht_data.device_data->done_sig0, -1);
|
||||
HSA::hsa_signal_store_screlease(ht_data.device_data->done_sig1, -1);
|
||||
// Determine the sampling method and corresponding data
|
||||
pcs_data_t* pcs_data = nullptr;
|
||||
auto method = session.method();
|
||||
|
||||
os::WaitForThread(ht_data.thread);
|
||||
os::CloseThread(ht_data.thread);
|
||||
ht_data.thread = NULL;
|
||||
if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
|
||||
pcs_data = &pcs_hosttrap_data_;
|
||||
} else if (method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
|
||||
pcs_data = &pcs_stochastic_data_;
|
||||
} else {
|
||||
// Unsupported sampling method
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
// Wake up the PC sampling thread for this method if it is waiting for data
|
||||
HSA::hsa_signal_store_screlease(pcs_data->device_data->done_sig0, -1);
|
||||
HSA::hsa_signal_store_screlease(pcs_data->device_data->done_sig1, -1);
|
||||
|
||||
// Wait for the thread to finish and clean up
|
||||
os::WaitForThread(pcs_data->thread);
|
||||
os::CloseThread(pcs_data->thread);
|
||||
pcs_data->thread = nullptr;
|
||||
pcs_data->session = nullptr;
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
|
||||
hsa_status_t GpuAgent::PcSamplingFlushDeviceBuffers(
|
||||
pcs::PcsRuntime::PcSamplingSession& session) {
|
||||
pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
|
||||
uint32_t& which_buffer = ht_data.which_buffer;
|
||||
uint32_t* cmd_data = ht_data.cmd_data;
|
||||
size_t& cmd_data_sz = ht_data.cmd_data_sz;
|
||||
uint64_t* old_val = ht_data.old_val;
|
||||
hsa_signal_t& exec_pm4_signal = ht_data.exec_pm4_signal;
|
||||
pcs_data_t* pcs_data = nullptr;
|
||||
|
||||
if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
|
||||
pcs_data = &pcs_hosttrap_data_;
|
||||
} else if (session.method() == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
|
||||
pcs_data = &pcs_stochastic_data_;
|
||||
} else {
|
||||
// No sampling session active
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Device-buffer to Host-buffer to User-Buffer copy logic
|
||||
@@ -2951,19 +3052,33 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
|
||||
|
||||
uint32_t pred_exec_cmd_sz = 0;
|
||||
|
||||
uint8_t* host_buffer_begin = ht_data.host_buffer;
|
||||
uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size;
|
||||
uint64_t buf_write_val;
|
||||
uint64_t buf_written_val[2];
|
||||
size_t buf_offset;
|
||||
uint8_t* buffer[2];
|
||||
size_t buf_size;
|
||||
|
||||
uint64_t buf_write_val = (uint64_t) & (ht_data.device_data->buf_write_val);
|
||||
uint64_t buf_written_val[] = {(uint64_t) & (ht_data.device_data->buf_written_val0),
|
||||
(uint64_t) & (ht_data.device_data->buf_written_val1)};
|
||||
uint32_t& which_buffer = pcs_data->which_buffer;
|
||||
uint32_t* cmd_data = pcs_data->cmd_data;
|
||||
size_t cmd_data_sz = pcs_data->cmd_data_sz;
|
||||
uint64_t* old_val = pcs_data->old_val;
|
||||
hsa_signal_t& exec_pm4_signal = pcs_data->exec_pm4_signal;
|
||||
|
||||
size_t const buf_offset = offsetof(pcs_hosttrap_sampling_data_t, reserved1) +
|
||||
sizeof(((pcs_hosttrap_sampling_data_t*)0)->reserved1);
|
||||
uint8_t* host_buffer_begin = pcs_data->host_buffer;
|
||||
size_t& host_buffer_size = pcs_data->host_buffer_size;
|
||||
uint8_t*& host_write_ptr = pcs_data->host_write_ptr;
|
||||
uint8_t* host_buffer_end = host_buffer_begin + host_buffer_size;
|
||||
|
||||
uint8_t* buffer[] = {(uint8_t*)ht_data.device_data + buf_offset,
|
||||
(uint8_t*)ht_data.device_data + buf_offset +
|
||||
ht_data.device_data->buf_size * session.sample_size()};
|
||||
buf_write_val = reinterpret_cast<uint64_t>(&pcs_data->device_data->buf_write_val);
|
||||
buf_written_val[0] = reinterpret_cast<uint64_t>(&pcs_data->device_data->buf_written_val0);
|
||||
buf_written_val[1] = reinterpret_cast<uint64_t>(&pcs_data->device_data->buf_written_val1);
|
||||
buf_size = pcs_data->device_data->buf_size;
|
||||
|
||||
buf_offset =
|
||||
offsetof(pcs_sampling_data_t, reserved1) + sizeof(((pcs_sampling_data_t*)0)->reserved1);
|
||||
|
||||
buffer[0] = reinterpret_cast<uint8_t*>(pcs_data->device_data) + buf_offset;
|
||||
buffer[1] = buffer[0] + buf_size * session.sample_size();
|
||||
|
||||
next_buffer = (which_buffer + 1) % 2;
|
||||
reset_write_val = (uint64_t)next_buffer << 63;
|
||||
@@ -3022,25 +3137,25 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
|
||||
/* If the number of entries in old_val is larger than buf_size, then there was a buffer overflow
|
||||
* and the 2nd level trap handler code will skip recording samples, causing lost samples
|
||||
*/
|
||||
if (*old_val > (uint64_t)ht_data.device_data->buf_size) {
|
||||
ht_data.lost_sample_count = *old_val - (uint64_t)ht_data.device_data->buf_size;
|
||||
*old_val = (uint64_t)ht_data.device_data->buf_size;
|
||||
if (*old_val > buf_size) {
|
||||
pcs_data->lost_sample_count = *old_val - buf_size;
|
||||
*old_val = buf_size;
|
||||
}
|
||||
|
||||
to_copy = *old_val * session.sample_size();
|
||||
|
||||
/* Make sure there is enough space after host_write_ptr */
|
||||
if (ht_data.host_write_ptr + to_copy >= host_buffer_end) {
|
||||
if (host_write_ptr + to_copy >= host_buffer_end) {
|
||||
// Need to wrap around
|
||||
ht_data.host_buffer_wrap_pos = ht_data.host_write_ptr;
|
||||
ht_data.host_write_ptr = host_buffer_begin;
|
||||
pcs_data->host_buffer_wrap_pos = host_write_ptr;
|
||||
host_write_ptr = host_buffer_begin;
|
||||
}
|
||||
|
||||
i = 0;
|
||||
memset(cmd_data, 0, cmd_data_sz);
|
||||
|
||||
if (properties_.NumXcc > 1) {
|
||||
const uint32_t n = ceil(to_copy / (32 * 1024 * 1024));
|
||||
const uint64_t n = ceil(to_copy / (32 * 1024 * 1024));
|
||||
pred_exec_cmd_sz = 2;
|
||||
cmd_data[i++] = PM4_HDR(PM4_HDR_IT_OPCODE_PRED_EXEC, pred_exec_cmd_sz, isa_->GetMajorVersion());
|
||||
cmd_data[i++] =
|
||||
@@ -3073,7 +3188,8 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
|
||||
unsigned int num_copy_command = 0;
|
||||
uint8_t* buffer_temp = buffer[which_buffer];
|
||||
|
||||
for (copy_bytes = CP_DMA_DATA_TRANSFER_CNT_MAX; 0 < to_copy; to_copy -= copy_bytes) {
|
||||
for (copy_bytes = std::min(to_copy, (uint32_t)CP_DMA_DATA_TRANSFER_CNT_MAX); 0 < to_copy;
|
||||
to_copy -= copy_bytes) {
|
||||
num_copy_command++;
|
||||
|
||||
/* DMA_DATA PACKETS, copy buffer using CPDMA */
|
||||
@@ -3082,9 +3198,8 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
|
||||
PM4_DMA_DATA_SRC_SEL_SRC_ADDR_USING_L2);
|
||||
cmd_data[i++] = PM4_DMA_DATA_DW2_SRC_ADDR_LO((uint64_t)buffer_temp);
|
||||
cmd_data[i++] = PM4_DMA_DATA_DW3_SRC_ADDR_HI(((uint64_t)buffer_temp) >> 32);
|
||||
cmd_data[i++] = PM4_DMA_DATA_DW4_DST_ADDR_LO((uint64_t)ht_data.host_write_ptr);
|
||||
cmd_data[i++] = PM4_DMA_DATA_DW5_DST_ADDR_HI(((uint64_t)ht_data.host_write_ptr) >> 32);
|
||||
|
||||
cmd_data[i++] = PM4_DMA_DATA_DW4_DST_ADDR_LO((uint64_t)host_write_ptr);
|
||||
cmd_data[i++] = PM4_DMA_DATA_DW5_DST_ADDR_HI(((uint64_t)host_write_ptr) >> 32);
|
||||
if (copy_bytes >= to_copy) {
|
||||
copy_bytes = to_copy;
|
||||
cmd_data[i++] =
|
||||
@@ -3093,7 +3208,7 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
|
||||
cmd_data[i++] = PM4_DMA_DATA_DW6(PM4_DMA_DATA_BYTE_COUNT(copy_bytes) | PM4_DMA_DATA_DIS_WC);
|
||||
}
|
||||
buffer_temp += copy_bytes;
|
||||
ht_data.host_write_ptr += copy_bytes;
|
||||
host_write_ptr += copy_bytes;
|
||||
}
|
||||
|
||||
/* WRITE_DATA, Reset buf_written_val */
|
||||
@@ -3117,167 +3232,180 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
|
||||
if (val == 0) break;
|
||||
} while (true);
|
||||
|
||||
// save the position of next buffer
|
||||
which_buffer = next_buffer;
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void GpuAgent::PcSamplingThread() {
|
||||
void GpuAgent::PcSamplingThread(pcs_data_t& pcs_data, const char* thread_name) {
|
||||
// TODO: Implement lost sample count
|
||||
// TODO: Implement latency
|
||||
|
||||
pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
|
||||
pcs::PcsRuntime::PcSamplingSession& session = *ht_data.session;
|
||||
uint32_t& which_buffer = ht_data.which_buffer;
|
||||
try {
|
||||
pcs::PcsRuntime::PcSamplingSession& session = *pcs_data.session;
|
||||
uint32_t& which_buffer = pcs_data.which_buffer;
|
||||
|
||||
uint8_t* host_buffer_begin = ht_data.host_buffer;
|
||||
uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size;
|
||||
uint8_t* host_buffer_begin = pcs_data.host_buffer;
|
||||
uint8_t* host_buffer_end = pcs_data.host_buffer + pcs_data.host_buffer_size;
|
||||
|
||||
hsa_signal_t done_sig[] = {ht_data.device_data->done_sig0, ht_data.device_data->done_sig1};
|
||||
hsa_signal_t done_sig[] = {pcs_data.device_data->done_sig0, pcs_data.device_data->done_sig1};
|
||||
|
||||
while (ht_data.session->isActive()) {
|
||||
do {
|
||||
hsa_signal_value_t val = HSA::hsa_signal_wait_scacquire(
|
||||
done_sig[which_buffer], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
||||
if (val == -1) goto thread_exit;
|
||||
if (val == 0) break;
|
||||
} while (true);
|
||||
HSA::hsa_signal_store_screlease(done_sig[which_buffer], 1);
|
||||
while (pcs_data.session->isActive()) {
|
||||
// Wait for the signal to process the buffer
|
||||
do {
|
||||
hsa_signal_value_t val = HSA::hsa_signal_wait_scacquire(
|
||||
done_sig[which_buffer], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
||||
if (val == -1) goto thread_exit;
|
||||
if (val == 0) break;
|
||||
} while (true);
|
||||
HSA::hsa_signal_store_screlease(done_sig[which_buffer], 1);
|
||||
|
||||
std::lock_guard<std::mutex> lock(ht_data.host_buffer_mutex);
|
||||
if (PcSamplingFlushHostTrapDeviceBuffers(session) != HSA_STATUS_SUCCESS)
|
||||
goto thread_exit;
|
||||
// Lock buffer to ensure thread-safe access
|
||||
std::lock_guard<std::mutex> lock(pcs_data.host_buffer_mutex);
|
||||
// Flush device buffers
|
||||
if (PcSamplingFlushDeviceBuffers(session) != HSA_STATUS_SUCCESS)
|
||||
goto thread_exit;
|
||||
|
||||
size_t bytes_before_wrap;
|
||||
size_t bytes_after_wrap;
|
||||
size_t bytes_before_wrap;
|
||||
size_t bytes_after_wrap;
|
||||
|
||||
assert(ht_data.host_read_ptr >= host_buffer_begin && ht_data.host_read_ptr < host_buffer_end);
|
||||
assert(ht_data.host_write_ptr >= host_buffer_begin && ht_data.host_write_ptr < host_buffer_end);
|
||||
assert(ht_data.host_buffer_wrap_pos ? (ht_data.host_read_ptr > ht_data.host_write_ptr)
|
||||
: (ht_data.host_read_ptr <= ht_data.host_write_ptr));
|
||||
assert(pcs_data.host_read_ptr >= host_buffer_begin && pcs_data.host_read_ptr < host_buffer_end);
|
||||
assert(pcs_data.host_write_ptr >= host_buffer_begin && pcs_data.host_write_ptr < host_buffer_end);
|
||||
assert(pcs_data.host_buffer_wrap_pos ? (pcs_data.host_read_ptr > pcs_data.host_write_ptr)
|
||||
: (pcs_data.host_read_ptr <= pcs_data.host_write_ptr));
|
||||
|
||||
if (ht_data.host_buffer_wrap_pos) {
|
||||
assert(ht_data.host_buffer_wrap_pos <= host_buffer_end &&
|
||||
ht_data.host_buffer_wrap_pos > host_buffer_begin);
|
||||
assert(ht_data.host_read_ptr <= ht_data.host_buffer_wrap_pos);
|
||||
if (pcs_data.host_buffer_wrap_pos) {
|
||||
assert(pcs_data.host_buffer_wrap_pos <= host_buffer_end &&
|
||||
pcs_data.host_buffer_wrap_pos > host_buffer_begin);
|
||||
assert(pcs_data.host_read_ptr <= pcs_data.host_buffer_wrap_pos);
|
||||
|
||||
// Wrapped around
|
||||
bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr;
|
||||
bytes_after_wrap = ht_data.host_write_ptr - host_buffer_begin;
|
||||
// Wrapped around
|
||||
bytes_before_wrap = pcs_data.host_buffer_wrap_pos - pcs_data.host_read_ptr;
|
||||
bytes_after_wrap = pcs_data.host_write_ptr - host_buffer_begin;
|
||||
|
||||
while (bytes_before_wrap >= session.buffer_size()) {
|
||||
session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0,
|
||||
ht_data.lost_sample_count);
|
||||
ht_data.host_read_ptr += session.buffer_size();
|
||||
bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr;
|
||||
ht_data.lost_sample_count = 0;
|
||||
}
|
||||
while (bytes_before_wrap >= session.buffer_size()) {
|
||||
session.HandleSampleData(pcs_data.host_read_ptr, session.buffer_size(), nullptr, 0,
|
||||
pcs_data.lost_sample_count);
|
||||
pcs_data.host_read_ptr += session.buffer_size();
|
||||
bytes_before_wrap = pcs_data.host_buffer_wrap_pos - pcs_data.host_read_ptr;
|
||||
pcs_data.lost_sample_count = 0;
|
||||
}
|
||||
|
||||
if (bytes_before_wrap + bytes_after_wrap >= session.buffer_size()) {
|
||||
session.HandleSampleData(ht_data.host_read_ptr, bytes_before_wrap, host_buffer_begin,
|
||||
(session.buffer_size() - bytes_before_wrap), 0);
|
||||
ht_data.host_read_ptr = host_buffer_begin + (session.buffer_size() - bytes_before_wrap);
|
||||
bytes_before_wrap = 0;
|
||||
ht_data.host_buffer_wrap_pos = 0;
|
||||
bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
|
||||
ht_data.lost_sample_count = 0;
|
||||
}
|
||||
if (bytes_before_wrap + bytes_after_wrap >= session.buffer_size()) {
|
||||
session.HandleSampleData(pcs_data.host_read_ptr, bytes_before_wrap, host_buffer_begin,
|
||||
(session.buffer_size() - bytes_before_wrap), 0);
|
||||
pcs_data.host_read_ptr = host_buffer_begin + (session.buffer_size() - bytes_before_wrap);
|
||||
bytes_before_wrap = 0;
|
||||
pcs_data.host_buffer_wrap_pos = 0;
|
||||
bytes_after_wrap = pcs_data.host_write_ptr - pcs_data.host_read_ptr;
|
||||
pcs_data.lost_sample_count = 0;
|
||||
}
|
||||
|
||||
while (bytes_after_wrap >= session.buffer_size()) {
|
||||
session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0,
|
||||
ht_data.lost_sample_count);
|
||||
ht_data.host_read_ptr += session.buffer_size();
|
||||
bytes_before_wrap = 0;
|
||||
bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
|
||||
ht_data.lost_sample_count = 0;
|
||||
}
|
||||
} else {
|
||||
bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
|
||||
while (bytes_after_wrap >= session.buffer_size()) {
|
||||
session.HandleSampleData(pcs_data.host_read_ptr, session.buffer_size(), nullptr, 0,
|
||||
pcs_data.lost_sample_count);
|
||||
pcs_data.host_read_ptr += session.buffer_size();
|
||||
bytes_before_wrap = 0;
|
||||
bytes_after_wrap = pcs_data.host_write_ptr - pcs_data.host_read_ptr;
|
||||
pcs_data.lost_sample_count = 0;
|
||||
}
|
||||
} else {
|
||||
// Handle non-wrapped buffer
|
||||
bytes_before_wrap = pcs_data.host_write_ptr - pcs_data.host_read_ptr;
|
||||
|
||||
while (bytes_before_wrap >= session.buffer_size()) {
|
||||
assert(ht_data.host_read_ptr >= host_buffer_begin &&
|
||||
ht_data.host_read_ptr + session.buffer_size() < host_buffer_end);
|
||||
session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0,
|
||||
ht_data.lost_sample_count);
|
||||
ht_data.host_read_ptr += session.buffer_size();
|
||||
bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
|
||||
ht_data.lost_sample_count = 0;
|
||||
while (bytes_before_wrap >= session.buffer_size()) {
|
||||
assert(pcs_data.host_read_ptr >= host_buffer_begin &&
|
||||
pcs_data.host_read_ptr + session.buffer_size() <= host_buffer_end);
|
||||
session.HandleSampleData(pcs_data.host_read_ptr, session.buffer_size(), nullptr, 0,
|
||||
pcs_data.lost_sample_count);
|
||||
pcs_data.host_read_ptr += session.buffer_size();
|
||||
bytes_before_wrap = pcs_data.host_write_ptr - pcs_data.host_read_ptr;
|
||||
pcs_data.lost_sample_count = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
thread_exit:
|
||||
debug_print("PcSamplingThread::Exiting\n");
|
||||
debug_print("%s::Exiting\n", thread_name);
|
||||
} catch (const std::exception& e) {
|
||||
debug_print("Exception in %s: %s\n", thread_name, e.what());
|
||||
} catch (...) {
|
||||
debug_print("Unknown exception in %s\n", thread_name);
|
||||
}
|
||||
|
||||
void GpuAgent::PcSamplingThreadRun(void* _agent) {
|
||||
GpuAgent* agent = (GpuAgent*)_agent;
|
||||
agent->PcSamplingThread();
|
||||
debug_print("PcSamplingThread exiting...");
|
||||
}
|
||||
|
||||
hsa_status_t GpuAgent::PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session) {
|
||||
pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
|
||||
pcs_data_t* pcs_data = nullptr;
|
||||
|
||||
uint8_t* host_buffer_begin = ht_data.host_buffer;
|
||||
uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size;
|
||||
if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
|
||||
pcs_data = &pcs_hosttrap_data_;
|
||||
} else if (session.method() == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
|
||||
pcs_data = &pcs_stochastic_data_;
|
||||
} else {
|
||||
return HSA_STATUS_SUCCESS; // Unsupported sampling method
|
||||
}
|
||||
|
||||
uint8_t* host_buffer_begin = pcs_data->host_buffer;
|
||||
uint8_t* host_buffer_end = pcs_data->host_buffer + pcs_data->host_buffer_size;
|
||||
|
||||
size_t bytes_before_wrap;
|
||||
size_t bytes_after_wrap;
|
||||
|
||||
std::lock_guard<std::mutex> lock(ht_data.host_buffer_mutex);
|
||||
if (PcSamplingFlushHostTrapDeviceBuffers(session) != HSA_STATUS_SUCCESS)
|
||||
return HSA_STATUS_ERROR;
|
||||
std::lock_guard<std::mutex> lock(pcs_data->host_buffer_mutex);
|
||||
// Flush device buffers
|
||||
if (PcSamplingFlushDeviceBuffers(session) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR;
|
||||
|
||||
assert(ht_data.host_read_ptr >= host_buffer_begin && ht_data.host_read_ptr < host_buffer_end);
|
||||
assert(ht_data.host_write_ptr >= host_buffer_begin && ht_data.host_write_ptr < host_buffer_end);
|
||||
assert(ht_data.host_buffer_wrap_pos ? (ht_data.host_read_ptr > ht_data.host_write_ptr)
|
||||
: (ht_data.host_read_ptr <= ht_data.host_write_ptr));
|
||||
assert(pcs_data->host_read_ptr >= host_buffer_begin && pcs_data->host_read_ptr < host_buffer_end);
|
||||
assert(pcs_data->host_write_ptr >= host_buffer_begin &&
|
||||
pcs_data->host_write_ptr < host_buffer_end);
|
||||
assert(pcs_data->host_buffer_wrap_pos ? (pcs_data->host_read_ptr > pcs_data->host_write_ptr)
|
||||
: (pcs_data->host_read_ptr <= pcs_data->host_write_ptr));
|
||||
|
||||
if (ht_data.host_buffer_wrap_pos) {
|
||||
assert(ht_data.host_buffer_wrap_pos <= host_buffer_end &&
|
||||
ht_data.host_buffer_wrap_pos > host_buffer_begin);
|
||||
assert(ht_data.host_read_ptr <= ht_data.host_buffer_wrap_pos);
|
||||
if (pcs_data->host_buffer_wrap_pos) {
|
||||
assert(pcs_data->host_buffer_wrap_pos <= host_buffer_end &&
|
||||
pcs_data->host_buffer_wrap_pos > host_buffer_begin);
|
||||
assert(pcs_data->host_read_ptr <= pcs_data->host_buffer_wrap_pos);
|
||||
|
||||
// Wrapped around
|
||||
bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr;
|
||||
bytes_after_wrap = ht_data.host_write_ptr - host_buffer_begin;
|
||||
// Handle wrapped-around buffer
|
||||
bytes_before_wrap = pcs_data->host_buffer_wrap_pos - pcs_data->host_read_ptr;
|
||||
bytes_after_wrap = pcs_data->host_write_ptr - host_buffer_begin;
|
||||
|
||||
while (bytes_before_wrap > 0) {
|
||||
size_t bytes_to_copy = std::min(bytes_before_wrap, session.buffer_size());
|
||||
|
||||
session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0,
|
||||
ht_data.lost_sample_count);
|
||||
ht_data.host_read_ptr += bytes_to_copy;
|
||||
bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr;
|
||||
ht_data.lost_sample_count = 0;
|
||||
session.HandleSampleData(pcs_data->host_read_ptr, bytes_to_copy, nullptr, 0,
|
||||
pcs_data->lost_sample_count);
|
||||
pcs_data->host_read_ptr += bytes_to_copy;
|
||||
bytes_before_wrap = pcs_data->host_buffer_wrap_pos - pcs_data->host_read_ptr;
|
||||
pcs_data->lost_sample_count = 0;
|
||||
}
|
||||
|
||||
assert(ht_data.host_read_ptr == ht_data.host_buffer_wrap_pos);
|
||||
ht_data.host_buffer_wrap_pos = 0;
|
||||
ht_data.host_read_ptr = host_buffer_begin;
|
||||
assert(pcs_data->host_read_ptr == pcs_data->host_buffer_wrap_pos);
|
||||
pcs_data->host_buffer_wrap_pos = 0;
|
||||
pcs_data->host_read_ptr = host_buffer_begin;
|
||||
|
||||
while (bytes_after_wrap > 0) {
|
||||
size_t bytes_to_copy = std::min(bytes_after_wrap, session.buffer_size());
|
||||
|
||||
session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0,
|
||||
ht_data.lost_sample_count);
|
||||
ht_data.host_read_ptr += bytes_to_copy;
|
||||
bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
|
||||
ht_data.lost_sample_count = 0;
|
||||
session.HandleSampleData(pcs_data->host_read_ptr, bytes_to_copy, nullptr, 0,
|
||||
pcs_data->lost_sample_count);
|
||||
pcs_data->host_read_ptr += bytes_to_copy;
|
||||
bytes_after_wrap = pcs_data->host_write_ptr - pcs_data->host_read_ptr;
|
||||
pcs_data->lost_sample_count = 0;
|
||||
}
|
||||
} else {
|
||||
bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
|
||||
bytes_before_wrap = pcs_data->host_write_ptr - pcs_data->host_read_ptr;
|
||||
|
||||
while (bytes_before_wrap) {
|
||||
while (bytes_before_wrap > 0) {
|
||||
size_t bytes_to_copy = std::min(bytes_before_wrap, session.buffer_size());
|
||||
assert(ht_data.host_read_ptr >= host_buffer_begin &&
|
||||
ht_data.host_read_ptr + bytes_to_copy <= host_buffer_end);
|
||||
assert(pcs_data->host_read_ptr >= host_buffer_begin &&
|
||||
pcs_data->host_read_ptr + bytes_to_copy <= host_buffer_end);
|
||||
|
||||
session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0,
|
||||
ht_data.lost_sample_count);
|
||||
ht_data.host_read_ptr += bytes_to_copy;
|
||||
bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
|
||||
ht_data.lost_sample_count = 0;
|
||||
session.HandleSampleData(pcs_data->host_read_ptr, bytes_to_copy, nullptr, 0,
|
||||
pcs_data->lost_sample_count);
|
||||
pcs_data->host_read_ptr += bytes_to_copy;
|
||||
bytes_before_wrap = pcs_data->host_write_ptr - pcs_data->host_read_ptr;
|
||||
pcs_data->lost_sample_count = 0;
|
||||
}
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
|
||||
@@ -53,6 +53,7 @@
|
||||
.set SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT , 11
|
||||
.set SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT , 28
|
||||
.set SQ_WAVE_TRAPSTS_MATH_EXCP , 0x7F
|
||||
.set SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT , 26
|
||||
.set SQ_WAVE_MODE_EXCP_EN_SHIFT , 12
|
||||
.set SQ_WAVE_MODE_EXCP_EN_SIZE , 8
|
||||
.set TRAP_ID_ABORT , 2
|
||||
@@ -95,12 +96,23 @@
|
||||
// TTMP_REG1 means ttmp6 register if gfx>=942 and means ttmp13 register if gfx<942
|
||||
// TTMP_REG2 means ttmp11 register if gfx>=942 and means ttmp6 register if gfx<942
|
||||
|
||||
.if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4
|
||||
.if .amdgcn.gfx_generation_number == 9
|
||||
.set TTMP11_TTMPS_SETUP_SHIFT , 31
|
||||
|
||||
// Bit to indicate that this is a hosttrap trap instead of stochastic trap
|
||||
// Currently not used
|
||||
.set TTMP13_PCS_IS_STOCHASTIC , 24
|
||||
.if (.amdgcn.gfx_generation_minor >= 4)
|
||||
.set TTMP11_WAVE_IN_WG_MASK , 0x3F
|
||||
|
||||
// Bit to indicate that this is a stochastic trap
|
||||
.set TTMP13_PCS_IS_STOCHASTIC , 21
|
||||
|
||||
// Bit to indicate that this is a host trap
|
||||
.set TTMP13_PCS_IS_HOSTTRAP , 22
|
||||
|
||||
.else
|
||||
|
||||
// Bit to indicate that this is a host trap
|
||||
.set TTMP11_PCS_IS_HOSTTRAP , 22
|
||||
.endif
|
||||
.endif
|
||||
|
||||
.if (.amdgcn.gfx_generation_number == 9)
|
||||
@@ -205,27 +217,45 @@
|
||||
// ttmp15 = TMA[63:32]
|
||||
// gfx9:
|
||||
// ttmp1 = 0[2:0], PCRewind[3:0], HostTrap[0], TrapId[7:0], PC[47:32]
|
||||
// all gfx9 (except gfx942):
|
||||
// For all gfx9 (except gfx940, gfx941, gfx942):
|
||||
// ttmp6 = 0[6:0], DispatchPktIndx[24:0]
|
||||
// ttmp11 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveInWg[5:0]
|
||||
// Note: Once stochastic sampling is implemented, L2 Trap Handler will use Bit 23
|
||||
// (TTMP11_PCS_IS_STOCHASTIC) to differentiate between stochastic and hosttrap
|
||||
// gfx942:
|
||||
//
|
||||
// For gfx940/gfx941/gfx942:
|
||||
// ttmp11 = 0[0], DispatchPktIndx[24:0], WaveIdInWg[5:0]
|
||||
// ttmp13 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[22:0]
|
||||
// ttmp13:
|
||||
// Bits 31:26 : SQ_WAVE_IB_STS[20:15] (1TH)
|
||||
// 25:24 : 0 on 2TH entry. Used by 1st level TH but also
|
||||
// free to be used in the 2nd level TH
|
||||
// 23 : Debug Enabled (1TH)
|
||||
// 22:0 : values are unspecified on 2TH entry. Free.
|
||||
//
|
||||
// gfx10:
|
||||
// ttmp1 = 0[0], PCRewind[5:0], HostTrap[0], TrapId[7:0], PC[47:32]
|
||||
//
|
||||
// gfx10/gfx11:
|
||||
// ttmp6 = 0[6:0], DispatchPktIndx[24:0]
|
||||
//
|
||||
// gfx1010:
|
||||
// ttmp11 = SQ_WAVE_IB_STS[25], SQ_WAVE_IB_STS[21:15], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0]
|
||||
//
|
||||
// gfx1030/gfx1100:
|
||||
// ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0]
|
||||
//
|
||||
// ttmp[14:15] points to TMA2; Available: ttmp[2:3], ttmp[4:5]
|
||||
//
|
||||
// ttmp7 : gfx9, gfx1010, gfx1030, gfx11 - 31:0 : PC[31:0] (2TH, DBG);
|
||||
// : gfx940 - free;
|
||||
// : gfx12 - ttmp7 - 31:16 : workgroup_z[15:0] (SPI) and 15:0 : workgroup_y[15:0] (SPI)
|
||||
|
||||
trap_entry:
|
||||
// Branch if not a trap (an exception instead).
|
||||
s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE
|
||||
s_cbranch_scc0 .no_skip_debugtrap
|
||||
// Extract trap_id from ttmp2
|
||||
s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE
|
||||
s_cbranch_scc0 .not_s_trap // If trap_id == 0, it's not an s_trap nor host trap
|
||||
|
||||
// Check if it was a host trap.
|
||||
s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT
|
||||
s_cbranch_scc0 .not_host_trap
|
||||
|
||||
.if (.amdgcn.gfx_generation_number == 9) // PC_SAMPLING_GFX9
|
||||
// ttmp[14:15] is TMA2; Available: ttmp[2:3], ttmp[4:5], ttmp7, TTMP_REG1
|
||||
@@ -236,29 +266,42 @@ trap_entry:
|
||||
// [0x08] out_buf_t* stochastic_trap_buffers;
|
||||
//
|
||||
// --- Start profile trap handlers GFX9 --- //
|
||||
// if (host_trap) {
|
||||
// if (stochastic) // Not implemented yet
|
||||
// ttmp11.bit23 = 1; // Not implemented yet
|
||||
// profiling_trap_handler(tma->host_trap_buffers);
|
||||
// }
|
||||
// If the wave entered the trap handler:
|
||||
// If on gfx9:
|
||||
// - Check SQ_WAVE_PC_HI_HT_SHIFT bit on TTMP1 register to
|
||||
// identify if it was a host trap.
|
||||
// If a host trap is detected:
|
||||
// - Mark TTMP13(gfx94x) or TTMP11(gfx9) hosttrap bit
|
||||
// - Load host_trap_buffers
|
||||
// - Branch to the profile trap handler logic.
|
||||
//
|
||||
// If on gfx9.4+:
|
||||
// - Check TRAPSTS bit 26 (SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT) to
|
||||
// identify stochastic traps.
|
||||
// If a stochastic trap is detected:
|
||||
// - Set bit 21 in TTMP13 to indicate a stochastic trap.
|
||||
// - Branch to the profile trap handler logic.
|
||||
|
||||
s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT
|
||||
s_cbranch_scc0 .not_host_trap_gfx9
|
||||
s_load_dwordx2 ttmp[14:15], ttmp[14:15], 0 glc // ttmp[14:15]=&host_trap_buffers
|
||||
// TODO: When implementing stochastic sampling, need to set TTMP11_PCS_IS_STOCHASTIC
|
||||
// or TTMP13_PCS_IS_STOCHASTIC to differentiate between hosttrap and stochastic sampling
|
||||
s_load_dwordx2 ttmp[2:3], ttmp[14:15], 0 glc // ttmp[2:3]=*host_trap_buffers
|
||||
.if .amdgcn.gfx_generation_minor >= 4
|
||||
s_bitset0_b32 ttmp13, TTMP13_PCS_IS_STOCHASTIC
|
||||
s_bitset1_b32 ttmp13, TTMP13_PCS_IS_HOSTTRAP // set bit 22 in TTMP13
|
||||
.else
|
||||
s_bitset1_b32 ttmp11, TTMP11_PCS_IS_HOSTTRAP // Set bit 22 in TTMP11
|
||||
.endif
|
||||
s_waitcnt lgkmcnt(0)
|
||||
s_branch .profile_trap_handlers_gfx9 // Off to the profile handlers
|
||||
s_mov_b64 ttmp[14:15], ttmp[2:3] //now ttmp[14:15] = host_trap_buffers
|
||||
s_branch .profile_trap_handlers_gfx9 // Off to the profile handlers
|
||||
.else
|
||||
// Ignore host traps. They should be masked by the driver anyway.
|
||||
s_branch .not_s_trap
|
||||
.endif
|
||||
|
||||
.not_host_trap_gfx9:
|
||||
.endif // PC_SAMPLING_GFX9
|
||||
// If caused by s_trap then advance PC.
|
||||
s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT
|
||||
s_cbranch_scc1 .not_s_trap
|
||||
.not_host_trap:
|
||||
// It's an s_trap; advance the PC
|
||||
s_add_u32 ttmp0, ttmp0, 0x4
|
||||
s_addc_u32 ttmp1, ttmp1, 0x0
|
||||
|
||||
.not_s_trap:
|
||||
// If llvm.debugtrap and debugger is not attached.
|
||||
s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP
|
||||
s_cbranch_scc0 .no_skip_debugtrap
|
||||
@@ -272,6 +315,24 @@ trap_entry:
|
||||
// Ignore llvm.debugtrap.
|
||||
s_branch .exit_trap
|
||||
|
||||
.not_s_trap:
|
||||
.if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4
|
||||
//Check for stochastic trap on gfx9.4+
|
||||
s_getreg_b32 ttmp7, hwreg(HW_REG_TRAPSTS) // On gfx94x, TRAPSTS bit 26 ...
|
||||
s_bitcmp1_b32 ttmp7, SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT // is stochastic_sample_trap
|
||||
s_cbranch_scc0 .no_skip_debugtrap
|
||||
|
||||
// Handle stochastic trap
|
||||
s_load_dwordx2 ttmp[2:3], ttmp[14:15], 0x8 glc // ttmp[2:3]=*stoch_trap_buf
|
||||
s_bitset0_b32 ttmp13, TTMP13_PCS_IS_HOSTTRAP
|
||||
s_bitset1_b32 ttmp13, TTMP13_PCS_IS_STOCHASTIC // set bit 21 in TTMP13
|
||||
s_waitcnt lgkmcnt(0)
|
||||
s_mov_b64 ttmp[14:15], ttmp[2:3]
|
||||
s_branch .profile_trap_handlers_gfx9 // Off to the profile handlers
|
||||
.else
|
||||
s_branch .no_skip_debugtrap
|
||||
.endif // PC_SAMPLING_GFX9
|
||||
|
||||
.if (.amdgcn.gfx_generation_number == 9) // PC_SAMPLING_GFX9
|
||||
// tma->host_trap_buffers Offsets:
|
||||
// [0x00] uint64_t buf_write_val;
|
||||
@@ -348,6 +409,26 @@ trap_entry:
|
||||
s_addc_u32 ttmp5, ttmp15, ttmp5 // buffer0 or buffer1
|
||||
s_mov_b32 ttmp7, ttmp2
|
||||
|
||||
.if .amdgcn.gfx_generation_number == 9
|
||||
|
||||
.if .amdgcn.gfx_generation_minor >= 4
|
||||
// Check if it's a stochastic trap
|
||||
s_bitcmp1_b32 ttmp13, TTMP13_PCS_IS_STOCHASTIC
|
||||
s_cbranch_scc1 .fill_sample_stochastic
|
||||
// Check if it's a host trap
|
||||
s_bitcmp1_b32 ttmp13, TTMP13_PCS_IS_HOSTTRAP
|
||||
s_cbranch_scc1 .fill_sample_hosttrap
|
||||
.else
|
||||
// Check if it's a host trap
|
||||
s_bitcmp1_b32 ttmp11, TTMP11_PCS_IS_HOSTTRAP
|
||||
s_cbranch_scc1 .fill_sample_hosttrap
|
||||
|
||||
.endif
|
||||
.endif
|
||||
// If neither bit is set, this is unexpected.
|
||||
// This branch is not expected to be taken.
|
||||
s_branch .no_skip_debugtrap
|
||||
|
||||
// ttmp7 contains local_entry, ttmp[4:5] contains "&bufferX",
|
||||
// ttmp[14:15] holds 'tma->host_trap_buffers' pointer
|
||||
// ttmp[2:3] and ttmp13 are available for gathering perf sample info
|
||||
@@ -381,7 +462,7 @@ trap_entry:
|
||||
// buf->timestamp = s_memrealtime;
|
||||
// buf->correlation_id = get_correlation_id();
|
||||
// }
|
||||
|
||||
.fill_sample_hosttrap:
|
||||
s_mul_i32 ttmp2, ttmp7, 0x40 // offset into buffer for 64B objects
|
||||
s_mul_hi_u32 ttmp3, ttmp7, 0x40 // ttmp[2:3] will contain byte ...
|
||||
s_add_u32 ttmp2, ttmp2, ttmp4
|
||||
@@ -401,19 +482,56 @@ trap_entry:
|
||||
.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4)
|
||||
s_getreg_b32 ttmp4, hwreg(HW_REG_XCC_ID) //store XCC_ID
|
||||
s_lshl_b32 ttmp4, ttmp4, 8
|
||||
s_and_b32 ttmp5, ttmp11, 0x3f
|
||||
s_and_b32 ttmp5, ttmp11, TTMP11_WAVE_IN_WG_MASK
|
||||
s_or_b32 ttmp4, ttmp4, ttmp5
|
||||
s_store_dword ttmp4, ttmp[2:3], 0x1c // store wave_in_wg
|
||||
.else
|
||||
s_and_b32 ttmp4, ttmp11, 0x3f
|
||||
s_store_dword ttmp4, ttmp[2:3], 0x1c // store wave_in_wg
|
||||
.endif
|
||||
// Get HW_ID using S_GETREG_B32 with size=32 (F8 in upper bits), offset=0, and HW_ID = 4 (0x4)
|
||||
s_getreg_b32 ttmp4, hwreg(HW_REG_HW_ID)
|
||||
s_store_dword ttmp4, ttmp[2:3], 0x20 // store HW_ID
|
||||
|
||||
// ttmp[2:3] = &buffer[local_entry]; ttmp[4:5], ttmp7, and ttmp13 are free
|
||||
// ttmp[14:15] = tma->host_trap_buffers and is live out; ttmp6.b31 is buf_to_use, 0 or 1
|
||||
s_branch .get_correlation_id
|
||||
|
||||
.if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4
|
||||
.fill_sample_stochastic:
|
||||
s_mul_i32 ttmp2, ttmp7, 0x40 // offset into buffer for 64B objects
|
||||
s_mul_hi_u32 ttmp3, ttmp7, 0x40
|
||||
s_add_u32 ttmp2, ttmp2, ttmp4
|
||||
s_addc_u32 ttmp3, ttmp3, ttmp5 // ttmp[2:3]=&buffer[local_entry]
|
||||
s_memrealtime ttmp[4:5]
|
||||
s_waitcnt lgkmcnt(0) // Wait for timestamp
|
||||
s_store_dwordx2 ttmp[4:5], ttmp[2:3] 0x30 // Store timestamp
|
||||
|
||||
s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA)
|
||||
s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA1)
|
||||
s_store_dwordx2 ttmp[4:5], ttmp[2:3], 0x24 // store snapshot data (SQ_PERF_SNAPSHOT_DATA/DATA1)
|
||||
s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_LO)
|
||||
s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_HI)
|
||||
s_store_dwordx2 ttmp[4:5], ttmp[2:3] 0x00 // store snapshot PC (SQ_PERF_SNAPSHOT_PC_LO/HI)
|
||||
|
||||
s_mov_b32 ttmp6, exec_lo
|
||||
s_store_dword ttmp6, ttmp[2:3], 0x8 // store EXEC_LO
|
||||
s_mov_b32 ttmp6, exec_hi
|
||||
s_store_dword ttmp6, ttmp[2:3], 0xc // store EXEC_HI
|
||||
|
||||
s_store_dwordx2 ttmp[8:9], ttmp[2:3], 0x10 // store wg_id_x and wg_id_y
|
||||
s_store_dword ttmp10, ttmp[2:3], 0x18 // store wg_id_z
|
||||
s_getreg_b32 ttmp4, hwreg(HW_REG_XCC_ID)
|
||||
s_lshl_b32 ttmp4, ttmp4, 8
|
||||
s_and_b32 ttmp5, ttmp11, TTMP11_WAVE_IN_WG_MASK
|
||||
s_or_b32 ttmp4, ttmp4, ttmp5
|
||||
s_store_dword ttmp4, ttmp[2:3], 0x1c // store chiplet_and_wave_id
|
||||
s_getreg_b32 ttmp4, hwreg(HW_REG_HW_ID)
|
||||
s_store_dword ttmp4, ttmp[2:3], 0x20 // store HW_ID
|
||||
// ttmp[2:3]=&buffer[local_entry]; ttmp[4:5], ttmp[6:7] are free
|
||||
// ttmp[14:15]=ptr to ‘tma’ and is live out; ttmp11.b31 is buf_to_use, 0 or 1
|
||||
s_branch .get_correlation_id
|
||||
|
||||
.endif
|
||||
|
||||
.get_correlation_id:
|
||||
|
||||
// get_correlation_id() -- begin //
|
||||
// Returns a value to use as a correlation ID.
|
||||
@@ -437,6 +555,7 @@ trap_entry:
|
||||
// ttmp[4:5], ttmp7, and ttmp13 are free
|
||||
// ttmp[14:15] = tma->host_trap_buffers and is live out
|
||||
// ttmp6.b31 is buf_to_use, 0 or 1 and is live out
|
||||
|
||||
s_mov_b64 ttmp[4:5], exec // back up EXEC mask
|
||||
s_mov_b32 exec_lo, 0x80000000 // prepare EXEC for doorbell spin
|
||||
s_sendmsg sendmsg(MSG_GET_DOORBELL) // message 10, puts doorbell in EXEC
|
||||
@@ -519,7 +638,6 @@ trap_entry:
|
||||
s_getreg_b32 ttmp3, hwreg(HW_REG_MODE, SQ_WAVE_MODE_EXCP_EN_SHIFT, SQ_WAVE_MODE_EXCP_EN_SIZE) // ttmp3[7:0] = MODE.EXCP_EN
|
||||
// Set bits corresponding to TRAPSTS.MEM_VIOL, TRAPSTS.ILLEGAL_INST and TRAPSTS.XNACK_ERROR
|
||||
s_or_b32 ttmp3, ttmp3, (1 << SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT | 1 << SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT | 1 << SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT)
|
||||
s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS)
|
||||
s_and_b32 ttmp2, ttmp2, ttmp3
|
||||
// SCC will be 1 if either a maskable instruction was set, or one of MEM_VIOL, ILL_INST, XNACK_ERROR
|
||||
s_cbranch_scc1 .no_skip_debugtrap // if any of those are set, handle exceptions
|
||||
@@ -539,6 +657,7 @@ trap_entry:
|
||||
.no_skip_debugtrap:
|
||||
// Save trap id and halt status in ttmp6.
|
||||
s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK)
|
||||
s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE
|
||||
s_min_u32 ttmp2, ttmp2, 0xF
|
||||
s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT
|
||||
s_or_b32 ttmp6, ttmp6, ttmp2
|
||||
|
||||
Ссылка в новой задаче
Block a user