rocr: Adding support for Stochastic PC Sampling for gfx94x (#47)

Change-Id: Ide4c2e25b88f1f25ea4ce35a619b93963c0355ee
Этот коммит содержится в:
Khatri, Shweta
2025-02-22 00:13:08 -05:00
коммит произвёл GitHub
родитель a9f6bc8d0e
Коммит 322a794cf6
3 изменённых файлов: 597 добавлений и 343 удалений
+25 -18
Просмотреть файл
@@ -508,24 +508,20 @@ class GpuAgent : public GpuAgentInt {
// @brief Binds the second-level trap handler to this node.
void BindTrapHandler();
hsa_status_t UpdateTrapHandlerWithPCS(void* pcs_hosttrap_buffers, void* stochastic_hosttrap_buffers);
// @brief Override from core::Agent.
hsa_status_t EnableDmaProfiling(bool enable) override;
hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb,
void* cb_data) override;
hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session) override;
hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session);
hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId,
pcs::PcsRuntime::PcSamplingSession& session) override;
hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) override;
hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session) override;
hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session) override;
hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session) override;
hsa_status_t PcSamplingFlushHostTrapDeviceBuffers(pcs::PcsRuntime::PcSamplingSession& session);
static void PcSamplingThreadRun(void* agent);
void PcSamplingThread();
pcs::PcsRuntime::PcSamplingSession& session);
hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session);
hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session);
hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session);
hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session);
hsa_status_t PcSamplingFlushDeviceBuffers(pcs::PcsRuntime::PcSamplingSession& session);
// @brief Node properties.
const HsaNodeProperties properties_;
@@ -749,13 +745,13 @@ class GpuAgent : public GpuAgentInt {
uint8_t reserved1[16];
/* pc_sample_t buffer0[buf_size]; */
/* pc_sample_t buffer1[buf_size]; */
} pcs_hosttrap_sampling_data_t;
} pcs_sampling_data_t;
typedef struct {
/* Hosttrap data - stored on device so that trap_handler code can access efficiently */
pcs_hosttrap_sampling_data_t* device_data;
/* Sampling data - stored on device for trap handler access */
pcs_sampling_data_t* device_data;
/* Hosttrap host buffer - stored on host */
/* Sampling host buffer - stored on host */
uint8_t* host_buffer;
size_t host_buffer_size;
uint8_t* host_buffer_wrap_pos;
@@ -774,11 +770,16 @@ class GpuAgent : public GpuAgentInt {
os::Thread thread;
pcs::PcsRuntime::PcSamplingSession* session;
} pcs_hosttrap_t;
pcs_hosttrap_t pcs_hosttrap_data_;
} pcs_data_t;
/* PC Sampling fields - end */
hsa_status_t UpdateTrapHandlerWithPCS(pcs_sampling_data_t* pcs_hosttrap_buffers,
pcs_sampling_data_t* pcs_stochastic_buffers);
// @brief Thread function to process PC sampling data collected via host-trap
// or Stochastic sampling.
void PcSamplingThread(pcs_data_t& pcs_data, const char* thread_name);
// @brief device handle
amdgpu_device_handle ldrm_dev_;
@@ -793,6 +794,12 @@ class GpuAgent : public GpuAgentInt {
bool uses_rec_sdma_eng_id_mask_;
// structure for host trap sampling
pcs_data_t pcs_hosttrap_data_;
// structure for stochastic sampling
pcs_data_t pcs_stochastic_data_;
// @brief XGMI CPU<->GPU
bool xgmi_cpu_gpu_;
};
+418 -290
Просмотреть файл
@@ -116,6 +116,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
[this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }),
trap_handler_tma_region_(NULL),
pcs_hosttrap_data_(),
pcs_stochastic_data_(),
xgmi_cpu_gpu_(false) {
const bool is_apu_node = (properties_.NumCPUCores > 0);
profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE;
@@ -2166,7 +2167,7 @@ void GpuAgent::SyncClocks() {
assert(err == HSAKMT_STATUS_SUCCESS && "hsaGetClockCounters error");
}
hsa_status_t GpuAgent::UpdateTrapHandlerWithPCS(void* pcs_hosttrap_buffers, void* pcs_stochastic_buffers) {
hsa_status_t GpuAgent::UpdateTrapHandlerWithPCS(pcs_sampling_data_t* pcs_hosttrap_buffers, pcs_sampling_data_t* pcs_stochastic_buffers) {
// Assemble the trap handler source code.
void* tma_addr = nullptr;
uint64_t tma_size = 0;
@@ -2541,7 +2542,11 @@ hsa_status_t GpuAgent::PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& sess
ret = PcSamplingCreateFromId(0, session);
if (ret != HSA_STATUS_SUCCESS) return ret;
// Obtain the sampling information from the session.
session.GetHsaKmtSamplingInfo(&sampleInfo);
// Pass the sampling information to the kernel driver to create PC
// sampling session.
HSAKMT_STATUS retkmt = hsaKmtPcSamplingCreate(node_id(), &sampleInfo, &thunkId);
if (retkmt != HSAKMT_STATUS_SUCCESS) {
return (retkmt == HSAKMT_STATUS_KERNEL_ALREADY_OPENED) ? (hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY
@@ -2557,114 +2562,133 @@ hsa_status_t GpuAgent::PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& sess
hsa_status_t GpuAgent::PcSamplingCreateFromId(HsaPcSamplingTraceId ioctlId,
pcs::PcsRuntime::PcSamplingSession& session) {
pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
// Determine the sampling method from the session
hsa_ven_amd_pcs_method_kind_t sampling_method = session.method();
if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
// TODO: For now can only have 1 hosttrap session at a time. As a final solution, we want to be
// able to support multiple sessions at a time. But this makes the session.HandleSampleData more
// complicated if multiple sessions have different buffer sizes.
if (ht_data.session) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
pcs_data_t* pcs_data = nullptr;
// This is current amd_aql_queue->pm4_ib_size_b_
ht_data.cmd_data_sz = 0x1000;
ht_data.cmd_data = (uint32_t*)malloc(ht_data.cmd_data_sz);
assert(ht_data.cmd_data);
if (sampling_method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
pcs_data = &pcs_hosttrap_data_;
} else if (sampling_method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
pcs_data = &pcs_stochastic_data_;
} else {
// Unsupported sampling method
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
if (HSA::hsa_signal_create(1, 0, NULL, &ht_data.exec_pm4_signal) != HSA_STATUS_SUCCESS)
return HSA_STATUS_ERROR;
// Ensure only one session is active at a time for the given method
if (pcs_data->session)
return HSA_STATUS_ERROR_OUT_OF_RESOURCES; // TODO: For now, we can only have
// 1 pc sampling session at a
// time. As a final solution, we
// want to be able to support
// multiple sessions at a time.
// But this makes the
// session.HandleSampleData more
// complicated if multiple
// sessions have different buffer
// sizes.
ht_data.old_val = (uint64_t*)system_allocator()(sizeof(uint64_t), 0x1000, 0);
assert(ht_data.old_val);
// This is current amd_aql_queue->pm4_ib_size_b_
pcs_data->cmd_data_sz = 0x1000; // 4KB
pcs_data->cmd_data = (uint32_t*)malloc(pcs_data->cmd_data_sz);
if (!pcs_data->cmd_data) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, ht_data.old_val))
return HSA_STATUS_ERROR;
if (HSA::hsa_signal_create(1, 0, NULL, &pcs_data->exec_pm4_signal) != HSA_STATUS_SUCCESS)
return HSA_STATUS_ERROR;
// Local copy of hosttrap data - we cannot access device memory directly on non-large BAR
// systems
pcs_hosttrap_sampling_data_t* device_datahost =
(pcs_hosttrap_sampling_data_t*)system_allocator()(sizeof(*device_datahost), 0x1000, 0);
if (!device_datahost) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
pcs_data->old_val = (uint64_t*)system_allocator()(sizeof(uint64_t), 0x1000, 0);
if (!pcs_data->old_val) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
MAKE_SCOPE_GUARD([&]() { system_deallocator()(device_datahost); });
if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, pcs_data->old_val))
return HSA_STATUS_ERROR;
memset(device_datahost, 0, sizeof(*device_datahost));
// Local copy of pc sampling data - we cannot access device memory directly on non-large BAR
// systems
pcs_sampling_data_t* device_datahost =
(pcs_sampling_data_t*)system_allocator()(sizeof(pcs_sampling_data_t), 0x1000, 0);
if (!device_datahost) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, device_datahost) !=
HSA_STATUS_SUCCESS)
return HSA_STATUS_ERROR;
MAKE_SCOPE_GUARD([&]() { system_deallocator()(device_datahost); });
MAKE_NAMED_SCOPE_GUARD(freeHostTrapResources, [&]() {
if (ht_data.device_data) {
if (ht_data.device_data->done_sig0.handle)
HSA::hsa_signal_destroy(ht_data.device_data->done_sig0);
if (ht_data.device_data->done_sig1.handle)
HSA::hsa_signal_destroy(ht_data.device_data->done_sig1);
memset(device_datahost, 0, sizeof(*device_datahost));
finegrain_deallocator()(ht_data.device_data);
}
if (ht_data.host_buffer) system_deallocator()(ht_data.host_buffer);
});
if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, device_datahost) !=
HSA_STATUS_SUCCESS)
return HSA_STATUS_ERROR;
// Force creation of the PC Sampling queue to trigger an exception early in case we exceed the
// max available CP queues on this agent
queues_[QueuePCSampling].touch();
MAKE_NAMED_SCOPE_GUARD(freeResources, [&]() {
if (pcs_data->device_data) {
if (pcs_data->device_data->done_sig0.handle)
HSA::hsa_signal_destroy(pcs_data->device_data->done_sig0);
if (pcs_data->device_data->done_sig1.handle)
HSA::hsa_signal_destroy(pcs_data->device_data->done_sig1);
/*
* When calling queue->ExecutePM4(), the Indirect Buffer size is 0x1000 bytes (1024 DW).
* The maximum indirect buffer size we need occurs when we enqueue the
* WAIT_REG_MEM, DMA_COPY(s), WRITE_DATA ops:
* For WAIT_REG_MEM = 7 DW
* For each DMA_COPY = 7 DW
* For WRITE_DATA_CMD = 6 DW
*
* So maximum number of DMA_COPY ops is:
* (MAX_IB_SIZE - sizeof(WAIT_REG_MEM) - sizeof(WRITE_DATA_CMD)) / sizeof(DMA_COPY)
* (1024 - 7 - 6) / 7 = 144
*
* Each DMA_COPY op can transfer (1 << 26) bytes (64 MB), so 144 ops can move 9 GB.
* trap_buffer_size is a 32-bit number, so the buffer must be < 4 GB. So we are not limited
* by Indirect Buffer size.
* Set current limit to 256 MB to limit device VRAM usage
*/
const size_t max_trap_buffer_size =
core::Runtime::runtime_singleton_->flag().pc_sampling_max_device_buffer_size();
finegrain_deallocator()(pcs_data->device_data);
}
if (pcs_data->host_buffer) system_deallocator()(pcs_data->host_buffer);
});
/*
* We use a double-buffer mechanism where there are 2 trap-buffers and 1 host-buffer
* Warning: This currently assumes that client latency is smaller than the time to fill 1
* trap-buffer. If latency is bigger, we have to increase host-buffer
*
* host-buffer must be >= client-buffer so that we can copy full size of client-buffer each
* time. To avoid having to deal with wrap-arounds, host-buffer must be a multiple of
* trap-buffers
*
* if client-buffer size is greater than 2x max_trap_buffer_size:
* We are limited by max_trap_buffer_size.
* trap-buffer = max-trap-buffer-size
* host-buffer = 2*smallest size greater than client-buffer but multiple of 1 trap-buffer
* else:
* We reduce the trap-buffers so that:
* trap-buffer = half of user-buffer
* host-buffer = 2*user-buffer
*
* TODO: We are currently using a temporary host-buffer so that we can increase host-buffer to
* factor in client latency. Using a direct-copy to the client buffer would be more efficient.
* Revisit this once we have empirical data of latency vs how long it takes to fill 1
* trap-buffer.
*/
// Force creation of the PC Sampling queue to trigger an exception early in case we exceed the
// max available CP queues on this agent
queues_[QueuePCSampling].touch();
size_t trap_buffer_size = 0;
if (session.buffer_size() > 2 * max_trap_buffer_size) {
trap_buffer_size = max_trap_buffer_size;
ht_data.host_buffer_size = 2 * AlignUp(session.buffer_size(), trap_buffer_size);
/*
* When calling queue->ExecutePM4(), the Indirect Buffer size is 0x1000 bytes (1024 DW).
* The maximum indirect buffer size we need occurs when we enqueue the
* WAIT_REG_MEM, DMA_COPY(s), WRITE_DATA ops:
* For WAIT_REG_MEM = 7 DW
* For each DMA_COPY = 7 DW
* For WRITE_DATA_CMD = 6 DW
*
* So maximum number of DMA_COPY ops is:
* (MAX_IB_SIZE - sizeof(WAIT_REG_MEM) - sizeof(WRITE_DATA_CMD)) / sizeof(DMA_COPY)
* (1024 - 7 - 6) / 7 = 144
*
* Each DMA_COPY op can transfer (1 << 26) bytes (64 MB), so 144 ops can move 9 GB.
* trap_buffer_size is a 32-bit number, so the buffer must be < 4 GB. So we are not limited
* by Indirect Buffer size.
* Set current limit to 256 MB to limit device VRAM usage
*/
const size_t max_trap_buffer_size =
core::Runtime::runtime_singleton_->flag().pc_sampling_max_device_buffer_size();
/*
* We use a double-buffer mechanism where there are 2 trap-buffers and 1 host-buffer
* Warning: This currently assumes that client latency is smaller than the time to fill 1
* trap-buffer. If latency is bigger, we have to increase host-buffer
*
* host-buffer must be >= client-buffer so that we can copy full size of client-buffer each
* time. To avoid having to deal with wrap-arounds, host-buffer must be a multiple of
* trap-buffers
*
* if client-buffer size is greater than 2x max_trap_buffer_size:
* We are limited by max_trap_buffer_size.
* trap-buffer = max-trap-buffer-size
* host-buffer = 2*smallest size greater than client-buffer but multiple of 1 trap-buffer
* else:
* We reduce the trap-buffers so that:
* trap-buffer = half of user-buffer
* host-buffer = 2*user-buffer
*
* TODO: We are currently using a temporary host-buffer so that we can increase host-buffer to
* factor in client latency. Using a direct-copy to the client buffer would be more efficient.
* Revisit this once we have empirical data of latency vs how long it takes to fill 1
* trap-buffer.
*/
size_t trap_buffer_size = 0;
if (session.buffer_size() > 2 * max_trap_buffer_size) {
trap_buffer_size = max_trap_buffer_size;
pcs_data->host_buffer_size = 2 * AlignUp(session.buffer_size(), trap_buffer_size);
} else {
trap_buffer_size = session.buffer_size() / 2;
ht_data.host_buffer_size = 2 * session.buffer_size();
pcs_data->host_buffer_size = 2 * session.buffer_size();
}
ht_data.host_buffer = (uint8_t*)system_allocator()(ht_data.host_buffer_size, 0x1000, 0);
if (!ht_data.host_buffer) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
pcs_data->host_buffer = (uint8_t*)system_allocator()(pcs_data->host_buffer_size, 0x1000, 0);
if (!pcs_data->host_buffer) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, ht_data.host_buffer) !=
if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, pcs_data->host_buffer) !=
HSA_STATUS_SUCCESS)
return HSA_STATUS_ERROR;
@@ -2682,101 +2706,162 @@ hsa_status_t GpuAgent::PcSamplingCreateFromId(HsaPcSamplingTraceId ioctlId,
device_datahost->buf_watermark1 = 0.8 * device_datahost->buf_size;
// Allocate device memory for 2nd level trap handler TMA
size_t deviceAllocSize = sizeof(*ht_data.device_data) + (2 * trap_buffer_size);
ht_data.device_data = (pcs_hosttrap_sampling_data_t*)finegrain_allocator()(deviceAllocSize, 0);
if (ht_data.device_data == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
size_t deviceAllocSize = sizeof(*pcs_data->device_data) + (2 * trap_buffer_size);
pcs_data->device_data = (pcs_sampling_data_t*)finegrain_allocator()(deviceAllocSize, 0);
if (pcs_data->device_data == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
// This cpuAgent is the owner of the system_allocator() pool
auto cpuAgent = GetNearestCpuAgent()->public_handle();
hsa_status_t ret = AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, ht_data.device_data);
assert(ret == HSA_STATUS_SUCCESS);
if (AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, pcs_data->device_data) != HSA_STATUS_SUCCESS)
return HSA_STATUS_ERROR;
if (DmaCopy(ht_data.device_data, device_datahost, sizeof(*device_datahost)) !=
if (DmaCopy(pcs_data->device_data, device_datahost, sizeof(*device_datahost)) !=
HSA_STATUS_SUCCESS) {
debug_print("Failed to dmaCopy!\n");
return HSA_STATUS_ERROR;
}
uint8_t* device_buf_ptr =
((uint8_t*)ht_data.device_data) + sizeof(pcs_hosttrap_sampling_data_t);
if (DmaFill(device_buf_ptr, 0, deviceAllocSize - sizeof(pcs_hosttrap_sampling_data_t)) !=
((uint8_t*)pcs_data->device_data) + sizeof(pcs_sampling_data_t);
if (DmaFill(device_buf_ptr, 0, deviceAllocSize - sizeof(pcs_sampling_data_t)) !=
HSA_STATUS_SUCCESS) {
debug_print("Failed to dmaFill!\n");
return HSA_STATUS_ERROR;
}
ht_data.lost_sample_count = 0;
ht_data.host_buffer_wrap_pos = 0;
ht_data.host_write_ptr = ht_data.host_buffer;
ht_data.host_read_ptr = ht_data.host_write_ptr;
pcs_data->lost_sample_count = 0;
pcs_data->host_buffer_wrap_pos = 0;
pcs_data->host_write_ptr = pcs_data->host_buffer;
pcs_data->host_read_ptr = pcs_data->host_write_ptr;
ht_data.session = &session;
freeHostTrapResources.Dismiss();
pcs_data->session = &session;
if (UpdateTrapHandlerWithPCS(ht_data.device_data, NULL) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR;
}
if (UpdateTrapHandlerWithPCS(
sampling_method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1 ? pcs_data->device_data : nullptr,
sampling_method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1
? pcs_data->device_data
: nullptr) != HSA_STATUS_SUCCESS)
return HSA_STATUS_ERROR;
session.SetThunkId(ioctlId);
ht_data.session = &session;
session.SetThunkId(ioctlId);
return HSA_STATUS_SUCCESS;
freeResources.Dismiss();
return HSA_STATUS_SUCCESS;
}
hsa_status_t GpuAgent::PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) {
if (PcSamplingStop(session) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR;
pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
HSAKMT_STATUS retKmt = hsaKmtPcSamplingDestroy(node_id(), session.ThunkId());
ht_data.session = NULL;
hsa_ven_amd_pcs_method_kind_t sampling_method = session.method();
if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
free(ht_data.cmd_data);
system_deallocator()(ht_data.old_val);
HSA::hsa_signal_destroy(ht_data.exec_pm4_signal);
HSA::hsa_signal_destroy(ht_data.device_data->done_sig0);
HSA::hsa_signal_destroy(ht_data.device_data->done_sig1);
finegrain_deallocator()(ht_data.device_data);
system_deallocator()(ht_data.host_buffer);
pcs_data_t* pcs_data = nullptr;
ht_data.device_data = NULL;
ht_data.host_buffer = NULL;
ht_data.session = NULL;
UpdateTrapHandlerWithPCS(NULL, NULL);
if (sampling_method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
pcs_data = &pcs_hosttrap_data_;
} else if (sampling_method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
pcs_data = &pcs_stochastic_data_;
} else {
// Unsupported sampling method
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
// Mark session as inactive
pcs_data->session = nullptr;
free(pcs_data->cmd_data);
system_deallocator()(pcs_data->old_val);
HSA::hsa_signal_destroy(pcs_data->exec_pm4_signal);
HSA::hsa_signal_destroy(pcs_data->device_data->done_sig0);
HSA::hsa_signal_destroy(pcs_data->device_data->done_sig1);
finegrain_deallocator()(pcs_data->device_data);
system_deallocator()(pcs_data->host_buffer);
pcs_data->device_data = NULL;
pcs_data->host_buffer = NULL;
pcs_data->session = NULL;
// Update the trap handler to clear any associated device data
UpdateTrapHandlerWithPCS(nullptr, nullptr);
return (retKmt == HSAKMT_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}
hsa_status_t GpuAgent::PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session) {
if (session.isActive()) return HSA_STATUS_SUCCESS;
pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
auto method = session.method();
pcs_data_t* pcs_data = nullptr;
const char* thread_name = nullptr;
if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
if (ht_data.session->isActive()) {
debug_warning("Already have a Host trap session in progress!");
return (hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY;
}
ht_data.session->start();
// This thread will handle all hosttrap sessions on this agent
// In the future, there will be another thread to handle stochastic sessions.
ht_data.thread = os::CreateThread(PcSamplingThreadRun, (void*)this);
if (!ht_data.thread)
throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
"Failed to start PC Sampling thread.");
pcs_data = &pcs_hosttrap_data_;
thread_name = "PcSamplingHostTrapThread";
} else if (method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
pcs_data = &pcs_stochastic_data_;
thread_name = "PcSamplingStochasticThread";
} else {
// Unsupported sampling method
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
// Check if a session is already active
if (pcs_data->session && pcs_data->session->isActive()) {
debug_warning("Already have a PC sampling session in progress!");
return (hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY;
}
// Assign the new session and mark it as active
pcs_data->session = &session;
pcs_data->session->start();
// Creating thread data
struct ThreadData {
GpuAgent* agent;
pcs_data_t* pcs_data;
const char* thread_name;
};
auto* thread_data = new ThreadData{this, pcs_data, thread_name};
// This thread will handle all PC Sampling sessions on this agent
pcs_data->thread = os::CreateThread(
[](void* arg) -> void {
auto* thread_data = static_cast<ThreadData*>(arg);
try {
GpuAgent* agent = thread_data->agent;
pcs_data_t* pcs_data = thread_data->pcs_data;
const char* thread_name = thread_data->thread_name;
agent->PcSamplingThread(*pcs_data, thread_name);
} catch (...) {
fprintf(stdout, "Exception caught in PcSamplingThread. Exiting the thread!");
}
delete thread_data;
},
thread_data);
if (!pcs_data->thread) {
// if thread creation failed
delete thread_data;
throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
"Failed to start PC Sampling thread.");
}
// Start the sampling session in the kernel driver
if (hsaKmtPcSamplingStart(node_id(), session.ThunkId()) == HSAKMT_STATUS_SUCCESS)
return HSA_STATUS_SUCCESS;
debug_print("Failed to start PC sampling session with thunkId:%d\n", session.ThunkId());
if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
ht_data.session->stop();
os::WaitForThread(ht_data.thread);
os::CloseThread(ht_data.thread);
ht_data.thread = NULL;
}
// Clean up if starting the session failed
pcs_data->session->stop();
os::WaitForThread(pcs_data->thread);
os::CloseThread(pcs_data->thread);
pcs_data->thread = nullptr;
pcs_data->session = nullptr;
return HSA_STATUS_ERROR;
}
@@ -2784,35 +2869,51 @@ hsa_status_t GpuAgent::PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& sessi
hsa_status_t GpuAgent::PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session) {
if (!session.isActive()) return HSA_STATUS_SUCCESS;
pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
// Stop the session
session.stop();
// Stop PC sampling in the kernel driver
HSAKMT_STATUS retKmt = hsaKmtPcSamplingStop(node_id(), session.ThunkId());
if (retKmt != HSAKMT_STATUS_SUCCESS)
throw AMD::hsa_exception(HSA_STATUS_ERROR, "Failed to stop PC Sampling session.");
if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
// Wake up pcs_hosttrap_thread_ if it is waiting for data
HSA::hsa_signal_store_screlease(ht_data.device_data->done_sig0, -1);
HSA::hsa_signal_store_screlease(ht_data.device_data->done_sig1, -1);
// Determine the sampling method and corresponding data
pcs_data_t* pcs_data = nullptr;
auto method = session.method();
os::WaitForThread(ht_data.thread);
os::CloseThread(ht_data.thread);
ht_data.thread = NULL;
if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
pcs_data = &pcs_hosttrap_data_;
} else if (method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
pcs_data = &pcs_stochastic_data_;
} else {
// Unsupported sampling method
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
// Wake up the PC sampling thread for this method if it is waiting for data
HSA::hsa_signal_store_screlease(pcs_data->device_data->done_sig0, -1);
HSA::hsa_signal_store_screlease(pcs_data->device_data->done_sig1, -1);
// Wait for the thread to finish and clean up
os::WaitForThread(pcs_data->thread);
os::CloseThread(pcs_data->thread);
pcs_data->thread = nullptr;
pcs_data->session = nullptr;
return HSA_STATUS_SUCCESS;
}
hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
hsa_status_t GpuAgent::PcSamplingFlushDeviceBuffers(
pcs::PcsRuntime::PcSamplingSession& session) {
pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
uint32_t& which_buffer = ht_data.which_buffer;
uint32_t* cmd_data = ht_data.cmd_data;
size_t& cmd_data_sz = ht_data.cmd_data_sz;
uint64_t* old_val = ht_data.old_val;
hsa_signal_t& exec_pm4_signal = ht_data.exec_pm4_signal;
pcs_data_t* pcs_data = nullptr;
if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
pcs_data = &pcs_hosttrap_data_;
} else if (session.method() == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
pcs_data = &pcs_stochastic_data_;
} else {
// Unrecognized sampling method - there are no buffers to flush
return HSA_STATUS_SUCCESS;
}
/*
* Device-buffer to Host-buffer to User-Buffer copy logic
@@ -2951,19 +3052,33 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
uint32_t pred_exec_cmd_sz = 0;
uint8_t* host_buffer_begin = ht_data.host_buffer;
uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size;
uint64_t buf_write_val;
uint64_t buf_written_val[2];
size_t buf_offset;
uint8_t* buffer[2];
size_t buf_size;
uint64_t buf_write_val = (uint64_t) & (ht_data.device_data->buf_write_val);
uint64_t buf_written_val[] = {(uint64_t) & (ht_data.device_data->buf_written_val0),
(uint64_t) & (ht_data.device_data->buf_written_val1)};
uint32_t& which_buffer = pcs_data->which_buffer;
uint32_t* cmd_data = pcs_data->cmd_data;
size_t cmd_data_sz = pcs_data->cmd_data_sz;
uint64_t* old_val = pcs_data->old_val;
hsa_signal_t& exec_pm4_signal = pcs_data->exec_pm4_signal;
size_t const buf_offset = offsetof(pcs_hosttrap_sampling_data_t, reserved1) +
sizeof(((pcs_hosttrap_sampling_data_t*)0)->reserved1);
uint8_t* host_buffer_begin = pcs_data->host_buffer;
size_t& host_buffer_size = pcs_data->host_buffer_size;
uint8_t*& host_write_ptr = pcs_data->host_write_ptr;
uint8_t* host_buffer_end = host_buffer_begin + host_buffer_size;
uint8_t* buffer[] = {(uint8_t*)ht_data.device_data + buf_offset,
(uint8_t*)ht_data.device_data + buf_offset +
ht_data.device_data->buf_size * session.sample_size()};
buf_write_val = reinterpret_cast<uint64_t>(&pcs_data->device_data->buf_write_val);
buf_written_val[0] = reinterpret_cast<uint64_t>(&pcs_data->device_data->buf_written_val0);
buf_written_val[1] = reinterpret_cast<uint64_t>(&pcs_data->device_data->buf_written_val1);
buf_size = pcs_data->device_data->buf_size;
buf_offset =
offsetof(pcs_sampling_data_t, reserved1) + sizeof(((pcs_sampling_data_t*)0)->reserved1);
buffer[0] = reinterpret_cast<uint8_t*>(pcs_data->device_data) + buf_offset;
buffer[1] = buffer[0] + buf_size * session.sample_size();
next_buffer = (which_buffer + 1) % 2;
reset_write_val = (uint64_t)next_buffer << 63;
@@ -3022,25 +3137,25 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
/* If the number of entries in old_val is larger than buf_size, then there was a buffer overflow
* and the 2nd level trap handler code will skip recording samples, causing lost samples
*/
if (*old_val > (uint64_t)ht_data.device_data->buf_size) {
ht_data.lost_sample_count = *old_val - (uint64_t)ht_data.device_data->buf_size;
*old_val = (uint64_t)ht_data.device_data->buf_size;
if (*old_val > buf_size) {
pcs_data->lost_sample_count = *old_val - buf_size;
*old_val = buf_size;
}
to_copy = *old_val * session.sample_size();
/* Make sure there is enough space after host_write_ptr */
if (ht_data.host_write_ptr + to_copy >= host_buffer_end) {
if (host_write_ptr + to_copy >= host_buffer_end) {
// Need to wrap around
ht_data.host_buffer_wrap_pos = ht_data.host_write_ptr;
ht_data.host_write_ptr = host_buffer_begin;
pcs_data->host_buffer_wrap_pos = host_write_ptr;
host_write_ptr = host_buffer_begin;
}
i = 0;
memset(cmd_data, 0, cmd_data_sz);
if (properties_.NumXcc > 1) {
const uint32_t n = ceil(to_copy / (32 * 1024 * 1024));
const uint64_t n = ceil(to_copy / (32 * 1024 * 1024));
pred_exec_cmd_sz = 2;
cmd_data[i++] = PM4_HDR(PM4_HDR_IT_OPCODE_PRED_EXEC, pred_exec_cmd_sz, isa_->GetMajorVersion());
cmd_data[i++] =
@@ -3073,7 +3188,8 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
unsigned int num_copy_command = 0;
uint8_t* buffer_temp = buffer[which_buffer];
for (copy_bytes = CP_DMA_DATA_TRANSFER_CNT_MAX; 0 < to_copy; to_copy -= copy_bytes) {
for (copy_bytes = std::min(to_copy, (uint32_t)CP_DMA_DATA_TRANSFER_CNT_MAX); 0 < to_copy;
to_copy -= copy_bytes) {
num_copy_command++;
/* DMA_DATA PACKETS, copy buffer using CPDMA */
@@ -3082,9 +3198,8 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
PM4_DMA_DATA_SRC_SEL_SRC_ADDR_USING_L2);
cmd_data[i++] = PM4_DMA_DATA_DW2_SRC_ADDR_LO((uint64_t)buffer_temp);
cmd_data[i++] = PM4_DMA_DATA_DW3_SRC_ADDR_HI(((uint64_t)buffer_temp) >> 32);
cmd_data[i++] = PM4_DMA_DATA_DW4_DST_ADDR_LO((uint64_t)ht_data.host_write_ptr);
cmd_data[i++] = PM4_DMA_DATA_DW5_DST_ADDR_HI(((uint64_t)ht_data.host_write_ptr) >> 32);
cmd_data[i++] = PM4_DMA_DATA_DW4_DST_ADDR_LO((uint64_t)host_write_ptr);
cmd_data[i++] = PM4_DMA_DATA_DW5_DST_ADDR_HI(((uint64_t)host_write_ptr) >> 32);
if (copy_bytes >= to_copy) {
copy_bytes = to_copy;
cmd_data[i++] =
@@ -3093,7 +3208,7 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
cmd_data[i++] = PM4_DMA_DATA_DW6(PM4_DMA_DATA_BYTE_COUNT(copy_bytes) | PM4_DMA_DATA_DIS_WC);
}
buffer_temp += copy_bytes;
ht_data.host_write_ptr += copy_bytes;
host_write_ptr += copy_bytes;
}
/* WRITE_DATA, Reset buf_written_val */
@@ -3117,167 +3232,180 @@ hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers(
if (val == 0) break;
} while (true);
// save the position of next buffer
which_buffer = next_buffer;
return HSA_STATUS_SUCCESS;
}
void GpuAgent::PcSamplingThread() {
void GpuAgent::PcSamplingThread(pcs_data_t& pcs_data, const char* thread_name) {
// TODO: Implement lost sample count
// TODO: Implement latency
pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
pcs::PcsRuntime::PcSamplingSession& session = *ht_data.session;
uint32_t& which_buffer = ht_data.which_buffer;
try {
pcs::PcsRuntime::PcSamplingSession& session = *pcs_data.session;
uint32_t& which_buffer = pcs_data.which_buffer;
uint8_t* host_buffer_begin = ht_data.host_buffer;
uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size;
uint8_t* host_buffer_begin = pcs_data.host_buffer;
uint8_t* host_buffer_end = pcs_data.host_buffer + pcs_data.host_buffer_size;
hsa_signal_t done_sig[] = {ht_data.device_data->done_sig0, ht_data.device_data->done_sig1};
hsa_signal_t done_sig[] = {pcs_data.device_data->done_sig0, pcs_data.device_data->done_sig1};
while (ht_data.session->isActive()) {
do {
hsa_signal_value_t val = HSA::hsa_signal_wait_scacquire(
done_sig[which_buffer], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
if (val == -1) goto thread_exit;
if (val == 0) break;
} while (true);
HSA::hsa_signal_store_screlease(done_sig[which_buffer], 1);
while (pcs_data.session->isActive()) {
// Wait for the signal to process the buffer
do {
hsa_signal_value_t val = HSA::hsa_signal_wait_scacquire(
done_sig[which_buffer], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
if (val == -1) goto thread_exit;
if (val == 0) break;
} while (true);
HSA::hsa_signal_store_screlease(done_sig[which_buffer], 1);
std::lock_guard<std::mutex> lock(ht_data.host_buffer_mutex);
if (PcSamplingFlushHostTrapDeviceBuffers(session) != HSA_STATUS_SUCCESS)
goto thread_exit;
// Lock buffer to ensure thread-safe access
std::lock_guard<std::mutex> lock(pcs_data.host_buffer_mutex);
// Flush device buffers
if (PcSamplingFlushDeviceBuffers(session) != HSA_STATUS_SUCCESS)
goto thread_exit;
size_t bytes_before_wrap;
size_t bytes_after_wrap;
size_t bytes_before_wrap;
size_t bytes_after_wrap;
assert(ht_data.host_read_ptr >= host_buffer_begin && ht_data.host_read_ptr < host_buffer_end);
assert(ht_data.host_write_ptr >= host_buffer_begin && ht_data.host_write_ptr < host_buffer_end);
assert(ht_data.host_buffer_wrap_pos ? (ht_data.host_read_ptr > ht_data.host_write_ptr)
: (ht_data.host_read_ptr <= ht_data.host_write_ptr));
assert(pcs_data.host_read_ptr >= host_buffer_begin && pcs_data.host_read_ptr < host_buffer_end);
assert(pcs_data.host_write_ptr >= host_buffer_begin && pcs_data.host_write_ptr < host_buffer_end);
assert(pcs_data.host_buffer_wrap_pos ? (pcs_data.host_read_ptr > pcs_data.host_write_ptr)
: (pcs_data.host_read_ptr <= pcs_data.host_write_ptr));
if (ht_data.host_buffer_wrap_pos) {
assert(ht_data.host_buffer_wrap_pos <= host_buffer_end &&
ht_data.host_buffer_wrap_pos > host_buffer_begin);
assert(ht_data.host_read_ptr <= ht_data.host_buffer_wrap_pos);
if (pcs_data.host_buffer_wrap_pos) {
assert(pcs_data.host_buffer_wrap_pos <= host_buffer_end &&
pcs_data.host_buffer_wrap_pos > host_buffer_begin);
assert(pcs_data.host_read_ptr <= pcs_data.host_buffer_wrap_pos);
// Wrapped around
bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr;
bytes_after_wrap = ht_data.host_write_ptr - host_buffer_begin;
// Wrapped around
bytes_before_wrap = pcs_data.host_buffer_wrap_pos - pcs_data.host_read_ptr;
bytes_after_wrap = pcs_data.host_write_ptr - host_buffer_begin;
while (bytes_before_wrap >= session.buffer_size()) {
session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0,
ht_data.lost_sample_count);
ht_data.host_read_ptr += session.buffer_size();
bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr;
ht_data.lost_sample_count = 0;
}
while (bytes_before_wrap >= session.buffer_size()) {
session.HandleSampleData(pcs_data.host_read_ptr, session.buffer_size(), nullptr, 0,
pcs_data.lost_sample_count);
pcs_data.host_read_ptr += session.buffer_size();
bytes_before_wrap = pcs_data.host_buffer_wrap_pos - pcs_data.host_read_ptr;
pcs_data.lost_sample_count = 0;
}
if (bytes_before_wrap + bytes_after_wrap >= session.buffer_size()) {
session.HandleSampleData(ht_data.host_read_ptr, bytes_before_wrap, host_buffer_begin,
(session.buffer_size() - bytes_before_wrap), 0);
ht_data.host_read_ptr = host_buffer_begin + (session.buffer_size() - bytes_before_wrap);
bytes_before_wrap = 0;
ht_data.host_buffer_wrap_pos = 0;
bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
ht_data.lost_sample_count = 0;
}
if (bytes_before_wrap + bytes_after_wrap >= session.buffer_size()) {
session.HandleSampleData(pcs_data.host_read_ptr, bytes_before_wrap, host_buffer_begin,
(session.buffer_size() - bytes_before_wrap), 0);
pcs_data.host_read_ptr = host_buffer_begin + (session.buffer_size() - bytes_before_wrap);
bytes_before_wrap = 0;
pcs_data.host_buffer_wrap_pos = 0;
bytes_after_wrap = pcs_data.host_write_ptr - pcs_data.host_read_ptr;
pcs_data.lost_sample_count = 0;
}
while (bytes_after_wrap >= session.buffer_size()) {
session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0,
ht_data.lost_sample_count);
ht_data.host_read_ptr += session.buffer_size();
bytes_before_wrap = 0;
bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
ht_data.lost_sample_count = 0;
}
} else {
bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
while (bytes_after_wrap >= session.buffer_size()) {
session.HandleSampleData(pcs_data.host_read_ptr, session.buffer_size(), nullptr, 0,
pcs_data.lost_sample_count);
pcs_data.host_read_ptr += session.buffer_size();
bytes_before_wrap = 0;
bytes_after_wrap = pcs_data.host_write_ptr - pcs_data.host_read_ptr;
pcs_data.lost_sample_count = 0;
}
} else {
// Handle non-wrapped buffer
bytes_before_wrap = pcs_data.host_write_ptr - pcs_data.host_read_ptr;
while (bytes_before_wrap >= session.buffer_size()) {
assert(ht_data.host_read_ptr >= host_buffer_begin &&
ht_data.host_read_ptr + session.buffer_size() < host_buffer_end);
session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0,
ht_data.lost_sample_count);
ht_data.host_read_ptr += session.buffer_size();
bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
ht_data.lost_sample_count = 0;
while (bytes_before_wrap >= session.buffer_size()) {
assert(pcs_data.host_read_ptr >= host_buffer_begin &&
pcs_data.host_read_ptr + session.buffer_size() <= host_buffer_end);
session.HandleSampleData(pcs_data.host_read_ptr, session.buffer_size(), nullptr, 0,
pcs_data.lost_sample_count);
pcs_data.host_read_ptr += session.buffer_size();
bytes_before_wrap = pcs_data.host_write_ptr - pcs_data.host_read_ptr;
pcs_data.lost_sample_count = 0;
}
}
}
}
thread_exit:
debug_print("PcSamplingThread::Exiting\n");
debug_print("%s::Exiting\n", thread_name);
} catch (const std::exception& e) {
debug_print("Exception in %s: %s\n", thread_name, e.what());
} catch (...) {
debug_print("Unknown exception in %s\n", thread_name);
}
void GpuAgent::PcSamplingThreadRun(void* _agent) {
GpuAgent* agent = (GpuAgent*)_agent;
agent->PcSamplingThread();
debug_print("PcSamplingThread exiting...");
}
hsa_status_t GpuAgent::PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session) {
pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
pcs_data_t* pcs_data = nullptr;
uint8_t* host_buffer_begin = ht_data.host_buffer;
uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size;
if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
pcs_data = &pcs_hosttrap_data_;
} else if (session.method() == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
pcs_data = &pcs_stochastic_data_;
} else {
return HSA_STATUS_SUCCESS; // Unsupported sampling method
}
uint8_t* host_buffer_begin = pcs_data->host_buffer;
uint8_t* host_buffer_end = pcs_data->host_buffer + pcs_data->host_buffer_size;
size_t bytes_before_wrap;
size_t bytes_after_wrap;
std::lock_guard<std::mutex> lock(ht_data.host_buffer_mutex);
if (PcSamplingFlushHostTrapDeviceBuffers(session) != HSA_STATUS_SUCCESS)
return HSA_STATUS_ERROR;
std::lock_guard<std::mutex> lock(pcs_data->host_buffer_mutex);
// Flush device buffers
if (PcSamplingFlushDeviceBuffers(session) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR;
assert(ht_data.host_read_ptr >= host_buffer_begin && ht_data.host_read_ptr < host_buffer_end);
assert(ht_data.host_write_ptr >= host_buffer_begin && ht_data.host_write_ptr < host_buffer_end);
assert(ht_data.host_buffer_wrap_pos ? (ht_data.host_read_ptr > ht_data.host_write_ptr)
: (ht_data.host_read_ptr <= ht_data.host_write_ptr));
assert(pcs_data->host_read_ptr >= host_buffer_begin && pcs_data->host_read_ptr < host_buffer_end);
assert(pcs_data->host_write_ptr >= host_buffer_begin &&
pcs_data->host_write_ptr < host_buffer_end);
assert(pcs_data->host_buffer_wrap_pos ? (pcs_data->host_read_ptr > pcs_data->host_write_ptr)
: (pcs_data->host_read_ptr <= pcs_data->host_write_ptr));
if (ht_data.host_buffer_wrap_pos) {
assert(ht_data.host_buffer_wrap_pos <= host_buffer_end &&
ht_data.host_buffer_wrap_pos > host_buffer_begin);
assert(ht_data.host_read_ptr <= ht_data.host_buffer_wrap_pos);
if (pcs_data->host_buffer_wrap_pos) {
assert(pcs_data->host_buffer_wrap_pos <= host_buffer_end &&
pcs_data->host_buffer_wrap_pos > host_buffer_begin);
assert(pcs_data->host_read_ptr <= pcs_data->host_buffer_wrap_pos);
// Wrapped around
bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr;
bytes_after_wrap = ht_data.host_write_ptr - host_buffer_begin;
// Handle wrapped-around buffer
bytes_before_wrap = pcs_data->host_buffer_wrap_pos - pcs_data->host_read_ptr;
bytes_after_wrap = pcs_data->host_write_ptr - host_buffer_begin;
while (bytes_before_wrap > 0) {
size_t bytes_to_copy = std::min(bytes_before_wrap, session.buffer_size());
session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0,
ht_data.lost_sample_count);
ht_data.host_read_ptr += bytes_to_copy;
bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr;
ht_data.lost_sample_count = 0;
session.HandleSampleData(pcs_data->host_read_ptr, bytes_to_copy, nullptr, 0,
pcs_data->lost_sample_count);
pcs_data->host_read_ptr += bytes_to_copy;
bytes_before_wrap = pcs_data->host_buffer_wrap_pos - pcs_data->host_read_ptr;
pcs_data->lost_sample_count = 0;
}
assert(ht_data.host_read_ptr == ht_data.host_buffer_wrap_pos);
ht_data.host_buffer_wrap_pos = 0;
ht_data.host_read_ptr = host_buffer_begin;
assert(pcs_data->host_read_ptr == pcs_data->host_buffer_wrap_pos);
pcs_data->host_buffer_wrap_pos = 0;
pcs_data->host_read_ptr = host_buffer_begin;
while (bytes_after_wrap > 0) {
size_t bytes_to_copy = std::min(bytes_after_wrap, session.buffer_size());
session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0,
ht_data.lost_sample_count);
ht_data.host_read_ptr += bytes_to_copy;
bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
ht_data.lost_sample_count = 0;
session.HandleSampleData(pcs_data->host_read_ptr, bytes_to_copy, nullptr, 0,
pcs_data->lost_sample_count);
pcs_data->host_read_ptr += bytes_to_copy;
bytes_after_wrap = pcs_data->host_write_ptr - pcs_data->host_read_ptr;
pcs_data->lost_sample_count = 0;
}
} else {
bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
bytes_before_wrap = pcs_data->host_write_ptr - pcs_data->host_read_ptr;
while (bytes_before_wrap) {
while (bytes_before_wrap > 0) {
size_t bytes_to_copy = std::min(bytes_before_wrap, session.buffer_size());
assert(ht_data.host_read_ptr >= host_buffer_begin &&
ht_data.host_read_ptr + bytes_to_copy <= host_buffer_end);
assert(pcs_data->host_read_ptr >= host_buffer_begin &&
pcs_data->host_read_ptr + bytes_to_copy <= host_buffer_end);
session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0,
ht_data.lost_sample_count);
ht_data.host_read_ptr += bytes_to_copy;
bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr;
ht_data.lost_sample_count = 0;
session.HandleSampleData(pcs_data->host_read_ptr, bytes_to_copy, nullptr, 0,
pcs_data->lost_sample_count);
pcs_data->host_read_ptr += bytes_to_copy;
bytes_before_wrap = pcs_data->host_write_ptr - pcs_data->host_read_ptr;
pcs_data->lost_sample_count = 0;
}
}
return HSA_STATUS_SUCCESS;
+154 -35
Просмотреть файл
@@ -53,6 +53,7 @@
.set SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT , 11
.set SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT , 28
.set SQ_WAVE_TRAPSTS_MATH_EXCP , 0x7F
.set SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT , 26
.set SQ_WAVE_MODE_EXCP_EN_SHIFT , 12
.set SQ_WAVE_MODE_EXCP_EN_SIZE , 8
.set TRAP_ID_ABORT , 2
@@ -95,12 +96,23 @@
// TTMP_REG1 means ttmp6 register if gfx>=942 and means ttmp13 register if gfx<942
// TTMP_REG2 means ttmp11 register if gfx>=942 and means ttmp6 register if gfx<942
.if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4
.if .amdgcn.gfx_generation_number == 9
.set TTMP11_TTMPS_SETUP_SHIFT , 31
// Bit to indicate that this is a hosttrap trap instead of stochastic trap
// Currently not used
.set TTMP13_PCS_IS_STOCHASTIC , 24
.if (.amdgcn.gfx_generation_minor >= 4)
.set TTMP11_WAVE_IN_WG_MASK , 0x3F
// Bit to indicate that this is a stochastic trap
.set TTMP13_PCS_IS_STOCHASTIC , 21
// Bit to indicate that this is a host trap
.set TTMP13_PCS_IS_HOSTTRAP , 22
.else
// Bit to indicate that this is a host trap
.set TTMP11_PCS_IS_HOSTTRAP , 22
.endif
.endif
.if (.amdgcn.gfx_generation_number == 9)
@@ -205,27 +217,45 @@
// ttmp15 = TMA[63:32]
// gfx9:
// ttmp1 = 0[2:0], PCRewind[3:0], HostTrap[0], TrapId[7:0], PC[47:32]
// all gfx9 (except gfx942):
// For all gfx9 (except gfx940, gfx941, gfx942):
// ttmp6 = 0[6:0], DispatchPktIndx[24:0]
// ttmp11 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveInWg[5:0]
// Note: Once stochastic sampling is implemented, L2 Trap Handler will use Bit 23
// (TTMP11_PCS_IS_STOCHASTIC) to differentiate between stochastic and hosttrap
// gfx942:
//
// For gfx940/gfx941/gfx942:
// ttmp11 = 0[0], DispatchPktIndx[24:0], WaveIdInWg[5:0]
// ttmp13 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[22:0]
// ttmp13:
// Bits 31:26 : SQ_WAVE_IB_STS[20:15] (1TH)
// 25:24 : 0 on 2TH entry. Used by 1st level TH but also
// free to be used in the 2nd level TH
// 23 : Debug Enabled (1TH)
// 22:0 : values are unspecified on 2TH entry. Free.
//
// gfx10:
// ttmp1 = 0[0], PCRewind[5:0], HostTrap[0], TrapId[7:0], PC[47:32]
//
// gfx10/gfx11:
// ttmp6 = 0[6:0], DispatchPktIndx[24:0]
//
// gfx1010:
// ttmp11 = SQ_WAVE_IB_STS[25], SQ_WAVE_IB_STS[21:15], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0]
//
// gfx1030/gfx1100:
// ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0]
//
// ttmp[14:15] points to TMA2; Available: ttmp[2:3], ttmp[4:5]
//
// ttmp7 : gfx9, gfx1010, gfx1030, gfx11 - 31:0 : PC[31:0] (2TH, DBG);
// : gfx940 - free;
// : gfx12 - ttmp7 - 31:16 : workgroup_z[15:0] (SPI) and 15:0 : workgroup_y[15:0] (SPI)
trap_entry:
// Branch if not a trap (an exception instead).
s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE
s_cbranch_scc0 .no_skip_debugtrap
// Extract trap_id from ttmp2
s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE
s_cbranch_scc0 .not_s_trap // If trap_id == 0, it's not an s_trap nor host trap
// Check whether it was a host trap.
s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT
s_cbranch_scc0 .not_host_trap
.if (.amdgcn.gfx_generation_number == 9) // PC_SAMPLING_GFX9
// ttmp[14:15] is TMA2; Available: ttmp[2:3], ttmp[4:5], ttmp7, TTMP_REG1
@@ -236,29 +266,42 @@ trap_entry:
// [0x08] out_buf_t* stochastic_trap_buffers;
//
// --- Start profile trap handlers GFX9 --- //
// if (host_trap) {
// if (stochastic) // Not implemented yet
// ttmp11.bit23 = 1; // Not implemented yet
// profiling_trap_handler(tma->host_trap_buffers);
// }
// If the wave entered the trap handler:
// If on gfx9:
// - Check SQ_WAVE_PC_HI_HT_SHIFT bit on TTMP1 register to
// identify if it was a host trap.
// If a host trap is detected:
// - Mark TTMP13(gfx94x) or TTMP11(gfx9) hosttrap bit
// - Load host_trap_buffers
// - Branch to the profile trap handler logic.
//
// If on gfx9.4+:
// - Check TRAPSTS bit 26 (SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT) to
// identify stochastic traps.
// If a stochastic trap is detected:
// - Set bit 21 in TTMP13 to indicate a stochastic trap.
// - Branch to the profile trap handler logic.
s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT
s_cbranch_scc0 .not_host_trap_gfx9
s_load_dwordx2 ttmp[14:15], ttmp[14:15], 0 glc // ttmp[14:15]=&host_trap_buffers
// TODO: When implementing stochastic sampling, need to set TTMP11_PCS_IS_STOCHASTIC
// or TTMP13_PCS_IS_STOCHASTIC to differentiate between hosttrap and stochastic sampling
// Host trap path: read the host-trap buffers pointer, record the trap type
// in ttmp13/ttmp11, then continue in the profile trap handlers.
s_load_dwordx2 ttmp[2:3], ttmp[14:15], 0 glc // ttmp[2:3]=host_trap_buffers pointer, read from offset 0 of ttmp[14:15]
.if .amdgcn.gfx_generation_minor >= 4
s_bitset0_b32 ttmp13, TTMP13_PCS_IS_STOCHASTIC
s_bitset1_b32 ttmp13, TTMP13_PCS_IS_HOSTTRAP // set bit 22 in TTMP13
.else
s_bitset1_b32 ttmp11, TTMP11_PCS_IS_HOSTTRAP // Set bit 22 in TTMP11
.endif
s_waitcnt lgkmcnt(0)
s_branch .profile_trap_handlers_gfx9 // Off to the profile handlers
s_mov_b64 ttmp[14:15], ttmp[2:3] //now ttmp[14:15] = host_trap_buffers
s_branch .profile_trap_handlers_gfx9 // Off to the profile handlers
.else
// Ignore host traps. They should be masked by the driver anyway.
s_branch .not_s_trap
.endif
.not_host_trap_gfx9:
.endif // PC_SAMPLING_GFX9
// If caused by s_trap then advance PC.
s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT
s_cbranch_scc1 .not_s_trap
.not_host_trap:
// It's an s_trap; advance the PC
s_add_u32 ttmp0, ttmp0, 0x4
s_addc_u32 ttmp1, ttmp1, 0x0
.not_s_trap:
// If llvm.debugtrap and debugger is not attached.
s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP
s_cbranch_scc0 .no_skip_debugtrap
@@ -272,6 +315,24 @@ trap_entry:
// Ignore llvm.debugtrap.
s_branch .exit_trap
.not_s_trap:
.if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4
// On gfx9.4+, test TRAPSTS for a stochastic sample trap; if the bit is
// clear, continue at .no_skip_debugtrap.
//Check for stochastic trap on gfx9.4+
s_getreg_b32 ttmp7, hwreg(HW_REG_TRAPSTS) // On gfx94x, TRAPSTS bit 26 ...
s_bitcmp1_b32 ttmp7, SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT // is stochastic_sample_trap
s_cbranch_scc0 .no_skip_debugtrap
// Handle stochastic trap
s_load_dwordx2 ttmp[2:3], ttmp[14:15], 0x8 glc // ttmp[2:3]=stochastic trap buffers pointer, read from offset 0x8 of ttmp[14:15]
s_bitset0_b32 ttmp13, TTMP13_PCS_IS_HOSTTRAP
s_bitset1_b32 ttmp13, TTMP13_PCS_IS_STOCHASTIC // set bit 21 (TTMP13_PCS_IS_STOCHASTIC) in TTMP13
s_waitcnt lgkmcnt(0)
s_mov_b64 ttmp[14:15], ttmp[2:3] // now ttmp[14:15] = stochastic trap buffers pointer
s_branch .profile_trap_handlers_gfx9 // Off to the profile handlers
.else
s_branch .no_skip_debugtrap
.endif // PC_SAMPLING_GFX9
.if (.amdgcn.gfx_generation_number == 9) // PC_SAMPLING_GFX9
// tma->host_trap_buffers Offsets:
// [0x00] uint64_t buf_write_val;
@@ -348,6 +409,26 @@ trap_entry:
s_addc_u32 ttmp5, ttmp15, ttmp5 // buffer0 or buffer1
s_mov_b32 ttmp7, ttmp2
// Select the sample-fill routine from the trap-type bit held in ttmp13
// (gfx9 minor >= 4) or ttmp11 (other gfx9).
.if .amdgcn.gfx_generation_number == 9
.if .amdgcn.gfx_generation_minor >= 4
// Check if it's a stochastic trap
s_bitcmp1_b32 ttmp13, TTMP13_PCS_IS_STOCHASTIC
s_cbranch_scc1 .fill_sample_stochastic
// Check if it's a host trap
s_bitcmp1_b32 ttmp13, TTMP13_PCS_IS_HOSTTRAP
s_cbranch_scc1 .fill_sample_hosttrap
.else
// Check if it's a host trap
s_bitcmp1_b32 ttmp11, TTMP11_PCS_IS_HOSTTRAP
s_cbranch_scc1 .fill_sample_hosttrap
.endif
.endif
// If neither bit is set, this is unexpected.
// This branch is not expected to be taken.
s_branch .no_skip_debugtrap
// ttmp7 contains local_entry, ttmp[4:5] contains "&bufferX",
// ttmp[14:15] holds 'tma->host_trap_buffers' pointer
// ttmp[2:3] and ttmp13 are available for gathering perf sample info
@@ -381,7 +462,7 @@ trap_entry:
// buf->timestamp = s_memrealtime;
// buf->correlation_id = get_correlation_id();
// }
.fill_sample_hosttrap:
s_mul_i32 ttmp2, ttmp7, 0x40 // offset into buffer for 64B objects
s_mul_hi_u32 ttmp3, ttmp7, 0x40 // ttmp[2:3] will contain byte ...
s_add_u32 ttmp2, ttmp2, ttmp4
@@ -401,19 +482,56 @@ trap_entry:
.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4)
s_getreg_b32 ttmp4, hwreg(HW_REG_XCC_ID) //store XCC_ID
s_lshl_b32 ttmp4, ttmp4, 8
s_and_b32 ttmp5, ttmp11, 0x3f
s_and_b32 ttmp5, ttmp11, TTMP11_WAVE_IN_WG_MASK
s_or_b32 ttmp4, ttmp4, ttmp5
s_store_dword ttmp4, ttmp[2:3], 0x1c // store wave_in_wg
.else
s_and_b32 ttmp4, ttmp11, 0x3f
s_store_dword ttmp4, ttmp[2:3], 0x1c // store wave_in_wg
.endif
// Get HW_ID using S_GETREG_B32 with size=32 (F8 in upper bits), offset=0, and HW_ID = 4 (0x4)
s_getreg_b32 ttmp4, hwreg(HW_REG_HW_ID)
s_store_dword ttmp4, ttmp[2:3], 0x20 // store HW_ID
// ttmp[2:3] = &buffer[local_entry]; ttmp[4:5], ttmp7, and ttmp13 are free
// ttmp[14:15] = tma->host_trap_buffers and is live out; ttmp6.b31 is buf_to_use, 0 or 1
s_branch .get_correlation_id
.if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4
.fill_sample_stochastic:
// Fill one 64-byte stochastic sample record at &buffer[local_entry].
// Offsets written below:
//   0x00 snapshot PC (SQ_PERF_SNAPSHOT_PC_LO/HI)
//   0x08 EXEC_LO, 0x0c EXEC_HI
//   0x10 wg_id_x and wg_id_y, 0x18 wg_id_z
//   0x1c (XCC_ID << 8) | wave_in_wg
//   0x20 HW_ID
//   0x24 snapshot data (SQ_PERF_SNAPSHOT_DATA/DATA1)
//   0x30 timestamp
s_mul_i32 ttmp2, ttmp7, 0x40 // offset into buffer for 64B objects
s_mul_hi_u32 ttmp3, ttmp7, 0x40
s_add_u32 ttmp2, ttmp2, ttmp4
s_addc_u32 ttmp3, ttmp3, ttmp5 // ttmp[2:3]=&buffer[local_entry]
s_memrealtime ttmp[4:5]
s_waitcnt lgkmcnt(0) // Wait for timestamp
s_store_dwordx2 ttmp[4:5], ttmp[2:3] 0x30 // Store timestamp
s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA)
s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA1)
s_store_dwordx2 ttmp[4:5], ttmp[2:3], 0x24 // store snapshot data
s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_LO)
s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_HI)
s_store_dwordx2 ttmp[4:5], ttmp[2:3] 0x00 // store snapshot PC
s_mov_b32 ttmp6, exec_lo
s_store_dword ttmp6, ttmp[2:3], 0x8 // store EXEC_LO
s_mov_b32 ttmp6, exec_hi
s_store_dword ttmp6, ttmp[2:3], 0xc // store EXEC_HI
s_store_dwordx2 ttmp[8:9], ttmp[2:3], 0x10 // store wg_id_x and wg_id_y
s_store_dword ttmp10, ttmp[2:3], 0x18 // store wg_id_z
s_getreg_b32 ttmp4, hwreg(HW_REG_XCC_ID)
s_lshl_b32 ttmp4, ttmp4, 8
s_and_b32 ttmp5, ttmp11, TTMP11_WAVE_IN_WG_MASK
s_or_b32 ttmp4, ttmp4, ttmp5
s_store_dword ttmp4, ttmp[2:3], 0x1c // store chiplet_and_wave_id
s_getreg_b32 ttmp4, hwreg(HW_REG_HW_ID)
s_store_dword ttmp4, ttmp[2:3], 0x20 // store HW_ID
// ttmp[2:3]=&buffer[local_entry]; ttmp[4:5], ttmp[6:7] are free
// ttmp[14:15]=ptr to tma and is live out; ttmp11.b31 is buf_to_use, 0 or 1
s_branch .get_correlation_id
.endif
.get_correlation_id:
// get_correlation_id() -- begin //
// Returns a value to use as a correlation ID.
@@ -437,6 +555,7 @@ trap_entry:
// ttmp[4:5], ttmp7, and ttmp13 are free
// ttmp[14:15] = tma->host_trap_buffers and is live out
// ttmp6.b31 is buf_to_use, 0 or 1 and is live out
s_mov_b64 ttmp[4:5], exec // back up EXEC mask
s_mov_b32 exec_lo, 0x80000000 // prepare EXEC for doorbell spin
s_sendmsg sendmsg(MSG_GET_DOORBELL) // message 10, puts doorbell in EXEC
@@ -519,7 +638,6 @@ trap_entry:
s_getreg_b32 ttmp3, hwreg(HW_REG_MODE, SQ_WAVE_MODE_EXCP_EN_SHIFT, SQ_WAVE_MODE_EXCP_EN_SIZE) // ttmp3[7:0] = MODE.EXCP_EN
// Set bits corresponding to TRAPSTS.MEM_VIOL, TRAPSTS.ILLEGAL_INST and TRAPSTS.XNACK_ERROR
s_or_b32 ttmp3, ttmp3, (1 << SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT | 1 << SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT | 1 << SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT)
s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS)
s_and_b32 ttmp2, ttmp2, ttmp3
// SCC will be 1 if either a maskable instruction was set, or one of MEM_VIOL, ILL_INST, XNACK_ERROR
s_cbranch_scc1 .no_skip_debugtrap // if any of those are set, handle exceptions
@@ -539,6 +657,7 @@ trap_entry:
.no_skip_debugtrap:
// Save trap id and halt status in ttmp6.
s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK)
s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE
s_min_u32 ttmp2, ttmp2, 0xF
s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT
s_or_b32 ttmp6, ttmp6, ttmp2