PC Sampling: Allocate resources to retrieve data from trap handler

Allocate required device and host buffers to be able to interact with
the 2nd level trap handler.

Change-Id: If99de5aacf956ca57ecafc7b04b797be9c9decaa
Этот коммит содержится в:
David Yat Sin
2023-08-17 15:04:58 +00:00
родитель 431a70471e
Коммит 8d666dea01
3 изменённых файлов: 184 добавлений и 3 удалений
+8
Просмотреть файл
@@ -46,6 +46,7 @@
#define HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_
#include <vector>
#include <list>
#include <map>
#include "hsakmt/hsakmt.h"
@@ -720,6 +721,13 @@ class GpuAgent : public GpuAgentInt {
} pcs_hosttrap_sampling_data_t;
typedef struct {
/* Hosttrap data - stored on device so that trap_handler code can access efficiently */
pcs_hosttrap_sampling_data_t* device_data;
/* Hosttrap host buffer - stored on host */
uint8_t* host_buffer;
size_t host_buffer_size;
os::Thread thread;
pcs::PcsRuntime::PcSamplingSession* session;
} pcs_hosttrap_t;
+164 -3
Просмотреть файл
@@ -2375,12 +2375,13 @@ hsa_status_t GpuAgent::PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configura
}
hsa_status_t GpuAgent::PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session) {
hsa_status_t ret;
HsaPcSamplingInfo sampleInfo = {};
HsaPcSamplingTraceId thunkId;
// IOCTL id does not exist at the moment, so passing 0 is OK,
// since it will be overridden later in this function.
hsa_status_t ret = PcSamplingCreateFromId(0, session);
ret = PcSamplingCreateFromId(0, session);
if (ret != HSA_STATUS_SUCCESS) return ret;
session.GetHsaKmtSamplingInfo(&sampleInfo);
@@ -2401,8 +2402,141 @@ hsa_status_t GpuAgent::PcSamplingCreateFromId(HsaPcSamplingTraceId ioctlId,
pcs::PcsRuntime::PcSamplingSession& session) {
pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1 && ht_data.session)
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
// TODO: For now can only have 1 hosttrap session at a time. As a final solution, we want to be
// able to support multiple sessions at a time. But this makes the session->HandleSampleData
// more complicated if multiple sessions have different buffer sizes.
if (ht_data.session) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
// Local copy of hosttrap data - we cannot access device memory directly on non-large BAR
// systems
pcs_hosttrap_sampling_data_t* device_datahost =
(pcs_hosttrap_sampling_data_t*)system_allocator()(sizeof(*device_datahost), 0x1000, 0);
if (!device_datahost) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
MAKE_SCOPE_GUARD([&]() { system_deallocator()(device_datahost); });
memset(device_datahost, 0, sizeof(*device_datahost));
if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, device_datahost) !=
HSA_STATUS_SUCCESS)
return HSA_STATUS_ERROR;
MAKE_NAMED_SCOPE_GUARD(freeHostTrapResources, [&]() {
if (ht_data.device_data) {
if (ht_data.device_data->done_sig0.handle)
HSA::hsa_signal_destroy(ht_data.device_data->done_sig0);
if (ht_data.device_data->done_sig1.handle)
HSA::hsa_signal_destroy(ht_data.device_data->done_sig1);
finegrain_deallocator()(ht_data.device_data);
}
if (ht_data.host_buffer) system_deallocator()(ht_data.host_buffer);
});
// Force creating of PC Sampling queue to trigger exception early in case we exceed max availble
// CP queues on this agent
queues_[QueuePCSampling].touch();
/*
* When calling queue->ExecutePM4() Indirect Buffer size which is 0x1000 bytes (1024 DW).
* The maximum indirect buffer size we need occurs when we enqueue the
* WAIT_REG_MEM, DMA_COPY(s), WRITE_DATA ops:
* For WAIT_REG_MEM = 7 DW
* For each DMA_COPY = 7 DW
* For WRITE_DATA_CMD = 6 DW
*
* So maximum number of DMA_COPY ops is:
* (MAX_IB_SIZE - sizeof(WAIT_REG_MEM) - sizeof(WRITE_DATA_CMD)) / sizeof(DMA_COPY)
* (1024 - 7 - 6) / 7 = 144
*
* Each DMA_COPY op can transfer (1 << 26) bytes, which is 9 GB. trap_buffer_size is a 32-bit
* number, so the buffer must be < 4 GB. So we are not limited by Indirect Buffer size.
* Set current limit to 256 MB to limit device VRAM usage
*/
const size_t max_trap_buffer_size =
core::Runtime::runtime_singleton_->flag().pc_sampling_max_device_buffer_size();
/*
* We use a double-buffer mechanism where there are 2 trap-buffers and 1 host-buffer
* Warning: This currently assumes that client latency is smaller than time to fill 1
* trap-buffer If latency is bigger, we have to increate host-buffer
*
* host-buffer must be >= client-buffer so that we can copy full size of client-buffer each
* time. To avoid having to deal with wrap-arounds, host-buffer must be a multiple of
* trap-buffers
*
* if client-buffer size is greater than 2x max_trap_buffer_size:
* We are limited by max_trap_buffer_size.
* trap-buffer = max-trap-buffer-size
* host-buffer = 2*smallest size greater than client-buffer but multiple of 1 trap-buffer
* else:
* We reduce the trap-buffers so that:
* trap-buffer = half of user-buffer
* host-buffer = 2*user-buffer
*
* TODO: We are currently using a temporary host-buffer so that we can increase host-buffer to
* factor in client latency. Using a direct-copy to the client buffer would be more efficient.
* Revisit this once we have empirical data of latency vs how long it takes to fill 1
* trap-buffer.
*/
size_t trap_buffer_size = 0;
if (session.buffer_size() > 2 * max_trap_buffer_size) {
trap_buffer_size = max_trap_buffer_size;
ht_data.host_buffer_size = 2 * AlignUp(session.buffer_size(), trap_buffer_size);
} else {
trap_buffer_size = session.buffer_size() / 2;
ht_data.host_buffer_size = 2 * session.buffer_size();
}
ht_data.host_buffer = (uint8_t*)system_allocator()(ht_data.host_buffer_size, 0x1000, 0);
if (!ht_data.host_buffer) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, ht_data.host_buffer) !=
HSA_STATUS_SUCCESS)
return HSA_STATUS_ERROR;
device_datahost->buf_size = trap_buffer_size / session.sample_size();
if (HSA::hsa_signal_create(1, 0, NULL, &device_datahost->done_sig0) != HSA_STATUS_SUCCESS)
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
if (HSA::hsa_signal_create(1, 0, NULL, &device_datahost->done_sig1) != HSA_STATUS_SUCCESS)
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
// TODO: Once we have things working and can measure
// latency after 2nd level trap handler decrements signals and set watermark accordingly
device_datahost->buf_watermark0 = 0.8 * device_datahost->buf_size;
device_datahost->buf_watermark1 = 0.8 * device_datahost->buf_size;
// Allocate device memory for 2nd level trap handler TMA
size_t deviceAllocSize = sizeof(*ht_data.device_data) + (2 * trap_buffer_size);
ht_data.device_data = (pcs_hosttrap_sampling_data_t*)finegrain_allocator()(deviceAllocSize, 0);
if (ht_data.device_data == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
// This cpuAgent is the owner of the system_allocator() pool
auto cpuAgent = GetNearestCpuAgent()->public_handle();
hsa_status_t ret = AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, ht_data.device_data);
assert(ret == HSA_STATUS_SUCCESS);
if (DmaCopy(ht_data.device_data, device_datahost, sizeof(*device_datahost)) !=
HSA_STATUS_SUCCESS) {
debug_print("Failed to dmaCopy!\n");
return HSA_STATUS_ERROR;
}
uint8_t* device_buf_ptr =
((uint8_t*)ht_data.device_data) + sizeof(pcs_hosttrap_sampling_data_t);
if (DmaFill(device_buf_ptr, 0, deviceAllocSize - sizeof(pcs_hosttrap_sampling_data_t)) !=
HSA_STATUS_SUCCESS) {
debug_print("Failed to dmaFill!\n");
return HSA_STATUS_ERROR;
}
ht_data.session = &session;
freeHostTrapResources.Dismiss();
}
session.SetThunkId(ioctlId);
ht_data.session = &session;
@@ -2417,6 +2551,16 @@ hsa_status_t GpuAgent::PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& ses
HSAKMT_STATUS retKmt = hsaKmtPcSamplingDestroy(node_id(), session.ThunkId());
ht_data.session = NULL;
if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
HSA::hsa_signal_destroy(ht_data.device_data->done_sig0);
HSA::hsa_signal_destroy(ht_data.device_data->done_sig1);
finegrain_deallocator()(ht_data.device_data);
system_deallocator()(ht_data.host_buffer);
ht_data.device_data = NULL;
ht_data.host_buffer = NULL;
ht_data.session = NULL;
}
return (retKmt == HSAKMT_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}
@@ -2466,6 +2610,10 @@ hsa_status_t GpuAgent::PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& sessio
throw AMD::hsa_exception(HSA_STATUS_ERROR, "Failed to stop PC Sampling session.");
if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
// Wake up pcs_hosttrap_thread_ if it is waiting for data
HSA::hsa_signal_store_screlease(ht_data.device_data->done_sig0, -1);
HSA::hsa_signal_store_screlease(ht_data.device_data->done_sig1, -1);
os::WaitForThread(ht_data.thread);
os::CloseThread(ht_data.thread);
ht_data.thread = NULL;
@@ -2475,11 +2623,24 @@ hsa_status_t GpuAgent::PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& sessio
}
void GpuAgent::PcSamplingThread() {
uint64_t which_buffer = 0;
pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
hsa_signal_t done_sig[] = {ht_data.device_data->done_sig0, ht_data.device_data->done_sig1};
while (ht_data.session->isActive()) {
do {
hsa_signal_value_t val = HSA::hsa_signal_wait_scacquire(
done_sig[which_buffer], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
HSA_WAIT_STATE_ACTIVE); // TODO: change to HSA_WAIT_STATE_BLOCKED later
if (val == -1) goto thread_exit;
if (val == 0) break;
} while (true);
// Implement code to read data from 2nd level trap handler here
sleep(1);
}
thread_exit:
debug_print("PcSamplingThread::Exiting\n");
}
+12
Просмотреть файл
@@ -67,6 +67,7 @@ class Flag {
// Lift limit for 2.10 release RCCL workaround. This limit is not used when asynchronous scratch
// reclaim is supported
const size_t DEFAULT_SCRATCH_SINGLE_LIMIT = 146800640; // small_limit >> 2;
const size_t DEFAULT_PCS_MAX_DEVICE_BUFFER_SIZE = 256 * 1024 * 1024;
explicit Flag() { Refresh(); }
@@ -230,6 +231,13 @@ class Flag {
var = os::GetEnvVar("HSA_ENABLE_IPC_MODE_LEGACY");
enable_ipc_mode_legacy_ = (var == "1") ? true : false;
if (os::IsEnvVarSet("HSA_PCS_MAX_DEVICE_BUFFER_SIZE")) {
var = os::GetEnvVar("HSA_PCS_MAX_DEVICE_BUFFER_SIZE");
char* end;
pc_sampling_max_device_buffer_size_ = strtoul(var.c_str(), &end, 10);
} else {
pc_sampling_max_device_buffer_size_ = DEFAULT_PCS_MAX_DEVICE_BUFFER_SIZE;
}
// Temporary environment variable to disable CPU affinity override
// Will either rename to HSA_OVERRIDE_CPU_AFFINITY later or remove completely.
@@ -341,6 +349,8 @@ class Flag {
bool enable_ipc_mode_legacy() const { return enable_ipc_mode_legacy_; }
size_t pc_sampling_max_device_buffer_size() const { return pc_sampling_max_device_buffer_size_; }
private:
bool check_flat_scratch_;
bool enable_vm_fault_message_;
@@ -396,6 +406,8 @@ class Flag {
SRAMECC_ENABLE sramecc_enable_;
size_t pc_sampling_max_device_buffer_size_;
// Map GPU index post RVD to its default cu mask.
std::map<uint32_t, std::vector<uint32_t>> cu_mask_;