PC Sampling: Add create and destroy APIs

Implement PC Sampling session create and destroy APIs.

Change-Id: I93370d3d01b74ee15e71b8b0e20feb8f0066a3dc

Signed-off-by: David Yat Sin <David.YatSin@amd.com>
Signed-off-by: Vladimir Indic <Vladimir.Indic@amd.com>
Change-Id: Ib0c64356a1a4616b12d5dbeebe16273fe2a84abe


[ROCm/ROCR-Runtime commit: 632f9e60f7]
Dieser Commit ist enthalten in:
David Yat Sin
2023-09-23 15:48:46 +00:00
Ursprung 0a4415f202
Commit 566e2c60fd
7 geänderte Dateien mit 330 neuen und 2 gelöschten Zeilen
@@ -192,6 +192,13 @@ class GpuAgentInt : public core::Agent {
// @retval HSA_STATUS_SUCCESS if successful
virtual hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb,
void* cb_data) = 0;
// @brief Create a PC sampling session on this agent.
//
// @param [in,out] session Session object describing the requested sampling
// configuration.
//
// @retval HSA_STATUS_SUCCESS if successful
virtual hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session) = 0;
// @brief Create a PC sampling session bound to an already known trace id.
//
// @param [in] pcsId Trace id the session should be associated with.
// @param [in,out] session Session object describing the requested sampling
// configuration.
//
// @retval HSA_STATUS_SUCCESS if successful
virtual hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId,
pcs::PcsRuntime::PcSamplingSession& session) = 0;
// @brief Destroy a PC sampling session on this agent.
//
// @param [in,out] session Session to destroy.
//
// @retval HSA_STATUS_SUCCESS if successful
virtual hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) = 0;
};
class GpuAgent : public GpuAgentInt {
@@ -474,6 +481,10 @@ class GpuAgent : public GpuAgentInt {
hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb,
void* cb_data);
// @brief Register @p session with this agent, open a sampling trace through
// hsaKmtPcSamplingCreate and store the returned trace id in the session.
hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session);
// @brief Register @p session with this agent using the trace id @p pcsId.
// Returns HSA_STATUS_ERROR_OUT_OF_RESOURCES when a host-trap session is
// requested while another session is already registered on this agent.
hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId,
pcs::PcsRuntime::PcSamplingSession& session);
// @brief Destroy the trace identified by the session's thunk id through
// hsaKmtPcSamplingDestroy and clear this agent's registered session pointer.
hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session);
// @brief Node properties.
const HsaNodeProperties properties_;
@@ -677,6 +688,15 @@ class GpuAgent : public GpuAgentInt {
std::function<void*(size_t size, core::MemoryRegion::AllocateFlags flags)> finegrain_allocator_;
std::function<void(void*)> finegrain_deallocator_;
/* PC Sampling fields - begin */
// Per-agent PC sampling bookkeeping. `session` points at the session most
// recently registered through PcSamplingCreateFromId and is reset to NULL by
// PcSamplingDestroy. NOTE(review): the pointer refers to a session stored
// elsewhere; it does not appear to be owned by the agent - confirm.
typedef struct {
pcs::PcsRuntime::PcSamplingSession* session;
} pcs_hosttrap_t;
// Value-initialized in the GpuAgent constructor, so `session` starts out NULL.
pcs_hosttrap_t pcs_hosttrap_data_;
/* PC Sampling fields - end */
// @brief device handle
amdgpu_device_handle ldrm_dev_;
@@ -113,7 +113,8 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
sdma_blit_used_mask_(0),
scratch_limit_async_threshold_(0),
scratch_cache_(
[this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }) {
[this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }),
pcs_hosttrap_data_() {
const bool is_apu_node = (properties_.NumCPUCores > 0);
profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE;
@@ -2362,6 +2363,49 @@ hsa_status_t GpuAgent::PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configura
return HSA_STATUS_SUCCESS;
}
// @brief Create a new PC sampling session on this agent.
//
// Registers @p session with the agent, asks the thunk to open a sampling trace
// with the session's configuration and stores the resulting trace id in the
// session.
//
// @param [in,out] session Session to create; receives the thunk trace id.
//
// @retval HSA_STATUS_SUCCESS if successful
// @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if the session could not be
// registered with this agent (see PcSamplingCreateFromId)
// @retval HSA_STATUS_ERROR_RESOURCE_BUSY if the thunk reports
// HSAKMT_STATUS_KERNEL_ALREADY_OPENED
// @retval HSA_STATUS_ERROR for any other thunk failure
hsa_status_t GpuAgent::PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session) {
  // IOCTL id does not exist at the moment, so passing 0 is OK,
  // since it will be overridden later in this function.
  hsa_status_t ret = PcSamplingCreateFromId(0, session);
  if (ret != HSA_STATUS_SUCCESS) return ret;

  HsaPcSamplingInfo sampleInfo = {};
  session.GetHsaKmtSamplingInfo(&sampleInfo);

  HsaPcSamplingTraceId thunkId = 0;
  HSAKMT_STATUS retkmt = hsaKmtPcSamplingCreate(node_id(), &sampleInfo, &thunkId);
  if (retkmt != HSAKMT_STATUS_SUCCESS) {
    // Roll back the registration performed by PcSamplingCreateFromId. The
    // caller discards the session object on failure, so leaving the pointer
    // in place would both dangle and make later host-trap creates fail with
    // HSA_STATUS_ERROR_OUT_OF_RESOURCES.
    if (pcs_hosttrap_data_.session == &session) pcs_hosttrap_data_.session = NULL;

    return (retkmt == HSAKMT_STATUS_KERNEL_ALREADY_OPENED)
        ? static_cast<hsa_status_t>(HSA_STATUS_ERROR_RESOURCE_BUSY)
        : HSA_STATUS_ERROR;
  }

  debug_print("Created PC sampling session with thunkId:%d\n", thunkId);
  session.SetThunkId(thunkId);
  return ret;
}
// @brief Register @p session with this agent under the trace id @p ioctlId.
//
// A host-trap session is refused while another session pointer is already
// registered on the agent. Otherwise the trace id is written into the session
// and the session becomes the agent's registered session.
//
// @retval HSA_STATUS_SUCCESS if the session was registered
// @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if a host-trap session was
// requested while a session is already registered
hsa_status_t GpuAgent::PcSamplingCreateFromId(HsaPcSamplingTraceId ioctlId,
                                              pcs::PcsRuntime::PcSamplingSession& session) {
  const bool wants_hosttrap = (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1);
  if (wants_hosttrap && pcs_hosttrap_data_.session) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

  session.SetThunkId(ioctlId);
  pcs_hosttrap_data_.session = &session;
  return HSA_STATUS_SUCCESS;
}
// @brief Destroy the thunk trace belonging to @p session.
//
// The agent's registered session pointer is cleared regardless of whether the
// thunk call succeeds.
//
// @retval HSA_STATUS_SUCCESS if the thunk destroyed the trace
// @retval HSA_STATUS_ERROR otherwise
hsa_status_t GpuAgent::PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) {
  const HSAKMT_STATUS kmt_status = hsaKmtPcSamplingDestroy(node_id(), session.ThunkId());
  pcs_hosttrap_data_.session = NULL;

  if (kmt_status != HSAKMT_STATUS_SUCCESS) return HSA_STATUS_ERROR;
  return HSA_STATUS_SUCCESS;
}
} // namespace amd
} // namespace rocr
@@ -253,6 +253,9 @@ global:
hsa_tools_scratch_event_async_reclaim_start;
hsa_tools_scratch_event_async_reclaim_end;
hsa_ven_amd_pcs_iterate_configuration;
hsa_ven_amd_pcs_create;
hsa_ven_amd_pcs_create_from_id;
hsa_ven_amd_pcs_destroy;
local:
*;
@@ -94,10 +94,50 @@ hsa_status_t hsa_ven_amd_pcs_iterate_configuration(
CATCH;
}
// API entry point: create a PC sampling session on a GPU agent.
// Returns HSA_STATUS_ERROR_INVALID_AGENT unless the handle converts to a valid
// AMD GPU agent; otherwise forwards every argument to PcsRuntime.
hsa_status_t hsa_ven_amd_pcs_create(hsa_agent_t hsa_agent, hsa_ven_amd_pcs_method_kind_t method,
                                    hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency,
                                    size_t buffer_size,
                                    hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb,
                                    void* client_cb_data, hsa_ven_amd_pcs_t* handle) {
  TRY;
  IS_OPEN();

  core::Agent* const target = core::Agent::Convert(hsa_agent);
  const bool is_gpu_agent = target != NULL && target->IsValid() &&
      target->device_type() == core::Agent::kAmdGpuDevice;
  if (!is_gpu_agent) {
    return HSA_STATUS_ERROR_INVALID_AGENT;
  }

  return PcsRuntime::instance()->PcSamplingCreate(target, method, units, interval, latency,
                                                  buffer_size, data_ready_cb, client_cb_data,
                                                  handle);
  CATCH;
}
// API entry point: create a PC sampling session bound to the trace id pcs_id.
// Returns HSA_STATUS_ERROR_INVALID_AGENT unless the handle converts to a valid
// AMD GPU agent; otherwise forwards every argument to PcsRuntime.
hsa_status_t hsa_ven_amd_pcs_create_from_id(uint32_t pcs_id, hsa_agent_t hsa_agent,
                                            hsa_ven_amd_pcs_method_kind_t method,
                                            hsa_ven_amd_pcs_units_t units, size_t interval,
                                            size_t latency, size_t buffer_size,
                                            hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb,
                                            void* client_cb_data, hsa_ven_amd_pcs_t* handle) {
  TRY;
  IS_OPEN();

  core::Agent* const target = core::Agent::Convert(hsa_agent);
  const bool is_gpu_agent = target != NULL && target->IsValid() &&
      target->device_type() == core::Agent::kAmdGpuDevice;
  if (!is_gpu_agent) {
    return HSA_STATUS_ERROR_INVALID_AGENT;
  }

  return PcsRuntime::instance()->PcSamplingCreateFromId(pcs_id, target, method, units, interval,
                                                        latency, buffer_size, data_ready_cb,
                                                        client_cb_data, handle);
  CATCH;
}
// API entry point: destroy the PC sampling session identified by handle.
// Forwards to PcsRuntime::PcSamplingDestroy.
hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t handle) {
  TRY;
  // Apply the same runtime-open check as the create entry points so that a
  // destroy issued outside an open runtime is rejected consistently instead of
  // reaching the PcsRuntime singleton.
  IS_OPEN();
  return PcsRuntime::instance()->PcSamplingDestroy(handle);
  CATCH;
}
void LoadPcSampling(core::PcSamplingExtTableInternal* pcs_api) {
pcs_api->hsa_ven_amd_pcs_iterate_configuration_fn = hsa_ven_amd_pcs_iterate_configuration;
pcs_api->hsa_ven_amd_pcs_create_fn = hsa_ven_amd_pcs_create;
pcs_api->hsa_ven_amd_pcs_create_from_id_fn = hsa_ven_amd_pcs_create_from_id;
pcs_api->hsa_ven_amd_pcs_destroy_fn = hsa_ven_amd_pcs_destroy;
pcs_api->hsa_ven_amd_pcs_start_fn = hsa_ven_amd_pcs_start;
pcs_api->hsa_ven_amd_pcs_stop_fn = hsa_ven_amd_pcs_stop;
@@ -59,6 +59,20 @@ hsa_status_t hsa_ven_amd_pcs_iterate_configuration(
hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback,
void* callback_data);
// Create a PC sampling session on a GPU agent.
// The implementation returns HSA_STATUS_ERROR_INVALID_AGENT when `agent` is
// not a valid AMD GPU agent, and HSA_STATUS_ERROR_INVALID_ARGUMENT when
// `pc_sampling` or `data_ready_callback` is NULL or the configuration is
// rejected (unknown method, zero interval, or a buffer_size that is zero or
// not a multiple of twice the per-sample record size).
// On success `pc_sampling` receives the handle of the new session.
hsa_status_t hsa_ven_amd_pcs_create(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method,
hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency,
size_t buffer_size,
hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback,
void* client_callback_data, hsa_ven_amd_pcs_t* pc_sampling);
// Same as hsa_ven_amd_pcs_create, except that the session is associated with
// the caller-supplied trace id `pcs_id` instead of a newly created trace.
// NOTE(review): presumably `pcs_id` identifies a trace created elsewhere -
// confirm against the thunk documentation.
hsa_status_t hsa_ven_amd_pcs_create_from_id(
uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method,
hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size,
hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data,
hsa_ven_amd_pcs_t* pc_sampling);
// Destroy the PC sampling session identified by `pc_sampling`.
// The implementation returns HSA_STATUS_ERROR_INVALID_ARGUMENT when the handle
// does not match a known session.
hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t pc_sampling);
// Update Api table with func pointers that implement functionality
void LoadPcSampling(core::PcSamplingExtTableInternal* pcs_api);
@@ -52,6 +52,11 @@
namespace rocr {
namespace pcs {
// Argument guard: makes the *enclosing function* return
// HSA_STATUS_ERROR_INVALID_ARGUMENT when `ptr` compares equal to NULL.
// Wrapped in do { } while (false) so that it behaves as a single statement.
#define IS_BAD_PTR(ptr) \
do { \
if ((ptr) == NULL) return HSA_STATUS_ERROR_INVALID_ARGUMENT; \
} while (false)
std::atomic<PcsRuntime*> PcsRuntime::instance_(NULL);
std::mutex PcsRuntime::instance_mutex_;
@@ -94,6 +99,66 @@ void PcsRuntime::DestroySingleton() {
}
// Releases PC sampling resources by destroying the PcsRuntime singleton.
void ReleasePcSamplingRsrcs() {
  PcsRuntime::DestroySingleton();
}
/// @brief Build a session from client supplied configuration.
///
/// The session is marked invalid (see isValid()) when @p method is not a known
/// sampling method, when @p interval or @p buffer_size is zero, or when
/// @p buffer_size is not a multiple of twice the per-sample record size.
/// The stored client configuration is value-initialized up front so that the
/// accessors never read indeterminate data, even on an invalid session.
///
/// @param [in] _agent Agent the session belongs to.
/// @param [in] method Sampling method; selects the per-sample record size.
/// @param [in] units Units in which @p interval is expressed.
/// @param [in] interval Sampling interval, must be non-zero.
/// @param [in] latency Client supplied latency value, stored as is.
/// @param [in] buffer_size Client buffer size in bytes, see constraints above.
/// @param [in] data_ready_callback Client callback, stored as is.
/// @param [in] client_callback_data Opaque client pointer, stored as is.
PcsRuntime::PcSamplingSession::PcSamplingSession(
    core::Agent* _agent, hsa_ven_amd_pcs_method_kind_t method, hsa_ven_amd_pcs_units_t units,
    size_t interval, size_t latency, size_t buffer_size,
    hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data)
    : agent(_agent), thunkId_(0), valid_(true), sample_size_(0), csd() {
  switch (method) {
    case HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1:
      sample_size_ = sizeof(perf_sample_hosttrap_v1_t);
      break;
    case HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1:
      sample_size_ = sizeof(perf_sample_snapshot_v1_t);
      break;
    default:
      valid_ = false;
      return;
  }

  // The buffer has to hold a whole number of sample pairs.
  if (!interval || !buffer_size || (buffer_size % (2 * sample_size_))) {
    valid_ = false;
    return;
  }

  csd.method = method;
  csd.units = units;
  csd.interval = interval;
  csd.latency = latency;
  csd.buffer_size = buffer_size;
  csd.data_ready_callback = data_ready_callback;
  csd.client_callback_data = client_callback_data;
}
/// @brief Translate the stored client configuration into the thunk's
/// HsaPcSamplingInfo representation.
///
/// value_min, value_max and flags are cleared and value is set to the session
/// interval. method and units are only written when the stored values match
/// one of the known enumerators; otherwise they are left untouched.
///
/// @param [out] sampleInfo Structure to fill in.
void PcsRuntime::PcSamplingSession::GetHsaKmtSamplingInfo(HsaPcSamplingInfo* sampleInfo) {
  HsaPcSamplingInfo& info = *sampleInfo;

  info.value = csd.interval;
  info.value_min = 0;
  info.value_max = 0;
  info.flags = 0;

  if (csd.method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
    info.method = HSA_PC_SAMPLING_METHOD_KIND_HOSTTRAP_V1;
  } else if (csd.method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
    info.method = HSA_PC_SAMPLING_METHOD_KIND_STOCHASTIC_V1;
  }

  if (csd.units == HSA_VEN_AMD_PCS_INTERVAL_UNITS_MICRO_SECONDS) {
    info.units = HSA_PC_SAMPLING_UNIT_INTERVAL_MICROSECONDS;
  } else if (csd.units == HSA_VEN_AMD_PCS_INTERVAL_UNITS_CLOCK_CYCLES) {
    info.units = HSA_PC_SAMPLING_UNIT_INTERVAL_CYCLES;
  } else if (csd.units == HSA_VEN_AMD_PCS_INTERVAL_UNITS_INSTRUCTIONS) {
    info.units = HSA_PC_SAMPLING_UNIT_INTERVAL_INSTRUCTIONS;
  }
}
hsa_status_t PcsRuntime::PcSamplingIterateConfig(
core::Agent* agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback,
void* callback_data) {
@@ -101,6 +166,80 @@ hsa_status_t PcsRuntime::PcSamplingIterateConfig(
return gpu_agent->PcSamplingIterateConfig(configuration_callback, callback_data);
}
/// @brief Create a PC sampling session backed by a newly created trace.
///
/// Rejects NULL @p handle or @p data_ready_cb with
/// HSA_STATUS_ERROR_INVALID_ARGUMENT, then delegates to
/// PcSamplingCreateInternal with a hook that calls the GPU agent's
/// PcSamplingCreate.
hsa_status_t PcsRuntime::PcSamplingCreate(core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method,
                                          hsa_ven_amd_pcs_units_t units, size_t interval,
                                          size_t latency, size_t buffer_size,
                                          hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb,
                                          void* client_cb_data, hsa_ven_amd_pcs_t* handle) {
  IS_BAD_PTR(handle);
  IS_BAD_PTR(data_ready_cb);

  // Agent-side step executed once the session object exists.
  const agent_pcs_create_fn_t create_on_agent =
      [](core::Agent* target, PcSamplingSession& new_session) -> hsa_status_t {
    AMD::GpuAgentInt* gpu_agent = static_cast<AMD::GpuAgentInt*>(target);
    return gpu_agent->PcSamplingCreate(new_session);
  };

  return PcSamplingCreateInternal(agent, method, units, interval, latency, buffer_size,
                                  data_ready_cb, client_cb_data, handle, create_on_agent);
}
/// @brief Create a PC sampling session bound to the trace id @p ioctl_pcs_id.
///
/// Rejects NULL @p handle or @p data_ready_cb with
/// HSA_STATUS_ERROR_INVALID_ARGUMENT, then delegates to
/// PcSamplingCreateInternal with a hook that calls the GPU agent's
/// PcSamplingCreateFromId using the supplied id.
hsa_status_t PcsRuntime::PcSamplingCreateFromId(uint32_t ioctl_pcs_id, core::Agent* agent,
                                                hsa_ven_amd_pcs_method_kind_t method,
                                                hsa_ven_amd_pcs_units_t units, size_t interval,
                                                size_t latency, size_t buffer_size,
                                                hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb,
                                                void* client_cb_data, hsa_ven_amd_pcs_t* handle) {
  IS_BAD_PTR(handle);
  IS_BAD_PTR(data_ready_cb);

  // Agent-side step executed once the session object exists.
  const agent_pcs_create_fn_t attach_on_agent =
      [ioctl_pcs_id](core::Agent* target, PcSamplingSession& new_session) -> hsa_status_t {
    AMD::GpuAgentInt* gpu_agent = static_cast<AMD::GpuAgentInt*>(target);
    return gpu_agent->PcSamplingCreateFromId(ioctl_pcs_id, new_session);
  };

  return PcSamplingCreateInternal(agent, method, units, interval, latency, buffer_size,
                                  data_ready_cb, client_cb_data, handle, attach_on_agent);
}
/// @brief Shared implementation of the two create entry points.
///
/// Under pc_sampling_lock_: assigns the next session id to @p handle,
/// constructs the session in place inside pc_sampling_, checks that its
/// configuration is valid and finally runs @p agent_pcs_create_fn on it.
/// The map entry is removed again when validation or the agent step fails.
///
/// @retval HSA_STATUS_SUCCESS if the session was created
/// @retval HSA_STATUS_ERROR_INVALID_ARGUMENT if the configuration is invalid
/// @retval other status as returned by @p agent_pcs_create_fn
hsa_status_t PcsRuntime::PcSamplingCreateInternal(
    core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method, hsa_ven_amd_pcs_units_t units,
    size_t interval, size_t latency, size_t buffer_size,
    hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, void* client_cb_data,
    hsa_ven_amd_pcs_t* handle, agent_pcs_create_fn_t agent_pcs_create_fn) {
  ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);

  handle->handle = ++pc_sampling_id_;

  // Construct the PcSamplingSession directly inside the map, keyed by the new
  // handle value, and keep the iterator to the stored entry.
  auto emplaced = pc_sampling_.emplace(
      std::piecewise_construct, std::forward_as_tuple(handle->handle),
      std::forward_as_tuple(agent, method, units, interval, latency, buffer_size, data_ready_cb,
                            client_cb_data));
  auto session_it = emplaced.first;

  if (!session_it->second.isValid()) {
    pc_sampling_.erase(session_it);
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  const hsa_status_t status = agent_pcs_create_fn(agent, session_it->second);
  if (status == HSA_STATUS_SUCCESS) return HSA_STATUS_SUCCESS;

  pc_sampling_.erase(session_it);
  return status;
}
/// @brief Destroy the session identified by @p handle.
///
/// Under pc_sampling_lock_: looks the session up, asks its GPU agent to
/// destroy it and removes it from pc_sampling_ whatever the agent returns.
///
/// @retval HSA_STATUS_ERROR_INVALID_ARGUMENT if the handle is unknown
/// @retval otherwise the status returned by the agent's PcSamplingDestroy
hsa_status_t PcsRuntime::PcSamplingDestroy(hsa_ven_amd_pcs_t handle) {
  ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);

  auto entry = pc_sampling_.find(reinterpret_cast<uint64_t>(handle.handle));
  if (entry == pc_sampling_.end()) {
    debug_warning(false && "Cannot find PcSampling session");
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  PcSamplingSession& doomed = entry->second;
  AMD::GpuAgentInt* owner = static_cast<AMD::GpuAgentInt*>(doomed.agent);
  const hsa_status_t status = owner->PcSamplingDestroy(doomed);

  pc_sampling_.erase(entry);
  return status;
}
} // namespace pcs
} // namespace rocr
@@ -59,7 +59,7 @@ namespace pcs {
class PcsRuntime {
public:
PcsRuntime() {}
PcsRuntime() : pc_sampling_id_(0) {}
~PcsRuntime() {}
/// @brief Getter for the PcsRuntime singleton object.
@@ -68,10 +68,65 @@ class PcsRuntime {
/// @brief Destroy singleton object.
static void DestroySingleton();
/// @brief State of one PC sampling session: the owning agent, the thunk trace
/// id and the configuration supplied by the client at creation time.
class PcSamplingSession {
 public:
  /// @brief Construct an empty, invalid session with every member in a
  /// defined state (no agent, trace id 0, zeroed configuration).
  PcSamplingSession() : agent(NULL), thunkId_(0), valid_(false), sample_size_(0), csd() {}

  /// @brief Construct a session from client configuration; the result is
  /// marked invalid when the configuration is rejected (see isValid()).
  PcSamplingSession(core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method,
                    hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency,
                    size_t buffer_size, hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback,
                    void* client_callback_data);
  ~PcSamplingSession() {}

  /// @brief Whether the configuration parameters were accepted.
  bool isValid() const { return valid_; }
  /// @brief Client supplied buffer size.
  size_t buffer_size() const { return csd.buffer_size; }
  /// @brief Client supplied sampling method.
  hsa_ven_amd_pcs_method_kind_t method() const { return csd.method; }
  /// @brief Client supplied latency.
  size_t latency() const { return csd.latency; }
  /// @brief Size of one sample record for the session's method.
  size_t sample_size() const { return sample_size_; }

  /// @brief Fill @p sampleInfo with the thunk representation of this
  /// session's configuration.
  void GetHsaKmtSamplingInfo(HsaPcSamplingInfo* sampleInfo);

  /// Agent this session belongs to.
  core::Agent* agent;

  /// @brief Record the thunk trace id associated with this session.
  void SetThunkId(HsaPcSamplingTraceId thunkId) { thunkId_ = thunkId; }
  /// @brief Thunk trace id associated with this session.
  HsaPcSamplingTraceId ThunkId() const { return thunkId_; }

 private:
  HsaPcSamplingTraceId thunkId_;
  bool valid_;  // Whether configuration parameters are valid
  size_t sample_size_;

  // Configuration exactly as supplied by the client at creation time.
  struct client_session_data_t {
    hsa_ven_amd_pcs_method_kind_t method;
    hsa_ven_amd_pcs_units_t units;
    size_t interval;
    size_t latency;
    size_t buffer_size;
    hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback;
    void* client_callback_data;
  };
  struct client_session_data_t csd;
};  // class PcSamplingSession
hsa_status_t PcSamplingIterateConfig(
core::Agent* agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback,
void* callback_data);
/// @brief Create a PC sampling session on @p agent backed by a new trace.
/// Returns HSA_STATUS_ERROR_INVALID_ARGUMENT when @p handle or
/// @p data_ready_cb is NULL or the configuration is rejected; on success
/// @p handle receives the id of the new session.
hsa_status_t PcSamplingCreate(core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method,
hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency,
size_t buffer_size,
hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb,
void* client_cb_data, hsa_ven_amd_pcs_t* handle);
/// @brief Same as PcSamplingCreate, but the session is associated with the
/// caller supplied trace id @p ioctl_pcs_id.
hsa_status_t PcSamplingCreateFromId(uint32_t ioctl_pcs_id, core::Agent* agent,
hsa_ven_amd_pcs_method_kind_t method,
hsa_ven_amd_pcs_units_t units, size_t interval,
size_t latency, size_t buffer_size,
hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb,
void* client_cb_data, hsa_ven_amd_pcs_t* handle);
/// @brief Destroy the session identified by @p handle and remove it from the
/// session map. Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for an unknown
/// handle.
hsa_status_t PcSamplingDestroy(hsa_ven_amd_pcs_t handle);
private:
/// @brief Initialize singleton object, must be called once.
static PcsRuntime* CreateSingleton();
@@ -80,7 +135,20 @@ class PcsRuntime {
static std::atomic<PcsRuntime*> instance_;
static std::mutex instance_mutex_;
// Map of pc sampling sessions indexed by hsa_ven_amd_pcs_t handle
std::map<uint64_t, PcSamplingSession> pc_sampling_;
KernelMutex pc_sampling_lock_;
uint64_t pc_sampling_id_;
DISALLOW_COPY_AND_ASSIGN(PcsRuntime);
/// Agent-side creation hook: receives the agent and the newly constructed
/// session and returns the status of the agent-specific create step.
using agent_pcs_create_fn_t = std::function<hsa_status_t(core::Agent*, PcSamplingSession&)>;
/// @brief Common implementation behind PcSamplingCreate and
/// PcSamplingCreateFromId: allocates the next session id, stores a new
/// session in pc_sampling_, validates it and runs @p agent_pcs_create_fn.
/// The stored session is removed again if validation or the hook fails.
hsa_status_t PcSamplingCreateInternal(core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method,
hsa_ven_amd_pcs_units_t units, size_t interval,
size_t latency, size_t buffer_size,
hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb,
void* client_cb_data, hsa_ven_amd_pcs_t* handle,
agent_pcs_create_fn_t agent_pcs_create_fn);
};
} // namespace pcs