constructor/destructor changes, OnLoad/OnUnload functions, removed hsa_init()/hsa_shut_down()

[ROCm/rocprofiler commit: a9a5119399]
This commit is contained in:
Evgeny
2018-02-21 10:07:59 -06:00
förälder 1621e666da
incheckning 6eef0003a5
11 ändrade filer med 97 tillägg och 58 borttagningar
+8 -2
Visa fil
@@ -329,8 +329,14 @@ class Context {
const profile_vector_t profile_vector = GetProfiles(group_index);
for (auto& tuple : profile_vector) {
// Wait for stop packet to complete
hsa_signal_wait_scacquire(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
HSA_WAIT_STATE_BLOCKED);
const uint64_t timeout = UINT64_MAX;
bool complete = false;
while (!complete) {
const hsa_signal_value_t signal_value = hsa_signal_wait_scacquire(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, timeout,
HSA_WAIT_STATE_BLOCKED);
complete = (signal_value == 0);
if (!complete) printf("ROCProfiler: Signal timeout, signal(%d) timeout(%lx)\n", (int)signal_value, timeout);
}
for (rocprofiler_feature_t* rinfo : *(tuple.info_vector)) rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT;
callback_data_t callback_data{tuple.info_vector, tuple.info_vector->size(), NULL};
const hsa_status_t status =
@@ -9,7 +9,6 @@ void InterceptQueue::HsaIntercept(HsaApiTable* table) {
InterceptQueue::mutex_t InterceptQueue::mutex_;
rocprofiler_callback_t InterceptQueue::on_dispatch_cb_ = NULL;
void* InterceptQueue::on_dispatch_cb_data_ = NULL;
const char* InterceptQueue::tool_lib_ = NULL;
void* InterceptQueue::tool_handle_ = NULL;
InterceptQueue::obj_map_t* InterceptQueue::obj_map_ = NULL;
const char* InterceptQueue::kernel_none_ = "";
} // namespace rocprofiler
+7 -23
Visa fil
@@ -26,30 +26,13 @@ class InterceptQueue {
static void HsaIntercept(HsaApiTable* table);
static void SetTool(const char* tool) { tool_lib_ = tool; }
static void UnloadTool() {
if (tool_handle_) dlclose(tool_handle_);
}
static hsa_status_t QueueCreate(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
void (*callback)(hsa_status_t status, hsa_queue_t* source,
void* data),
void* data, uint32_t private_segment_size,
uint32_t group_segment_size, hsa_queue_t** queue) {
std::lock_guard<mutex_t> lck(mutex_);
hsa_status_t status = HSA_STATUS_ERROR;
if (tool_lib_) {
tool_handle_ = dlopen(tool_lib_, RTLD_NOW);
if (tool_handle_ == NULL) {
fprintf(stderr, "ROCProfiler: can't load tool library \"%s\"\n", tool_lib_);
fprintf(stderr, "%s\n", dlerror());
exit(1);
}
tool_lib_ = NULL;
}
std::lock_guard<mutex_t> lck(mutex_);
if (!obj_map_) obj_map_ = new obj_map_t;
@@ -152,7 +135,7 @@ class InterceptQueue {
return (*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask;
}
static char* GetKernelName(const hsa_kernel_dispatch_packet_t* dispatch_packet) {
static const char* GetKernelName(const hsa_kernel_dispatch_packet_t* dispatch_packet) {
const amd_kernel_code_t* kernel_code = NULL;
hsa_status_t status =
util::HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address(
@@ -167,13 +150,15 @@ class InterceptQueue {
// Kernel name is mangled name
// apply __cxa_demangle() to demangle it
char* funcname = NULL;
const char* funcname = NULL;
if (kernel_name != NULL) {
size_t funcnamesize = 0;
int status;
char* ret = abi::__cxa_demangle(kernel_name, NULL, &funcnamesize, &status);
const char* ret = abi::__cxa_demangle(kernel_name, NULL, &funcnamesize, &status);
funcname = (ret != 0) ? ret : strdup(kernel_name);
}
if (funcname == NULL) funcname = strdup(kernel_none_);
return funcname;
}
@@ -181,9 +166,8 @@ class InterceptQueue {
static const packet_word_t header_type_mask = (1ul << HSA_PACKET_HEADER_WIDTH_TYPE) - 1;
static rocprofiler_callback_t on_dispatch_cb_;
static void* on_dispatch_cb_data_;
static const char* tool_lib_;
static void* tool_handle_;
static obj_map_t* obj_map_;
static const char* kernel_none_;
ProxyQueue* const proxy_;
const util::AgentInfo* agent_info_;
+48 -5
Visa fil
@@ -99,14 +99,53 @@ void RestoreHsaApi() {
#endif
}
// Signature shared by the tool library entry points OnLoadTool() and OnUnloadTool()
typedef void (*tool_handler_t)();
// dlopen() handle of the currently loaded tool library, NULL when no tool is loaded
void * kTtoolHandle = NULL;

// Returns the pending dlerror() message, or a placeholder when none is pending,
// so that a NULL pointer is never passed to a "%s" conversion
static const char* ToolLoaderError() {
  const char* message = dlerror();
  return (message != NULL) ? message : "unknown error";
}

// Loads the tool library 'tool_lib' and calls its OnLoadTool() entry point.
// Does nothing when 'tool_lib' is NULL. The library has to export both
// OnLoadTool() and OnUnloadTool(); on any failure a diagnostic is printed
// to stderr and the process exits with status 1.
void LoadTool(const char* tool_lib) {
  if (tool_lib == NULL) return;

  kTtoolHandle = dlopen(tool_lib, RTLD_NOW);
  if (kTtoolHandle == NULL) {
    fprintf(stderr, "ROCProfiler: can't load tool library \"%s\"\n", tool_lib);
    fprintf(stderr, "%s\n", ToolLoaderError());
    exit(1);
  }

  tool_handler_t handler = reinterpret_cast<tool_handler_t>(dlsym(kTtoolHandle, "OnLoadTool"));
  if (handler == NULL) {
    fprintf(stderr, "ROCProfiler: tool library corrupted, OnLoadTool() method is expected\n");
    fprintf(stderr, "%s\n", ToolLoaderError());
    exit(1);
  }

  // Verify up front that the unload entry point exists, before the tool is started
  tool_handler_t on_unload_handler = reinterpret_cast<tool_handler_t>(dlsym(kTtoolHandle, "OnUnloadTool"));
  if (on_unload_handler == NULL) {
    fprintf(stderr, "ROCProfiler: tool library corrupted, OnUnloadTool() method is expected\n");
    fprintf(stderr, "%s\n", ToolLoaderError());
    exit(1);
  }

  handler();
}

// Calls the OnUnloadTool() entry point of the loaded tool library and closes
// the library. Does nothing when no tool is loaded. If the entry point cannot
// be resolved, a diagnostic is printed to stderr and the process exits with
// status 1.
void UnloadTool() {
  if (kTtoolHandle == NULL) return;

  tool_handler_t handler = reinterpret_cast<tool_handler_t>(dlsym(kTtoolHandle, "OnUnloadTool"));
  if (handler == NULL) {
    fprintf(stderr, "ROCProfiler error: tool library corrupted, OnUnloadTool() method is expected\n");
    fprintf(stderr, "%s\n", ToolLoaderError());
    exit(1);
  }

  handler();
  dlclose(kTtoolHandle);
  // Forget the handle so a repeated UnloadTool() call cannot touch a closed library
  kTtoolHandle = NULL;
}
// Library initialization hook: creates the logger first and then the
// HSA resource factory.
// NOTE(review): CONSTRUCTOR_API is defined outside this view — presumably it
// marks the function to run at library load time; confirm.
CONSTRUCTOR_API void constructor() {
util::Logger::Create();
util::HsaRsrcFactory::Create();
}
// Library teardown hook: destroys the metrics dictionary and the HSA resource
// factory, and destroys the logger last.
// NOTE(review): MetricsDict::Destroy() is listed both before and after
// HsaRsrcFactory::Destroy() — this looks like the old and the new position of
// the same call shown together; confirm which single call is intended.
DESTRUCTOR_API void destructor() {
rocprofiler::MetricsDict::Destroy();
util::HsaRsrcFactory::Destroy();
rocprofiler::MetricsDict::Destroy();
util::Logger::Destroy();
}
@@ -139,19 +178,23 @@ extern "C" {
// HSA-runtime tool on-load method
PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count,
const char* const* failed_tool_names) {
const bool intercept_mode = (getenv("ROCP_HSA_INTERCEPT") != NULL);
rocprofiler::SaveHsaApi(table);
rocprofiler::ProxyQueue::InitFactory();
rocprofiler::InterceptQueue::SetTool(getenv("ROCP_TOOL_LIB"));
// HSA intercepting
if (getenv("ROCP_HSA_INTERCEPT") != NULL) {
if (intercept_mode) {
rocprofiler::InterceptQueue::HsaIntercept(table);
rocprofiler::ProxyQueue::HsaIntercept(table);
}
rocprofiler::LoadTool(getenv("ROCP_TOOL_LIB"));
return true;
}
// HSA-runtime tool on-unload method
PUBLIC_API void OnUnload() { rocprofiler::RestoreHsaApi(); }
PUBLIC_API void OnUnload() {
rocprofiler::UnloadTool();
rocprofiler::RestoreHsaApi();
}
// Returns library version
PUBLIC_API uint32_t rocprofiler_version_major() { return ROCPROFILER_VERSION_MAJOR; }
@@ -141,7 +141,10 @@ class SimpleProxyQueue : public ProxyQueue {
queue_mask_(0),
submit_index_(0),
on_submit_cb_(0),
on_submit_cb_data_(0) {}
on_submit_cb_data_(0)
{
printf("ROCProfiler: SimpleProxyQueue is enabled\n");
}
~SimpleProxyQueue() {}
@@ -81,10 +81,13 @@ hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* d
// Constructor of the class
HsaRsrcFactory::HsaRsrcFactory() {
hsa_status_t status;
#if 0
// Initialize the Hsa Runtime
hsa_status_t status = hsa_init();
printf("ROCProfiler: HSA init\n");
status = hsa_init();
CHECK_STATUS("Error in hsa_init", status);
#endif
// Discover the set of Gpu devices available on the platform
status = hsa_iterate_agents(GetHsaAgentsCallback, this);
CHECK_STATUS("Error Calling hsa_iterate_agents", status);
@@ -108,9 +111,11 @@ HsaRsrcFactory::HsaRsrcFactory() {
HsaRsrcFactory::~HsaRsrcFactory() {
for (auto p : cpu_list_) delete p;
for (auto p : gpu_list_) delete p;
#if 0
printf("ROCProfiler: HSA shutdown\n");
hsa_status_t status = hsa_shut_down();
CHECK_STATUS("Error in hsa_shut_down", status);
#endif
}
hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) {
@@ -114,9 +114,7 @@ class HsaRsrcFactory {
public:
typedef std::recursive_mutex mutex_t;
static HsaRsrcFactory* Create() { return NULL; }
static HsaRsrcFactory* CreateInstance() {
static HsaRsrcFactory* Create() {
std::lock_guard<mutex_t> lck(mutex_);
if (instance_ == NULL) {
instance_ = new HsaRsrcFactory();
@@ -125,9 +123,9 @@ class HsaRsrcFactory {
}
static HsaRsrcFactory& Instance() {
CreateInstance();
if (instance_ == NULL) instance_ = Create();
hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
CHECK_STATUS("HsaRsrcFactory::Instance() is not found", status);
CHECK_STATUS("HsaRsrcFactory::Instance() failed", status);
return *instance_;
}
+1 -1
Visa fil
@@ -13,6 +13,6 @@ int main(int argc, char** argv) {
const int diter = (diter_s != NULL) ? atol(diter_s) : 1;
TestHsa::HsaInstantiate();
for (int i = 0; i < kiter; ++i) RunKernel<SimpleConvolution, TestAql>(argc, argv, diter);
//TestHsa::HsaShutdown();
TestHsa::HsaShutdown();
return 0;
}
+4 -1
Visa fil
@@ -69,7 +69,10 @@ HsaRsrcFactory* TestHsa::HsaInstantiate(const uint32_t agent_ind) {
}
void TestHsa::HsaShutdown() {
if (hsa_queue_ != NULL) hsa_queue_destroy(hsa_queue_);
if (hsa_queue_ != NULL) {
hsa_queue_destroy(hsa_queue_);
hsa_queue_ = NULL;
}
if (hsa_rsrc_) hsa_rsrc_->Destroy();
}
+8 -10
Visa fil
@@ -338,7 +338,7 @@ static hsa_status_t info_callback(const rocprofiler_info_data_t info, void * arg
}
// Tool constructor
CONSTRUCTOR_API void constructor()
extern "C" PUBLIC_API void OnLoadTool()
{
std::map<std::string, hsa_ven_amd_aqlprofile_parameter_name_t> parameters_dict;
parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET"] =
@@ -487,23 +487,21 @@ CONSTRUCTOR_API void constructor()
}
// Tool destructor
DESTRUCTOR_API void destructor() {
const bool result_file_opened = (result_prefix != NULL) && (result_file_handle != NULL);
extern "C" PUBLIC_API void OnUnloadTool() {
// Unregister dispatch callback
rocprofiler_remove_dispatch_callback();
// Dump stored profiling output data
const bool result_file_opened = (result_prefix != NULL) && (result_file_handle != NULL);
printf("\nROCPRofiler: %u contexts collected", context_count);
if (result_file_opened) printf(", output directory %s", result_prefix);
printf("\n");
// Dump stored profiling output data
dump_context_array();
if (result_file_opened) fclose(result_file_handle);
// Unregister dispatch callback and free callback data
rocprofiler_remove_dispatch_callback();
// Cleanup
if (dispatch_data != NULL) {
delete[] dispatch_data->features;
delete dispatch_data;
}
// Close output file
if (result_file_opened) fclose(result_file_handle);
}
+5 -5
Visa fil
@@ -213,14 +213,14 @@
# VALUBusy The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal).
<metric
name="VALUBusy"
expr=100*SQ_ACTIVE_INST_VALU*4/NUM_SIMDS/GRBM_GUI_ACTIVE
expr=100*SQ_ACTIVE_INST_VALU*4/SIMD_NUM/GRBM_GUI_ACTIVE
descr="The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."
></metric>
# SALUBusy The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal).
<metric
name="SALUBusy"
expr=100*SQ_INST_CYCLES_SALU*4/NUM_SIMDS/GRBM_GUI_ACTIVE
expr=100*SQ_INST_CYCLES_SALU*4/SIMD_NUM/GRBM_GUI_ACTIVE
descr="The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."
></metric>
@@ -248,14 +248,14 @@
# MemUnitBusy The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound).
<metric
name="MemUnitBusy"
expr=100*max(TA_BUSY,16)/GRBM_GUI_ACTIVE/NUM_SHADER_ENGINES
expr=100*max(TA_BUSY,16)/GRBM_GUI_ACTIVE/SE_NUM
descr="The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound)."
></metric>
# MemUnitStalled The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad).
<metric
name="MemUnitStalled"
expr=100*max(TCP_TA_DATA_STALL_CYCLES,16)/GRBM_GUI_ACTIVE/NUM_SHADER_ENGINES
expr=100*max(TCP_TA_DATA_STALL_CYCLES,16)/GRBM_GUI_ACTIVE/SE_NUM
descr="The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad)."
></metric>
@@ -276,7 +276,7 @@
# LDSBankConflict The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad).
<metric
name="LDSBankConflict"
expr=100*SQ_LDS_BANK_CONFLICT/GRBM_GUI_ACTIVE/NUM_SIMDS
expr=100*SQ_LDS_BANK_CONFLICT/GRBM_GUI_ACTIVE/CU_NUM
descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."
></metric>