[ROCm/rocprofiler commit: 1c8f767da4]
Этот коммит содержится в:
Evgeny
2018-02-27 13:32:11 -06:00
родитель ff5b92b47b
Коммит c1b3fa7d52
15 изменённых файлов: 370 добавлений и 267 удалений
+1
Просмотреть файл
@@ -215,6 +215,7 @@ hsa_status_t rocprofiler_reset(rocprofiler_t* context, // [in] profiling contex
// Profiling callback data
typedef struct {
hsa_agent_t agent;
uint32_t agent_index;
const hsa_queue_t* queue;
uint64_t queue_index;
uint64_t kernel_object;
+1 -1
Просмотреть файл
@@ -335,7 +335,7 @@ class Context {
const hsa_signal_value_t signal_value = hsa_signal_wait_scacquire(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, timeout,
HSA_WAIT_STATE_BLOCKED);
complete = (signal_value == 0);
if (!complete) printf("ROCProfiler: Signal timeout, signal(%d) timeout(%lx)\n", (int)signal_value, timeout);
if (!complete) printf("ROCProfiler: Signal timeout, signal(%d) timeout(0x%lx)\n", (int)signal_value, timeout);
}
for (rocprofiler_feature_t* rinfo : *(tuple.info_vector)) rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT;
callback_data_t callback_data{tuple.info_vector, tuple.info_vector->size(), NULL};
+1
Просмотреть файл
@@ -87,6 +87,7 @@ class InterceptQueue {
reinterpret_cast<const hsa_kernel_dispatch_packet_t*>(packet);
const char* kernel_name = GetKernelName(dispatch_packet);
rocprofiler_callback_data_t data = {obj->agent_info_->dev_id,
obj->agent_info_->dev_index,
obj->queue_,
user_que_idx,
dispatch_packet->kernel_object,
+1 -1
Просмотреть файл
@@ -174,7 +174,7 @@ const MetricsDict* GetMetrics(const hsa_agent_t& agent) {
util::Logger::mutex_t util::Logger::mutex_;
util::Logger* util::Logger::instance_ = NULL;
uint64_t Context::timeout_ = 1000;
uint64_t Context::timeout_ = UINT64_MAX;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
+31 -44
Просмотреть файл
@@ -25,6 +25,7 @@ POSSIBILITY OF SUCH DAMAGE.
#include "util/hsa_rsrc_factory.h"
#include <dlfcn.h>
#include <fcntl.h>
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <hsa_ext_finalize.h>
@@ -32,6 +33,8 @@ POSSIBILITY OF SUCH DAMAGE.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <atomic>
#include <cassert>
@@ -80,14 +83,13 @@ hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* d
}
// Constructor of the class
HsaRsrcFactory::HsaRsrcFactory() {
HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) {
hsa_status_t status;
#if 0
// Initialize the Hsa Runtime
printf("ROCProfiler: HSA init\n");
status = hsa_init();
CHECK_STATUS("Error in hsa_init", status);
#endif
if (initialize_hsa_) {
status = hsa_init();
CHECK_STATUS("Error in hsa_init", status);
}
// Discover the set of Gpu devices available on the platform
status = hsa_iterate_agents(GetHsaAgentsCallback, this);
CHECK_STATUS("Error Calling hsa_iterate_agents", status);
@@ -111,11 +113,10 @@ HsaRsrcFactory::HsaRsrcFactory() {
// Destructor of the class.
// Deletes every element stored in cpu_list_ and gpu_list_, then shuts the
// HSA runtime down only when initialize_hsa_ is set.
HsaRsrcFactory::~HsaRsrcFactory() {
for (auto p : cpu_list_) delete p;
for (auto p : gpu_list_) delete p;
// Compiled-out unconditional shutdown path
#if 0
printf("ROCProfiler: HSA shutdown\n");
hsa_status_t status = hsa_shut_down();
CHECK_STATUS("Error in hsa_shut_down", status);
#endif
// Pair hsa_shut_down() with the hsa_init() issued by the constructor under the same flag
if (initialize_hsa_) {
hsa_status_t status = hsa_shut_down();
CHECK_STATUS("Error in hsa_shut_down", status);
}
}
hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) {
@@ -371,67 +372,53 @@ bool HsaRsrcFactory::TransferData(void* dest_buff, void* src_buff, uint32_t leng
//
// @return bool true if successful, false otherwise
//
void* HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
const char* kernel_name, hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc) {
// Finalize the Hsail object into code object
hsa_status_t status;
hsa_code_object_t code_object;
bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
const char* kernel_name, hsa_executable_t* executable, hsa_executable_symbol_t* code_desc) {
hsa_status_t status = HSA_STATUS_ERROR;
// Build the code object filename
std::string filename(brig_path);
std::clog << "Code object filename: " << filename << std::endl;
// Open the file containing code object
std::ifstream codeStream(filename.c_str(), std::ios::binary | std::ios::ate);
if (!codeStream) {
std::cerr << "Error: failed to load " << filename << std::endl;
hsa_file_t file_handle = open(filename.c_str(), O_RDONLY);
if (file_handle == -1) {
std::cerr << "Error: failed to load '" << filename << "'" << std::endl;
assert(false);
return NULL;
return false;
}
// Allocate memory to read in code object from file
size_t size = std::string::size_type(codeStream.tellg());
char* code_buf = (char*)AllocateSysMemory(agent_info, size);
if (!code_buf) {
std::cerr << "Error: failed to allocate memory for code object." << std::endl;
assert(false);
return NULL;
}
// Read the code object into allocated memory
codeStream.seekg(0, std::ios::beg);
std::copy(std::istreambuf_iterator<char>(codeStream), std::istreambuf_iterator<char>(), code_buf);
// De-Serialize the code object that has been read into memory
status = hsa_code_object_deserialize(code_buf, size, NULL, &code_object);
// Create code object reader
hsa_code_object_reader_t code_obj_rdr = {0};
status = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr);
if (status != HSA_STATUS_SUCCESS) {
std::cerr << "Failed to deserialize code object" << std::endl;
if (code_buf) hsa_memory_free(code_buf);
return NULL;
std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl;
return false;
}
// Create executable.
status =
hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", hsa_exec);
status = hsa_executable_create_alt(HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable);
CHECK_STATUS("Error in creating executable object", status);
// Load code object.
status = hsa_executable_load_code_object(*hsa_exec, agent_info->dev_id, code_object, "");
status = hsa_executable_load_agent_code_object(*executable, agent_info->dev_id,
code_obj_rdr, NULL, NULL);
CHECK_STATUS("Error in loading executable object", status);
// Freeze executable.
status = hsa_executable_freeze(*hsa_exec, "");
status = hsa_executable_freeze(*executable, "");
CHECK_STATUS("Error in freezing executable object", status);
// Get symbol handle.
hsa_executable_symbol_t kernelSymbol;
status = hsa_executable_get_symbol(*hsa_exec, NULL, kernel_name, agent_info->dev_id, 0,
status = hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0,
&kernelSymbol);
CHECK_STATUS("Error in looking up kernel symbol", status);
// Update output parameter
*code_desc = kernelSymbol;
return code_buf;
return true;
}
// Print the various fields of Hsa Gpu Agents
+9 -6
Просмотреть файл
@@ -114,16 +114,16 @@ class HsaRsrcFactory {
public:
typedef std::recursive_mutex mutex_t;
static HsaRsrcFactory* Create() {
static HsaRsrcFactory* Create(bool initialize_hsa = true) {
std::lock_guard<mutex_t> lck(mutex_);
if (instance_ == NULL) {
instance_ = new HsaRsrcFactory();
instance_ = new HsaRsrcFactory(initialize_hsa);
}
return instance_;
}
static HsaRsrcFactory& Instance() {
if (instance_ == NULL) instance_ = Create();
if (instance_ == NULL) instance_ = Create(false);
hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
CHECK_STATUS("HsaRsrcFactory::Instance() failed", status);
return *instance_;
@@ -229,9 +229,9 @@ class HsaRsrcFactory {
// @param code_desc Handle of finalized Code Descriptor that could
// be used to submit for execution
//
// @return code buffer, non NULL if successful, NULL otherwise
// @return true if successful, false otherwise
//
void* LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc);
// Print the various fields of Hsa Gpu Agents
@@ -259,11 +259,14 @@ class HsaRsrcFactory {
// Constructor of the class. Will initialize the Hsa Runtime and
// query the system topology to get the list of Cpu and Gpu devices
HsaRsrcFactory();
HsaRsrcFactory(bool initialize_hsa);
// Destructor of the class
~HsaRsrcFactory();
// True when this factory initialized the HSA runtime itself and therefore must shut it down
const bool initialize_hsa_;
// Add an instance of AgentInfo representing a Hsa Gpu agent
const AgentInfo* AddAgentInfo(const hsa_agent_t agent);
+1 -1
Просмотреть файл
@@ -89,7 +89,7 @@ class Xml {
AddExpr(full_tag, name, oss.str());
}
nodes_t GetNodes(std::string global_tag) { return (*map_)[global_tag]; }
nodes_t GetNodes(const std::string& global_tag) { return (*map_)[global_tag]; }
template <class F>
F ForEach(const F& f_i) {
+3 -5
Просмотреть файл
@@ -43,7 +43,7 @@ HsaRsrcFactory* TestHsa::HsaInstantiate(const uint32_t agent_ind) {
if (hsa_rsrc_ == NULL) {
agent_id_ = agent_ind;
hsa_rsrc_ = HsaRsrcFactory::CreateInstance();
hsa_rsrc_ = HsaRsrcFactory::Create();
// Print properties of the agents
hsa_rsrc_->PrintGpuAgents("> GPU agents");
@@ -125,9 +125,8 @@ bool TestHsa::Setup() {
// Load and Finalize Kernel Code Descriptor
char* brig_path = (char*)brig_path_obj_.c_str();
code_buf_ =
hsa_rsrc_->LoadAndFinalize(agent_info_, brig_path, name_.c_str(), &hsa_exec_, &kernel_code_desc_);
if (code_buf_ == NULL) {
bool suc = hsa_rsrc_->LoadAndFinalize(agent_info_, brig_path, name_.c_str(), &hsa_exec_, &kernel_code_desc_);
if (suc == false) {
std::cerr << "Error in loading and finalizing Kernel" << std::endl;
return false;
}
@@ -241,7 +240,6 @@ void TestHsa::PrintTime() {
bool TestHsa::Cleanup() {
hsa_executable_destroy(hsa_exec_);
hsa_memory_free(code_buf_);
hsa_signal_destroy(hsa_signal_);
return true;
}
+1 -3
Просмотреть файл
@@ -47,7 +47,6 @@ class TestHsa : public TestAql {
total_time_taken_ = 0;
setup_time_taken_ = 0;
dispatch_time_taken_ = 0;
code_buf_ = NULL;
hsa_exec_ = {};
}
@@ -123,8 +122,7 @@ class TestHsa : public TestAql {
// Test kernel name
std::string name_;
// Kernel code buffer
void* code_buf_;
// Kernel executable
hsa_executable_t hsa_exec_;
};
+121 -26
Просмотреть файл
@@ -32,6 +32,9 @@ struct callbacks_data_t {
unsigned feature_count;
unsigned group_index;
FILE* file_handle;
std::vector<uint32_t>* gpu_index;
std::vector<std::string>* kernel_string;
std::vector<uint32_t>* range;
};
// Context stored entry type
@@ -54,10 +57,18 @@ typedef std::map<uint32_t, context_entry_t> context_array_t;
context_array_t* context_array = NULL;
// Contexts collected count
uint32_t context_count = 0;
uint32_t context_collected = 0;
// Profiling results output file name
const char* result_prefix = NULL;
// Global results file handle
FILE* result_file_handle = NULL;
// Dispatch filters
// GPU index filter
std::vector<uint32_t>* gpu_index_vec = NULL;
// Kernel name filter
std::vector<std::string>* kernel_string_vec = NULL;
// Dispatch number range filter
std::vector<uint32_t>* range_vec = NULL;
// Check returned HSA API status
void check_status(hsa_status_t status) {
@@ -69,6 +80,20 @@ void check_status(hsa_status_t status) {
}
}
// Reserve the next context number.
// While holding the global 'mutex', returns the current value of
// 'context_count' and advances the counter by one.
// Terminates the process if the mutex cannot be locked or unlocked.
uint32_t next_context_count() {
  const int lock_err = pthread_mutex_lock(&mutex);
  if (lock_err != 0) {
    perror("pthread_mutex_lock");
    exit(1);
  }

  // Hand out the pre-increment value
  const uint32_t reserved = context_count++;

  const int unlock_err = pthread_mutex_unlock(&mutex);
  if (unlock_err != 0) {
    perror("pthread_mutex_unlock");
    exit(1);
  }

  return reserved;
}
// Allocate entry to store profiling context
context_entry_t* alloc_context_entry() {
if (pthread_mutex_lock(&mutex) != 0) {
@@ -83,7 +108,6 @@ context_entry_t* alloc_context_entry() {
fprintf(stderr, "context_array corruption, index repeated %u\n", index);
abort();
}
++context_count;
if (pthread_mutex_unlock(&mutex) != 0) {
perror("pthread_mutex_unlock");
@@ -91,7 +115,6 @@ context_entry_t* alloc_context_entry() {
}
context_entry_t* entry = &(ret.first->second);
entry->index = index;
return entry;
}
@@ -220,6 +243,7 @@ void dump_context(context_entry_t* entry) {
hsa_status_t status = HSA_STATUS_ERROR;
if (entry->valid) {
++context_collected;
entry->valid = 0;
const uint32_t index = entry->index;
FILE* file_handle = entry->file_handle;
@@ -285,10 +309,46 @@ void handler(rocprofiler_group_t group, void* arg) {
// Kernel dispatch callback
hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data,
rocprofiler_group_t* group) {
// HSA status
hsa_status_t status = HSA_STATUS_ERROR;
// Passed tool data
callbacks_data_t* tool_data = reinterpret_cast<callbacks_data_t*>(user_data);
// Checking dispatch condition
bool found = true;
std::vector<uint32_t>* range_ptr = tool_data->range;
if (found && range_ptr) {
found = false;
std::vector<uint32_t>& range = *range_ptr;
if (range.size() == 1) {
if (context_count >= range[0]) found = true;
} else if (range.size() == 2) {
if ((context_count >= range[0]) && (context_count < range[1])) found = true;
}
}
std::vector<uint32_t>* gpu_index = tool_data->gpu_index;
if (found && gpu_index) {
found = false;
for (uint32_t i : *gpu_index) {
if (i == callback_data->agent_index) {
found = true;
}
}
}
std::vector<std::string>* kernel_string = tool_data->kernel_string;
if (found && kernel_string) {
found = false;
for (const std::string& s : *kernel_string) {
if (std::string(callback_data->kernel_name).find(s) != std::string::npos) {
found = true;
}
}
}
if (found == false) {
next_context_count();
return HSA_STATUS_SUCCESS;
}
// HSA status
hsa_status_t status = HSA_STATUS_ERROR;
// Profiling context
rocprofiler_t* context = NULL;
// Context entry
@@ -320,6 +380,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data,
entry->data = *callback_data;
entry->data.kernel_name = strdup(callback_data->kernel_name);
entry->file_handle = tool_data->file_handle;
entry->index = next_context_count();
entry->valid = 1;
return status;
@@ -341,19 +402,48 @@ static hsa_status_t info_callback(const rocprofiler_info_data_t info, void * arg
return HSA_STATUS_SUCCESS;
}
// Reads a delimiter-separated list from an XML attribute and appends its items to 'vec'.
//
// The nodes registered under 'tag' are scanned from the last one towards the
// first, and only the first node met that carries the 'field' attribute is used.
// If no node has the attribute, or its value is empty, 'vec' is left untouched.
// A trailing delimiter does not produce an extra empty item.
//
// @param xml    parsed XML input
// @param tag    node tag to look up, e.g. "top.metric"
// @param field  name of the attribute holding the list
// @param delim  separator between items; may be longer than one character.
//               With an empty separator the whole value is returned as one item.
// @param vec    [out] receives the items, appended in their original order
// @param label  if not NULL, "<label><field> = <value>" is printed to stdout
void get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector<std::string>* vec, const char* label = NULL) {
  auto nodes = xml->GetNodes(tag);

  // Find the last node that defines the requested attribute
  auto rit = nodes.rbegin();
  auto rend = nodes.rend();
  while (rit != rend) {
    auto& opts = (*rit)->opts;
    if (opts.find(field) != opts.end()) break;
    ++rit;
  }
  if (rit == rend) return;

  const std::string array_string = (*rit)->opts[field];
  if (label != NULL) printf("%s%s = %s\n", label, field.c_str(), array_string.c_str());
  if (array_string.empty()) return;

  // An empty separator can not split anything; searching for it would match at
  // every position, so the value is taken as a single item
  if (delim.empty()) {
    vec->push_back(array_string);
    return;
  }

  size_t pos1 = 0;
  while (pos1 < array_string.length()) {
    const size_t pos2 = array_string.find(delim, pos1);
    // For the last item pos2 is npos and substr() takes the rest of the string
    vec->push_back(array_string.substr(pos1, pos2 - pos1));
    if (pos2 == std::string::npos) break;
    // Skip the whole separator, not just its first character
    pos1 = pos2 + delim.length();
  }
}
// Numeric variant of get_xml_array(): reads the same delimiter-separated
// attribute through the string overload and appends every item to 'vec'
// converted to uint32_t.
//
// Items are parsed as base-10 numbers with strtoul(), so values above INT_MAX
// are converted without the undefined behaviour atoi() has for them.
// An item that does not start with a number is converted to 0.
//
// @param xml    parsed XML input
// @param tag    node tag to look up
// @param field  name of the attribute holding the list
// @param delim  separator between items
// @param vec    [out] receives the converted items, appended in order
// @param label  passed through to the string overload for printing
void get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector<uint32_t>* vec, const char* label = NULL) {
  std::vector<std::string> str_vec;
  get_xml_array(xml, tag, field, delim, &str_vec, label);

  vec->reserve(vec->size() + str_vec.size());
  for (const std::string& str : str_vec) {
    vec->push_back(static_cast<uint32_t>(strtoul(str.c_str(), NULL, 10)));
  }
}
// Tool constructor
extern "C" PUBLIC_API void OnLoadTool()
{
std::map<std::string, hsa_ven_amd_aqlprofile_parameter_name_t> parameters_dict;
parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET"] =
parameters_dict["COMPUTE_UNIT_TARGET"] =
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET;
parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK"] =
parameters_dict["VM_ID_MASK"] =
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK;
parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK"] =
parameters_dict["MASK"] =
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK;
parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK"] =
parameters_dict["TOKEN_MASK"] =
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK;
parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2"] =
parameters_dict["TOKEN_MASK2"] =
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2;
char* info_symb = getenv("ROCP_INFO");
@@ -374,7 +464,7 @@ extern "C" PUBLIC_API void OnLoadTool()
DIR* dir = opendir(result_prefix);
if (dir == NULL) {
std::ostringstream errmsg;
errmsg << "Cannot open output directory '" << result_prefix << "'";
errmsg << "ROCProfiler: Cannot open output directory '" << result_prefix << "'";
perror(errmsg.str().c_str());
exit(1);
}
@@ -383,7 +473,7 @@ extern "C" PUBLIC_API void OnLoadTool()
result_file_handle = fopen(oss.str().c_str(), "w");
if (result_file_handle == NULL) {
std::ostringstream errmsg;
errmsg << "fopen error, file '" << oss.str().c_str() << "'";
errmsg << "ROCProfiler: fopen error, file '" << oss.str().c_str() << "'";
perror(errmsg.str().c_str());
exit(1);
}
@@ -399,24 +489,23 @@ extern "C" PUBLIC_API void OnLoadTool()
printf("ROCProfiler: input from \"%s\"\n", xml_name);
xml::Xml* xml = xml::Xml::Create(xml_name);
if (xml == NULL) {
fprintf(stderr, "Input file not found '%s'\n", xml_name);
fprintf(stderr, "ROCProfiler: Input file not found '%s'\n", xml_name);
exit(1);
}
// Getting metrics
auto metrics_list = xml->GetNodes("top.metric");
std::vector<std::string> metrics_vec;
for (auto* entry : metrics_list) {
const std::string entry_str = entry->opts["name"];
size_t pos1 = 0;
while (pos1 < entry_str.length()) {
const size_t pos2 = entry_str.find(",", pos1);
const std::string metric_name = entry_str.substr(pos1, pos2 - pos1);
metrics_vec.push_back(metric_name);
if (pos2 == std::string::npos) break;
pos1 = pos2 + 1;
}
}
get_xml_array(xml, "top.metric", "name", ",", &metrics_vec);
// Getting GPU indexes
gpu_index_vec = new std::vector<uint32_t>;
get_xml_array(xml, "top.metric", "gpu_index", ",", gpu_index_vec, " ");
// Getting kernel names
kernel_string_vec = new std::vector<std::string>;
get_xml_array(xml, "top.metric", "kernel", ",", kernel_string_vec, " ");
// Getting profiling range
range_vec = new std::vector<uint32_t>;
get_xml_array(xml, "top.metric", "range", ":", range_vec, " ");
// Getting traces
auto traces_list = xml->GetNodes("top.trace");
@@ -458,7 +547,7 @@ extern "C" PUBLIC_API void OnLoadTool()
for (auto& v : params->opts) {
const std::string parameter_name = v.first;
if (parameters_dict.find(parameter_name) == parameters_dict.end()) {
fprintf(stderr, "ROCProfiler: unknown trace parameter %s\n", parameter_name.c_str());
fprintf(stderr, "ROCProfiler: unknown trace parameter '%s'\n", parameter_name.c_str());
exit(1);
}
const uint32_t value = strtol(v.second.c_str(), NULL, 0);
@@ -489,6 +578,9 @@ extern "C" PUBLIC_API void OnLoadTool()
callbacks_data->feature_count = feature_count;
callbacks_data->group_index = 0;
callbacks_data->file_handle = result_file_handle;
callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec;
callbacks_data->kernel_string = (kernel_string_vec->empty()) ? NULL : kernel_string_vec;
callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;;
rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data);
}
@@ -503,7 +595,7 @@ extern "C" PUBLIC_API void OnUnloadTool() {
// Dump stored profiling output data
const bool result_file_opened = (result_prefix != NULL) && (result_file_handle != NULL);
printf("\nROCPRofiler: %u contexts collected", context_count);
printf("\nROCPRofiler: %u contexts collected", context_collected);
if (result_file_opened) printf(", output directory %s", result_prefix);
printf("\n");
dump_context_array();
@@ -514,4 +606,7 @@ extern "C" PUBLIC_API void OnUnloadTool() {
delete[] callbacks_data->features;
delete callbacks_data;
}
delete gpu_index_vec;
delete kernel_string_vec;
delete range_vec;
}
+93
Просмотреть файл
@@ -0,0 +1,93 @@
<gfx8>
<metric name=GRBM_COUNT block=GRBM event=0 ></metric>
<metric name=GRBM_GUI_ACTIVE block=GRBM event=2 ></metric>
<metric name=SQ_CYCLES block=SQ event=2 ></metric>
<metric name=SQ_WAVES block=SQ event=4 ></metric>
<metric name=SQ_ITEMS block=SQ event=14 ></metric>
<metric name=SQ_INSTS_VALU block=SQ event=26 ></metric>
<metric name=SQ_INSTS_VMEM_WR block=SQ event=27 ></metric>
<metric name=SQ_INSTS_VMEM_RD block=SQ event=28 ></metric>
<metric name=SQ_INSTS_SALU block=SQ event=30 ></metric>
<metric name=SQ_INSTS_SMEM block=SQ event=31 ></metric>
<metric name=SQ_INSTS_FLAT block=SQ event=32 ></metric>
<metric name=SQ_INSTS_FLAT_LDS_ONLY block=SQ event=33 ></metric>
<metric name=SQ_INSTS_LDS block=SQ event=34 ></metric>
<metric name=SQ_INSTS_GDS block=SQ event=35 ></metric>
<metric name=SQ_WAVE_READY block=SQ event=47 ></metric>
<metric name=SQ_WAIT_INST_LDS block=SQ event=61 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
<metric name=SQ_ACTIVE_INST_VALU block=SQ event=69 descr="Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic)"></metric>
<metric name=SQ_INST_CYCLES_SALU block=SQ event=86 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated)"></metric>
<metric name=SQ_THREAD_CYCLES_VALU block=SQ event=89 descr="Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)"></metric>
<metric name=SQ_THREAD_CYCLES_VALU_MAX block=SQ event=90 descr="Maximum number of thread-cycles VALU operations that could have been executed given the instruction mix (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd, emulated)"></metric>
<metric name=SQ_LDS_BANK_CONFLICT block=SQ event=97 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"></metric>
<metric name=TA_TA_BUSY block=TA event=15 ></metric>
<metric name=TA_FLAT_READ_WAVEFRONTS block=TA event=101 ></metric>
<metric name=TA_FLAT_WRITE_WAVEFRONTS block=TA event=102 ></metric>
<metric name=TCC_CYCLE block=TCC event=1 ></metric>
<metric name=TCC_REQ block=TCC event=3 ></metric>
<metric name=TCC_HIT block=TCC event=18 ></metric>
<metric name=TCC_MISS block=TCC event=19 ></metric>
<metric name=TCC_WRITEBACK block=TCC event=22 ></metric>
<metric name=TCC_EA_WRREQ block=TCC event=26 ></metric>
<metric name=TCC_EA_WRREQ_64B block=TCC event=27 ></metric>
<metric name=TCC_EA_WRREQ_STALL block=TCC event=30 ></metric>
<metric name=TCC_MC_RDREQ block=TCC event=35 ></metric>
<metric name="TCC_MC_WRREQ" block=TCC event=26 descr="Number of 32-byte transactions going over the TC_MC_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests."></metric>
<metric name="TCC_MC_WRREQ_STALL" block=TCC event=28 descr="Number of cycles a write request was stalled."></metric>
<metric name="TCP_TA_DATA_STALL_CYCLES" block=TCP event=3 descr="TCP stalls TA data interface. Now Windowed."></metric>
<metric name="TCP_TCP_TA_DATA_STALL_CYCLES" block=TCP event=3 descr="TCP stalls TA data interface. Not Windowed."></metric>
<metric name=CPC_ALWAYS_COUNT block=CPC event=0 ></metric>
<metric name=CPC_ME1_STALL_WAIT_ON_RCIU_READ block=CPC event=8 ></metric>
</gfx8>
<gfx9>
<metric name=GRBM_COUNT block=GRBM event=0 ></metric>
<metric name=GRBM_GUI_ACTIVE block=GRBM event=2 ></metric>
<metric name=SQ_CYCLES block=SQ event=2 ></metric>
<metric name=SQ_WAVES block=SQ event=4 ></metric>
<metric name=SQ_ITEMS block=SQ event=14 ></metric>
<metric name=SQ_INSTS_VALU block=SQ event=26 ></metric>
<metric name=SQ_INSTS_VMEM_WR block=SQ event=27 ></metric>
<metric name=SQ_INSTS_VMEM_RD block=SQ event=28 ></metric>
<metric name=SQ_INSTS_SALU block=SQ event=30 ></metric>
<metric name=SQ_INSTS_SMEM block=SQ event=31 ></metric>
<metric name=SQ_INSTS_FLAT block=SQ event=32 ></metric>
<metric name=SQ_INSTS_FLAT_LDS_ONLY block=SQ event=33 ></metric>
<metric name=SQ_INSTS_LDS block=SQ event=34 ></metric>
<metric name=SQ_INSTS_GDS block=SQ event=35 ></metric>
<metric name=SQ_WAVE_READY block=SQ event=47 ></metric>
<metric name=SQ_WAIT_INST_LDS block=SQ event=63 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
<metric name=SQ_ACTIVE_INST_VALU block=SQ event=71 descr="regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic)"></metric>
<metric name=SQ_INST_CYCLES_SALU block=SQ event=84 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated)"></metric>
<metric name=SQ_THREAD_CYCLES_VALU block=SQ event=85 descr="Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)"></metric>
<metric name=SQ_THREAD_CYCLES_VALU_MAX block=SQ event=86 descr="Maximum number of thread-cycles VALU operations that could have been executed given the instruction mix (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd, emulated)"></metric>
<metric name=SQ_LDS_BANK_CONFLICT block=SQ event=93 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"></metric>
<metric name=TA_TA_BUSY block=TA event=15 ></metric>
<metric name=TA_FLAT_READ_WAVEFRONTS block=TA event=101 ></metric>
<metric name=TA_FLAT_WRITE_WAVEFRONTS block=TA event=102 ></metric>
<metric name=TCC_CYCLE block=TCC event=1 ></metric>
<metric name=TCC_REQ block=TCC event=3 ></metric>
<metric name=TCC_HIT block=TCC event=20 ></metric>
<metric name=TCC_MISS block=TCC event=22 ></metric>
<metric name=TCC_WRITEBACK block=TCC event=25 ></metric>
<metric name=TCC_EA_WRREQ block=TCC event=29 ></metric>
<metric name=TCC_EA_WRREQ_64B block=TCC event=30 ></metric>
<metric name=TCC_EA_WRREQ_STALL block=TCC event=33 ></metric>
<metric name=TCC_EA_RDREQ block=TCC event=41 ></metric>
<metric name=TCC_EA_RDREQ_32B block=TCC event=42 ></metric>
<metric name=TCP_TA_DATA_STALL_CYCLES block=TCP event=6 descr="TCP stalls TA data interface. Now Windowed."></metric>
<metric name=CPC_ALWAYS_COUNT block=CPC event=0 ></metric>
<metric name=CPC_ME1_STALL_WAIT_ON_RCIU_READ block=CPC event=8 ></metric>
</gfx9>
+18 -4
Просмотреть файл
@@ -1,9 +1,23 @@
<metric name=SQ_CYCLES,SQ_WAVES,SQ_INSTS_SMEM,SQ_INSTS_VALU,TA_FLAT_WRITE_WAVEFRONTS[0],TA_FLAT_WRITE_WAVEFRONTS[1],CPC_ALWAYS_COUNT,CPC_ME1_STALL_WAIT_ON_RCIU_READ,GPUBusy,VALUBusy,SALUBusy,MemUnitBusy,SFetchInsts,FetchSize,VWriteInsts,WriteSize
# Filter by dispatches range, GPU index and kernel names
<metric
# range format "3:9"
range=""
# list of gpu indexes "0,1,2,3"
gpu_index=""
# list of matched sub-strings "Simple1,Conv1,SimpleConvolution"
kernel=""
></metric>
<trace name=SQTT copy=true >
# List of metrics
<metric
name=SQ_CYCLES,SQ_WAVES,SQ_INSTS_SMEM,SQ_INSTS_VALU,TA_FLAT_WRITE_WAVEFRONTS[0],TA_FLAT_WRITE_WAVEFRONTS[1],CPC_ALWAYS_COUNT,CPC_ME1_STALL_WAIT_ON_RCIU_READ,GPUBusy,VALUBusy,SALUBusy,MemUnitBusy,SFetchInsts,FetchSize,VWriteInsts,WriteSize
></metric>
# SQTT trace with parameters
<trace name=SQTT copy="true">
<parameters
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK=0xf
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK=0xf
MASK=0x0f00
TOKEN_MASK=0x144b
TOKEN_MASK2=0xffff
></parameters>
</trace>
+47 -125
Просмотреть файл
@@ -1,283 +1,205 @@
#include "gfx_metrics.xml"
<gfx8>
<metric name=GRBM_COUNT block=GRBM event=0 ></metric>
<metric name=GRBM_GUI_ACTIVE block=GRBM event=2 ></metric>
<metric name=SQ_CYCLES block=SQ event=2 ></metric>
<metric name=SQ_WAVES block=SQ event=4 ></metric>
<metric name=SQ_ITEMS block=SQ event=14 ></metric>
<metric name=SQ_INSTS_VALU block=SQ event=26 ></metric>
<metric name=SQ_INSTS_VMEM_WR block=SQ event=27 ></metric>
<metric name=SQ_INSTS_VMEM_RD block=SQ event=28 ></metric>
<metric name=SQ_INSTS_SALU block=SQ event=30 ></metric>
<metric name=SQ_INSTS_SMEM block=SQ event=31 ></metric>
<metric name=SQ_INSTS_FLAT block=SQ event=32 ></metric>
<metric name=SQ_INSTS_FLAT_LDS_ONLY block=SQ event=33 ></metric>
<metric name=SQ_INSTS_LDS block=SQ event=34 ></metric>
<metric name=SQ_INSTS_GDS block=SQ event=35 ></metric>
<metric name=SQ_WAVE_READY block=SQ event=47 ></metric>
<metric name=SQ_WAIT_INST_LDS block=SQ event=61 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
<metric name=SQ_ACTIVE_INST_VALU block=SQ event=69 descr="Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic)"></metric>
<metric name=SQ_INST_CYCLES_SALU block=SQ event=86 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated)"></metric>
<metric name=SQ_THREAD_CYCLES_VALU block=SQ event=89 descr="Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)"></metric>
<metric name=SQ_THREAD_CYCLES_VALU_MAX block=SQ event=90 descr="Maximum number of thread-cycles VALU operations that could have been executed given the instruction mix (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd, emulated)"></metric>
<metric name=SQ_LDS_BANK_CONFLICT block=SQ event=97 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"></metric>
<metric name=TA_BUSY block=TA event=15 ></metric>
<metric name=TA_FLAT_READ_WAVEFRONTS block=TA event=101 ></metric>
<metric name=TA_FLAT_WRITE_WAVEFRONTS block=TA event=102 ></metric>
<metric name=TCC_CYCLE block=TCC event=1 ></metric>
<metric name=TCC_REQ block=TCC event=3 ></metric>
<metric name=TCC_HIT block=TCC event=18 ></metric>
<metric name=TCC_MISS block=TCC event=19 ></metric>
<metric name=TCC_WRITEBACK block=TCC event=22 ></metric>
<metric name=TCC_EA_WRREQ block=TCC event=26 ></metric>
<metric name=TCC_EA_WRREQ_64B block=TCC event=27 ></metric>
<metric name=TCC_EA_WRREQ_STALL block=TCC event=30 ></metric>
<metric name=TCC_MC_RDREQ block=TCC event=35 ></metric>
<metric name=TCP_TA_DATA_STALL_CYCLES block=TCP event=3 descr="TCP stalls TA data interface. Now Windowed."></metric>
<metric name=CPC_ALWAYS_COUNT block=CPC event=0 ></metric>
<metric name=CPC_ME1_STALL_WAIT_ON_RCIU_READ block=CPC event=8 ></metric>
# average for (16 instances x 4 shader engines)
<metric name="TA_BUSY_avr" expr=avr(TA_BUSY,16)/4 ></metric>
# average for 16 instances
<metric name="TA_BUSY_avr" expr=avr(TA_TA_BUSY,16) ></metric>
<metric name="TA_BUSY_max" expr=max(TA_TA_BUSY,16) ></metric>
<metric name="TA_BUSY_min" expr=min(TA_TA_BUSY,16) ></metric>
# sum for 16 instances
<metric name="TA_FLAT_READ_WAVEFRONTS_sum" expr=sum(TA_FLAT_READ_WAVEFRONTS,16) ></metric>
<metric name="TA_FLAT_WRITE_WAVEFRONTS_sum" expr=sum(TA_FLAT_WRITE_WAVEFRONTS,16) ></metric>
<metric name="TCC_HIT_sum" expr=sum(TCC_HIT,16) ></metric>
<metric name="TCC_MISS_sum" expr=sum(TCC_MISS,16) ></metric>
<metric name="TCC_MC_RDREQ_sum" expr=sum(TCC_MC_RDREQ,16) ></metric>
<metric name="TCC_MC_WRREQ_sum" expr=sum(TCC_MC_WRREQ,16) ></metric>
<metric name="TCC_WRREQ_STALL_max" expr=max(TCC_MC_WRREQ_STALL,16) ></metric>
# FETCH_SIZE, kilobytes
# The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
<metric
name="FETCH_SIZE"
expr=(TCC_MC_RDREQ_sum*32)/1024
descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
></metric>
<metric name="FETCH_SIZE" expr=(TCC_MC_RDREQ_sum*32)/1024 ></metric>
# WRITE_SIZE, kilobytes
# The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
<metric name="WRITE_SIZE" expr=(TCC_MC_WRREQ_sum*32)/1024 ></metric>
</gfx8>
<gfx9>
<metric name=GRBM_COUNT block=GRBM event=0 ></metric>
<metric name=GRBM_GUI_ACTIVE block=GRBM event=2 ></metric>
<metric name=SQ_CYCLES block=SQ event=2 ></metric>
<metric name=SQ_WAVES block=SQ event=4 ></metric>
<metric name=SQ_ITEMS block=SQ event=14 ></metric>
<metric name=SQ_INSTS_VALU block=SQ event=26 ></metric>
<metric name=SQ_INSTS_VMEM_WR block=SQ event=27 ></metric>
<metric name=SQ_INSTS_VMEM_RD block=SQ event=28 ></metric>
<metric name=SQ_INSTS_SALU block=SQ event=30 ></metric>
<metric name=SQ_INSTS_SMEM block=SQ event=31 ></metric>
<metric name=SQ_INSTS_FLAT block=SQ event=32 ></metric>
<metric name=SQ_INSTS_FLAT_LDS_ONLY block=SQ event=33 ></metric>
<metric name=SQ_INSTS_LDS block=SQ event=34 ></metric>
<metric name=SQ_INSTS_GDS block=SQ event=35 ></metric>
<metric name=SQ_WAVE_READY block=SQ event=47 ></metric>
<metric name=SQ_WAIT_INST_LDS block=SQ event=63 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
<metric name=SQ_ACTIVE_INST_VALU block=SQ event=71 descr="regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic)"></metric>
<metric name=SQ_INST_CYCLES_SALU block=SQ event=84 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated)"></metric>
<metric name=SQ_THREAD_CYCLES_VALU block=SQ event=85 descr="Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)"></metric>
<metric name=SQ_THREAD_CYCLES_VALU_MAX block=SQ event=86 descr="Maximum number of thread-cycles VALU operations that could have been executed given the instruction mix (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd, emulated)"></metric>
<metric name=SQ_LDS_BANK_CONFLICT block=SQ event=93 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"></metric>
<metric name=TA_BUSY block=TA event=15 ></metric>
<metric name=TA_FLAT_READ_WAVEFRONTS block=TA event=101 ></metric>
<metric name=TA_FLAT_WRITE_WAVEFRONTS block=TA event=102 ></metric>
<metric name=TCC_CYCLE block=TCC event=1 ></metric>
<metric name=TCC_REQ block=TCC event=3 ></metric>
<metric name=TCC_HIT block=TCC event=20 ></metric>
<metric name=TCC_MISS block=TCC event=22 ></metric>
<metric name=TCC_WRITEBACK block=TCC event=25 ></metric>
<metric name=TCC_EA_WRREQ block=TCC event=29 ></metric>
<metric name=TCC_EA_WRREQ_64B block=TCC event=30 ></metric>
<metric name=TCC_EA_WRREQ_STALL block=TCC event=33 ></metric>
<metric name=TCC_EA_RDREQ block=TCC event=41 ></metric>
<metric name=TCC_EA_RDREQ_32B block=TCC event=42 ></metric>
<metric name=TCP_TA_DATA_STALL_CYCLES block=TCP event=6 descr="TCP stalls TA data interface. Now Windowed."></metric>
<metric name=CPC_ALWAYS_COUNT block=CPC event=0 ></metric>
<metric name=CPC_ME1_STALL_WAIT_ON_RCIU_READ block=CPC event=8 ></metric>
# average for (16 instances x 4 shader engines)
<metric name="TA_BUSY_avr" expr=avr(TA_BUSY,16)/4 ></metric>
# average for 16 instances
<metric name="TA_BUSY_avr" expr=avr(TA_TA_BUSY,16) ></metric>
<metric name="TA_BUSY_max" expr=max(TA_TA_BUSY,16) ></metric>
<metric name="TA_BUSY_min" expr=min(TA_TA_BUSY,16) ></metric>
# sum for 16 instances
<metric name="TA_FLAT_READ_WAVEFRONTS_sum" expr=sum(TA_FLAT_READ_WAVEFRONTS,16) ></metric>
<metric name="TA_FLAT_WRITE_WAVEFRONTS_sum" expr=sum(TA_FLAT_WRITE_WAVEFRONTS,16) ></metric>
<metric name="TCC_HIT_sum" expr=sum(TCC_HIT,16) ></metric>
<metric name="TCC_MISS_sum" expr=sum(TCC_MISS,16) ></metric>
<metric name="TCC_EA_RDREQ_sum" expr=sum(TCC_EA_RDREQ,16) ></metric>
<metric name="TCC_EA_RDREQ_32B_sum" expr=sum(TCC_EA_RDREQ_32B,16) ></metric>
<metric name="TCC_EA_RDREQ_sum" expr=sum(TCC_EA_RDREQ,16) ></metric>
<metric name="TCC_EA_WRREQ_sum" expr=sum(TCC_EA_WRREQ,16) ></metric>
<metric name="TCC_EA_WRREQ_64B_sum" expr=sum(TCC_EA_WRREQ_64B,16) ></metric>
<metric name="TCC_WRREQ_STALL_max" expr=max(TCC_EA_WRREQ_STALL,16) ></metric>
# FETCH_SIZE, kilobytes
# The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
<metric
name="FETCH_SIZE"
expr=((TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64+TCC_EA_RDREQ_32B_sum*32)/1024
descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
></metric>
<metric name="FETCH_SIZE" expr=(TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64)/1024 ></metric>
# WRITE_SIZE, kilobytes
# The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
<metric name="WRITE_SIZE" expr=((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024 ></metric>
</gfx9>
<global>
# GPU_BUSY, percentage
# GPUBusy, percentage
# The percentage of time GPU was busy.
<metric
name="GPUBusy"
expr=100*GRBM_GUI_ACTIVE/GRBM_COUNT
descr="The percentage of time GPU was busy."
expr=100*GRBM_GUI_ACTIVE/GRBM_COUNT
></metric>
# Wavefronts Total wavefronts.
<metric
name="Wavefronts"
expr=SQ_WAVES
descr="Total wavefronts."
expr=SQ_WAVES
></metric>
# VALUInsts The average number of vector ALU instructions executed per work-item (affected by flow control).
<metric
name="VALUInsts"
expr=SQ_INSTS_VALU/SQ_WAVES
descr="The average number of vector ALU instructions executed per work-item (affected by flow control)."
expr=SQ_INSTS_VALU/SQ_WAVES
></metric>
# SALUInsts The average number of scalar ALU instructions executed per work-item (affected by flow control).
<metric
name="SALUInsts"
expr=SQ_INSTS_SALU/SQ_WAVES
descr="The average number of scalar ALU instructions executed per work-item (affected by flow control)."
expr=SQ_INSTS_SALU/SQ_WAVES
></metric>
# VFetchInsts The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory.
<metric
name="VFetchInsts"
expr=(SQ_INSTS_VMEM_RD-TA_FLAT_READ_WAVEFRONTS_sum)/SQ_WAVES
descr="The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory."
expr=(SQ_INSTS_VMEM_RD-TA_FLAT_READ_WAVEFRONTS_sum)/SQ_WAVES
></metric>
# SFetchInsts The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control).
<metric
name="SFetchInsts"
expr=SQ_INSTS_SMEM/SQ_WAVES
descr="The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control)."
expr=SQ_INSTS_SMEM/SQ_WAVES
></metric>
# VWriteInsts The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory.
<metric
name="VWriteInsts"
expr=(SQ_INSTS_VMEM_WR-TA_FLAT_WRITE_WAVEFRONTS_sum)/SQ_WAVES
descr="The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory."
expr=(SQ_INSTS_VMEM_WR-TA_FLAT_WRITE_WAVEFRONTS_sum)/SQ_WAVES
></metric>
# FlatVMemInsts The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch.
<metric
name="FlatVMemInsts"
expr=(SQ_INSTS_FLAT-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES
descr="The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch."
expr=(SQ_INSTS_FLAT-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES
></metric>
# LDSInsts The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS.
<metric
name="LDSInsts"
expr=(SQ_INSTS_LDS-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES
descr="The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS."
expr=(SQ_INSTS_LDS-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES
></metric>
# FlatLDSInsts The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control).
<metric
name="FlatLDSInsts"
expr=SQ_INSTS_FLAT_LDS_ONLY/SQ_WAVES
descr="The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control)."
expr=SQ_INSTS_FLAT_LDS_ONLY/SQ_WAVES
></metric>
# GDSInsts The average number of GDS read or GDS write instructions executed per work item (affected by flow control).
<metric
name="GDSInsts"
expr=SQ_INSTS_GDS/SQ_WAVES
descr="The average number of GDS read or GDS write instructions executed per work item (affected by flow control)."
expr=SQ_INSTS_GDS/SQ_WAVES
></metric>
# VALUUtilization The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence).
<metric
name="VALUUtilization"
expr=100*SQ_THREAD_CYCLES_VALU/(SQ_ACTIVE_INST_VALU*64)
descr="The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence)."
expr=100*SQ_THREAD_CYCLES_VALU/(SQ_ACTIVE_INST_VALU*MAX_WAVE_SIZE)
></metric>
# VALUBusy The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal).
<metric
name="VALUBusy"
expr=100*SQ_ACTIVE_INST_VALU*4/SIMD_NUM/GRBM_GUI_ACTIVE
descr="The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."
expr=100*SQ_ACTIVE_INST_VALU*4/SIMD_NUM/GRBM_GUI_ACTIVE
></metric>
# SALUBusy The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal).
<metric
name="SALUBusy"
expr=100*SQ_INST_CYCLES_SALU*4/SIMD_NUM/GRBM_GUI_ACTIVE
descr="The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."
expr=100*SQ_INST_CYCLES_SALU*4/SIMD_NUM/GRBM_GUI_ACTIVE
></metric>
# FetchSize The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
<metric
name="FetchSize"
expr=FETCH_SIZE
descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
expr=FETCH_SIZE
></metric>
# WriteSize The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
<metric
name="WriteSize"
expr=((sum(TCC_EA_WRREQ,16)-sum(TCC_EA_WRREQ_64B,16))*32+sum(TCC_EA_WRREQ_64B,16)*64)/1024
descr="The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
expr=WRITE_SIZE
></metric>
# L2CacheHit The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal).
<metric
name="L2CacheHit"
expr=100*sum(TCC_HIT,16)/(sum(TCC_HIT,16)+sum(TCC_MISS,16))
descr="The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal)."
expr=100*sum(TCC_HIT,16)/(sum(TCC_HIT,16)+sum(TCC_MISS,16))
></metric>
# MemUnitBusy The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound).
<metric
name="MemUnitBusy"
expr=100*max(TA_BUSY,16)/GRBM_GUI_ACTIVE/SE_NUM
descr="The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound)."
expr=100*max(TA_TA_BUSY,16)/GRBM_GUI_ACTIVE/SE_NUM
></metric>
# MemUnitStalled The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad).
<metric
name="MemUnitStalled"
expr=100*max(TCP_TA_DATA_STALL_CYCLES,16)/GRBM_GUI_ACTIVE/SE_NUM
descr="The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad)."
expr=100*max(TCP_TCP_TA_DATA_STALL_CYCLES,16)/GRBM_GUI_ACTIVE/SE_NUM
></metric>
# WriteUnitStalled The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad).
<metric
name="WriteUnitStalled"
expr=100*max(TCC_EA_WRREQ_STALL,16)/GRBM_GUI_ACTIVE
descr="The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad)."
expr=100*TCC_WRREQ_STALL_max/GRBM_GUI_ACTIVE
></metric>
# ALUStalledByLDS The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad).
<metric
name="ALUStalledByLDS"
expr=100*SQ_WAIT_INST_LDS*4/SQ_WAVES/GRBM_GUI_ACTIVE
descr="The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad)."
expr=100*SQ_WAIT_INST_LDS*4/SQ_WAVES/GRBM_GUI_ACTIVE
></metric>
# LDSBankConflict The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad).
<metric
name="LDSBankConflict"
expr=100*SQ_LDS_BANK_CONFLICT/GRBM_GUI_ACTIVE/CU_NUM
descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."
expr=100*SQ_LDS_BANK_CONFLICT/GRBM_GUI_ACTIVE/CU_NUM
></metric>
</global>
+32 -42
Просмотреть файл
@@ -25,6 +25,7 @@ POSSIBILITY OF SUCH DAMAGE.
#include "util/hsa_rsrc_factory.h"
#include <dlfcn.h>
#include <fcntl.h>
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <hsa_ext_finalize.h>
@@ -32,6 +33,8 @@ POSSIBILITY OF SUCH DAMAGE.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <atomic>
#include <cassert>
@@ -77,12 +80,13 @@ hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* d
}
// Constructor of the class
HsaRsrcFactory::HsaRsrcFactory() {
HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) {
hsa_status_t status;
// Initialize the Hsa Runtime
printf("HSA init\n");
hsa_status_t status = hsa_init();
CHECK_STATUS("Error in hsa_init", status);
if (initialize_hsa_) {
status = hsa_init();
CHECK_STATUS("Error in hsa_init", status);
}
// Discover the set of Gpu devices available on the platform
status = hsa_iterate_agents(GetHsaAgentsCallback, this);
CHECK_STATUS("Error Calling hsa_iterate_agents", status);
@@ -106,10 +110,10 @@ HsaRsrcFactory::HsaRsrcFactory() {
// Destructor: deletes every entry held in cpu_list_ and gpu_list_, then shuts
// the HSA runtime down.
// NOTE(review): this listing looks like a diff with the removed and added
// lines interleaved - the unconditional printf/hsa_shut_down() pair and the
// initialize_hsa_-guarded pair both appear below. Read literally,
// hsa_shut_down() would run twice when initialize_hsa_ is true; confirm which
// variant the real file keeps.
HsaRsrcFactory::~HsaRsrcFactory() {
for (auto p : cpu_list_) delete p;
for (auto p : gpu_list_) delete p;
printf("HSA shutdown\n");
hsa_status_t status = hsa_shut_down();
CHECK_STATUS("Error in hsa_shut_down", status);
// Shut HSA down only when this factory was asked to initialize it.
if (initialize_hsa_) {
hsa_status_t status = hsa_shut_down();
CHECK_STATUS("Error in hsa_shut_down", status);
}
}
hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) {
@@ -365,67 +369,53 @@ bool HsaRsrcFactory::TransferData(void* dest_buff, void* src_buff, uint32_t leng
//
// @return bool true if successful, false otherwise
//
// Loads the code object stored in the file named by brig_path, builds and
// freezes an executable for agent_info->dev_id, looks up the symbol called
// kernel_name and stores it in *code_desc.
// NOTE(review): two versions of this function are interleaved here (it looks
// like a diff with removed and added lines merged): a void* variant that reads
// the file into a buffer and deserializes it, and a bool variant that opens
// the file with open() and uses a code object reader. The text is not valid
// C++ as shown; confirm against the real file before changing anything.
void* HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
const char* kernel_name, hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc) {
// Finalize the Hsail object into code object
hsa_status_t status;
hsa_code_object_t code_object;
bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
const char* kernel_name, hsa_executable_t* executable, hsa_executable_symbol_t* code_desc) {
hsa_status_t status = HSA_STATUS_ERROR;
// Build the code object filename
std::string filename(brig_path);
std::clog << "Code object filename: " << filename << std::endl;
// Open the file containing code object
std::ifstream codeStream(filename.c_str(), std::ios::binary | std::ios::ate);
if (!codeStream) {
std::cerr << "Error: failed to load " << filename << std::endl;
// NOTE(review): file_handle is never closed in the code visible here -
// confirm whether it must stay open for the lifetime of the code object
// reader and where it is released.
hsa_file_t file_handle = open(filename.c_str(), O_RDONLY);
if (file_handle == -1) {
std::cerr << "Error: failed to load '" << filename << "'" << std::endl;
// The return after the assert is only reached when assert() is compiled out.
assert(false);
return NULL;
return false;
}
// Allocate memory to read in code object from file
size_t size = std::string::size_type(codeStream.tellg());
char* code_buf = (char*)AllocateSysMemory(agent_info, size);
if (!code_buf) {
std::cerr << "Error: failed to allocate memory for code object." << std::endl;
assert(false);
return NULL;
}
// Read the code object into allocated memory
codeStream.seekg(0, std::ios::beg);
std::copy(std::istreambuf_iterator<char>(codeStream), std::istreambuf_iterator<char>(), code_buf);
// De-Serialize the code object that has been read into memory
status = hsa_code_object_deserialize(code_buf, size, NULL, &code_object);
// Create code object reader
hsa_code_object_reader_t code_obj_rdr = {0};
status = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr);
if (status != HSA_STATUS_SUCCESS) {
std::cerr << "Failed to deserialize code object" << std::endl;
if (code_buf) hsa_memory_free(code_buf);
return NULL;
std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl;
return false;
}
// Create executable.
status =
hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", hsa_exec);
status = hsa_executable_create_alt(HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable);
CHECK_STATUS("Error in creating executable object", status);
// Load code object.
status = hsa_executable_load_code_object(*hsa_exec, agent_info->dev_id, code_object, "");
status = hsa_executable_load_agent_code_object(*executable, agent_info->dev_id,
code_obj_rdr, NULL, NULL);
CHECK_STATUS("Error in loading executable object", status);
// Freeze executable.
status = hsa_executable_freeze(*hsa_exec, "");
status = hsa_executable_freeze(*executable, "");
CHECK_STATUS("Error in freezing executable object", status);
// Get symbol handle.
hsa_executable_symbol_t kernelSymbol;
status = hsa_executable_get_symbol(*hsa_exec, NULL, kernel_name, agent_info->dev_id, 0,
status = hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0,
&kernelSymbol);
CHECK_STATUS("Error in looking up kernel symbol", status);
// Update output parameter
*code_desc = kernelSymbol;
return code_buf;
return true;
}
// Print the various fields of Hsa Gpu Agents
+10 -9
Просмотреть файл
@@ -112,20 +112,18 @@ class HsaRsrcFactory {
public:
typedef std::recursive_mutex mutex_t;
// Returns the single HsaRsrcFactory instance, constructing it on first use
// while holding mutex_. The initialize_hsa argument is forwarded to the
// constructor and is only consulted when the instance does not exist yet.
// NOTE(review): the stub Create(), the CreateInstance() header and the
// zero-argument `new HsaRsrcFactory()` look like removed diff lines
// interleaved with their replacements - confirm against the real header.
static HsaRsrcFactory* Create() { return NULL; }
static HsaRsrcFactory* CreateInstance() {
static HsaRsrcFactory* Create(bool initialize_hsa = true) {
std::lock_guard<mutex_t> lck(mutex_);
if (instance_ == NULL) {
instance_ = new HsaRsrcFactory();
instance_ = new HsaRsrcFactory(initialize_hsa);
}
return instance_;
}
// Returns a reference to the single HsaRsrcFactory instance. When none exists
// yet it is created through Create(false), i.e. with initialize_hsa set to
// false. CHECK_STATUS is invoked with an error status if instance_ is still
// NULL afterwards.
// NOTE(review): the instance_ == NULL test below runs before Create() takes
// mutex_. The CreateInstance() call and the first CHECK_STATUS line look like
// removed diff lines interleaved with their replacements - confirm.
static HsaRsrcFactory& Instance() {
CreateInstance();
if (instance_ == NULL) instance_ = Create(false);
hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
CHECK_STATUS("HsaRsrcFactory::Instance() is not found", status);
CHECK_STATUS("HsaRsrcFactory::Instance() failed", status);
return *instance_;
}
@@ -229,9 +227,9 @@ class HsaRsrcFactory {
// @param code_desc Handle of finalized Code Descriptor that could
// be used to submit for execution
//
// @return code buffer, non NULL if successful, NULL otherwise
// @return true if successful, false otherwise
//
// NOTE(review): both the void* and the bool declaration heads are listed; this
// looks like a removed/added diff pair sharing one continuation line - confirm
// which return type the real header declares.
void* LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc);
// Print the various fields of Hsa Gpu Agents
@@ -259,11 +257,14 @@ class HsaRsrcFactory {
// Constructor of the class. Initializes the Hsa Runtime when initialize_hsa
// is true and queries the system topology to get the list of Cpu and Gpu devices
// NOTE(review): both the zero-argument and the bool overload are listed; this
// looks like a removed/added diff pair - confirm which one the real header keeps.
HsaRsrcFactory();
HsaRsrcFactory(bool initialize_hsa);
// Destructor of the class
~HsaRsrcFactory();
// True when this factory was asked to initialize the HSA runtime itself; the
// constructor calls hsa_init() and the destructor calls hsa_shut_down() only
// under this flag
const bool initialize_hsa_;
// Add an instance of AgentInfo representing a Hsa Gpu agent
const AgentInfo* AddAgentInfo(const hsa_agent_t agent);