[ROCm/rocprofiler commit: 79510b8c6c]
Этот коммит содержится в:
Evgeny
2018-02-01 14:52:21 -06:00
родитель 14e45e9cb4
Коммит 80d462a8c2
17 изменённых файлов: 789 добавлений и 238 удалений
+2 -3
Просмотреть файл
@@ -78,9 +78,8 @@ extern "C" {
// Profiling feature kind
typedef enum {
ROCPROFILER_FEATURE_KIND_COUNTER = 0,
ROCPROFILER_FEATURE_KIND_METRIC = 1,
ROCPROFILER_FEATURE_KIND_TRACE = 2
ROCPROFILER_FEATURE_KIND_METRIC = 0,
ROCPROFILER_FEATURE_KIND_TRACE = 1
} rocprofiler_feature_kind_t;
// Profiling feature parameter
+1 -1
Просмотреть файл
@@ -200,7 +200,7 @@ class Context {
const rocprofiler_feature_kind_t kind = info->kind;
const char* name = info->name;
if (kind != ROCPROFILER_FEATURE_KIND_TRACE) { // Processing metrics features
if (kind == ROCPROFILER_FEATURE_KIND_METRIC) { // Processing metrics features
const Metric* metric = metrics_->Get(name);
if (metric == NULL)
EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name << "' is not found");
+109 -34
Просмотреть файл
@@ -38,7 +38,7 @@ class Metric {
virtual const xml::Expr* GetExpr() const = 0;
private:
std::string name_;
const std::string name_;
};
class BaseMetric : public Metric {
@@ -107,10 +107,38 @@ class MetricsDict {
}
}
const Metric* Get(const std::string& name) const {
// Looks up a metric by name.
//
// Names present in the cache are returned directly. Any other name containing
// ':' is treated as a raw counter specification of the form
//   "<block>:<event>"   or   "<block>[<index>]:<event>"
// and a new BaseMetric is built for it.
//
// @param name  metric name or raw counter specification
// @return      the metric, or NULL if the name is neither cached nor a raw spec
// Raises (EXC_RAISING) on a malformed index suffix, or when a block reporting
// more than one instance is named without an index. Translate() is used to
// resolve the block name.
//
// NOTE(review): the BaseMetric created for a raw specification is not inserted
// into cache_ in this method, so nothing visible here releases it — confirm
// who owns it.
const Metric* Get(const std::string& name) {
  // Fast path: the name is already in the cache.
  const auto it = cache_.find(name);
  if (it != cache_.end()) return it->second;

  // Not cached: only a "<block>...:<event>" specification can be resolved.
  const std::size_t pos = name.find(':');
  if (pos == std::string::npos) return NULL;

  std::string block_name = name.substr(0, pos);
  const std::string event_str = name.substr(pos + 1);

  uint32_t block_index = 0;
  bool indexed = false;
  const std::size_t pos1 = block_name.find('[');
  if (pos1 != std::string::npos) {
    // Search for ']' only after '[' so that a stray ']' in front of the
    // bracket cannot make the index length computation below underflow.
    const std::size_t pos2 = block_name.find(']', pos1 + 1);
    if (pos2 == std::string::npos) EXC_RAISING(HSA_STATUS_ERROR, "Malformed metric name '" << name << "'");
    const std::string block_index_str = name.substr(pos1 + 1, pos2 - (pos1 + 1));
    block_name = name.substr(0, pos1);
    block_index = atol(block_index_str.c_str());
    indexed = true;
  }

  const hsa_ven_amd_aqlprofile_id_query_t query = Translate(agent_info_, block_name);
  const hsa_ven_amd_aqlprofile_block_name_t block_id = (hsa_ven_amd_aqlprofile_block_name_t)query.id;
  // A block with several instances must be addressed with an explicit index.
  if ((query.instance_count > 1) && (indexed == false)) EXC_RAISING(HSA_STATUS_ERROR, "Malformed indexed metric name '" << name << "'");

  const uint32_t event_id = atol(event_str.c_str());
  const counter_t counter = {name, {block_id, block_index, event_id}};
  return new BaseMetric(name, counter);
}
@@ -119,12 +147,15 @@ class MetricsDict {
const_iterator_t End() const { return cache_.end(); }
private:
MetricsDict(const util::AgentInfo* agent_info) : xml_(NULL) {
MetricsDict(const util::AgentInfo* agent_info) : xml_(NULL), agent_info_(agent_info) {
const char* xml_name = getenv("ROCP_METRICS");
if (xml_name != NULL) {
xml_ = xml::Xml::Create(xml_name);
if (xml_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "metrics .xml open error '" << xml_name << "'");
std::cout << "ROCProfiler: importing metrics from '" << xml_name << "':" << std::endl;
xml_->AddConst("top.const.metric", "NUM_SIMDS", 64);
xml_->AddConst("top.const.metric", "NUM_SHADER_ENGINES", 4);
std::cout << "ROCProfiler: importing '" << xml_name << "':" << std::endl;
ImportMetrics(agent_info, "const");
ImportMetrics(agent_info, agent_info->gfxip);
ImportMetrics(agent_info, "global");
}
@@ -135,47 +166,53 @@ class MetricsDict {
for (auto& entry : cache_) delete entry.second;
}
void ImportMetrics(const util::AgentInfo* agent_info, const char* scope) {
auto scope_list = xml_->GetNodes("top." + std::string(scope) + ".metric");
// Resolves a hardware block name through the AQL profile API.
//
// @param agent_info  agent whose dev_id is placed in the profile descriptor
// @param block_name  block name to look up
// @return            the query filled in by hsa_ven_amd_aqlprofile_get_info()
//                    for HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID
// Raises (AQL_EXC_RAISING) if the API call does not return HSA_STATUS_SUCCESS.
static hsa_ven_amd_aqlprofile_id_query_t Translate(const util::AgentInfo* agent_info, const std::string& block_name) {
  // Zero the whole descriptor: only the agent is meaningful here, and the
  // remaining fields must not be handed to the API uninitialised.
  hsa_ven_amd_aqlprofile_profile_t profile = {};
  profile.agent = agent_info->dev_id;

  hsa_ven_amd_aqlprofile_id_query_t query = {block_name.c_str(), 0, 0};
  const hsa_status_t status =
      util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info(
          &profile, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID, &query);
  if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(HSA_STATUS_ERROR, "ImportMetrics: bad block name '" << block_name << "'");
  return query;
}
void ImportMetrics(const util::AgentInfo* agent_info, const std::string& scope) {
auto scope_list = xml_->GetNodes("top." + scope + ".metric");
if (!scope_list.empty()) {
std::cout << " " << scope_list.size() << " " << scope << " metrics found" << std::endl;
for (auto node : scope_list) {
const std::string name = node->opts["name"];
if (cache_.find(name) != cache_.end())
EXC_RAISING(HSA_STATUS_ERROR, "ImportMetrics: metrics redefined '" << name << "'");
const std::string expr_str = node->opts["expr"];
std::string descr = node->opts["descr"];
if (descr.empty()) descr = (expr_str.empty()) ? name : expr_str;
if (expr_str.empty()) {
const std::string block_name = node->opts["block"];
const uint32_t event_id = atoi(node->opts["event"].c_str());
const std::string event_str = node->opts["event"];
const uint32_t event_id = atol(event_str.c_str());
hsa_ven_amd_aqlprofile_profile_t profile;
profile.agent = agent_info->dev_id;
hsa_ven_amd_aqlprofile_id_query_t query = {block_name.c_str(), 0, 0};
hsa_status_t status =
util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info(
&profile, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID, &query);
if (status == HSA_STATUS_SUCCESS) {
const hsa_ven_amd_aqlprofile_block_name_t block_id =
(hsa_ven_amd_aqlprofile_block_name_t)query.id;
if (query.instance_count > 1) {
for (unsigned block_index = 0; block_index < query.instance_count; ++block_index) {
std::ostringstream os;
os << name << '[' << block_index << ']';
const std::string full_name = os.str();
const counter_t counter = {full_name, {block_id, block_index, event_id}};
cache_[full_name] = new BaseMetric(full_name, counter);
}
} else {
const counter_t counter = {name, {block_id, 0, event_id}};
cache_[name] = new BaseMetric(name, counter);
const hsa_ven_amd_aqlprofile_id_query_t query = Translate(agent_info, block_name);
const hsa_ven_amd_aqlprofile_block_name_t block_id = (hsa_ven_amd_aqlprofile_block_name_t)query.id;
if (query.instance_count > 1) {
for (unsigned block_index = 0; block_index < query.instance_count; ++block_index) {
std::ostringstream full_name;
full_name << name << '[' << block_index << ']';
std::ostringstream alias;
alias << block_name << "[" << block_index << "]:" << event_str;
const counter_t counter = {full_name.str(), {block_id, block_index, event_id}};
AddMetric(full_name.str(), alias.str(), counter);
}
} else
AQL_EXC_RAISING(HSA_STATUS_ERROR, "ImportMetrics: bad block name '" << block_name
<< "'");
} else {
const std::string alias = block_name + ":" + event_str;
const counter_t counter = {name, {block_id, 0, event_id}};
AddMetric(name, alias, counter);
}
} else {
xml::Expr* expr_obj = new xml::Expr(expr_str, new ExprCache(&cache_));
std::cout << "# " << descr << std::endl;
std::cout << name << "=" << expr_obj->String() << "\n" << std::endl;
counters_vec_t counters_vec;
for (const std::string var : expr_obj->GetVars()) {
auto it = cache_.find(var);
@@ -184,12 +221,46 @@ class MetricsDict {
<< "' is not found");
it->second->GetCounters(counters_vec);
}
cache_[name] = new ExprMetric(name, counters_vec, expr_obj);
AddMetric(name, counters_vec, expr_obj);
}
}
}
}
// Registers a counter-backed metric under 'name'.
//
// @param name     cache key and name of the new BaseMetric
// @param counter  counter description handed to the BaseMetric
// @return         the newly created metric
// Raises (EXC_RAISING) if 'name' is already in the cache. The alias argument
// is currently unused; the alias bookkeeping below is compiled out.
const Metric* AddMetric(const std::string& name, const std::string& /*alias*/, const counter_t& counter) {
  // Claim the cache slot first; a failed insert means the name is taken.
  const auto slot = cache_.insert({name, NULL});
  if (!slot.second) EXC_RAISING(HSA_STATUS_ERROR, "metric redefined '" << name << "'");

  const Metric* metric = new BaseMetric(name, counter);
  slot.first->second = metric;
#if 0
  if (alias != name) {
    if (cache_.find(alias) != cache_.end()) EXC_RAISING(HSA_STATUS_ERROR, "metric alias/name interference '" << alias << "'");
    const auto ret = aliases_.insert({alias, name});
    if (!ret.second) EXC_RAISING(HSA_STATUS_ERROR, "metric alias redefined '" << alias << "'");
  }
#endif
  return metric;
}
// Registers an expression-backed metric under 'name'.
//
// @param name          cache key and name of the new ExprMetric
// @param counters_vec  counters handed to the ExprMetric
// @param expr_obj      expression object handed to the ExprMetric
// @return              the newly created metric
// Raises (EXC_RAISING) if 'name' is already in the cache.
const Metric* AddMetric(const std::string& name, const counters_vec_t& counters_vec, const xml::Expr* expr_obj) {
  // Claim the cache slot first; a failed insert means the name is taken.
  const auto slot = cache_.insert({name, NULL});
  if (!slot.second) EXC_RAISING(HSA_STATUS_ERROR, "expr-metric redefined '" << name << "'");

  const Metric* metric = new ExprMetric(name, counters_vec, expr_obj);
  slot.first->second = metric;
  return metric;
}
#if 0
// Compiled out. Maps an alias to the name stored for it in aliases_;
// returns the input unchanged when no alias entry exists.
std::string UnAlias(const std::string& name) const {
auto it = aliases_.find(name);
return (it != aliases_.end()) ? it->second : name;
}
#endif
void Print() {
for (auto& v : cache_) {
const Metric* metric = v.second;
@@ -203,7 +274,11 @@ class MetricsDict {
}
xml::Xml* xml_;
const util::AgentInfo* agent_info_;
cache_t cache_;
#if 0
std::map<std::string, std::string> aliases_;
#endif
static map_t* map_;
static mutex_t mutex_;
+7 -6
Просмотреть файл
@@ -79,15 +79,16 @@ class Profile {
// Binds the profile to an agent.
//
// The AQL profile descriptor and the completion signal are reset to their
// zero state, the descriptor's agent is set from the agent's dev_id, and
// is_legacy_ records whether the agent name begins with "gfx8".
Profile(const util::AgentInfo* agent_info) : agent_info_(agent_info) {
  // Start from empty handles/buffers.
  profile_ = {};
  completion_signal_ = {};

  profile_.agent = agent_info->dev_id;

  // Only the first four characters of the agent name are compared.
  const bool name_is_gfx8 = (strncmp(agent_info->name, "gfx8", 4) == 0);
  is_legacy_ = name_is_gfx8;
}
virtual ~Profile() {
if (!info_vector_.empty()) {
info_vector_.clear();
hsa_memory_free(profile_.command_buffer.ptr);
hsa_memory_free(profile_.output_buffer.ptr);
free(const_cast<event_t*>(profile_.events));
free(const_cast<parameter_t*>(profile_.parameters));
info_vector_.clear();
if (profile_.command_buffer.ptr) hsa_memory_free(profile_.command_buffer.ptr);
if (profile_.output_buffer.ptr) hsa_memory_free(profile_.output_buffer.ptr);
if (profile_.events) free(const_cast<event_t*>(profile_.events));
if (profile_.parameters) free(const_cast<parameter_t*>(profile_.parameters));
if (completion_signal_.handle) {
hsa_status_t status = hsa_signal_destroy(completion_signal_);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_destroy " << std::hex << status);
}
+62 -18
Просмотреть файл
@@ -26,12 +26,14 @@ POSSIBILITY OF SUCH DAMAGE.
#include <dlfcn.h>
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <hsa_ext_finalize.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <atomic>
#include <cassert>
#include <fstream>
#include <iostream>
@@ -76,6 +78,7 @@ hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* d
// Constructor of the class
HsaRsrcFactory::HsaRsrcFactory() {
// Initialize the Hsa Runtime
printf("HSA init\n");
hsa_status_t status = hsa_init();
CHECK_STATUS("Error in hsa_init", status);
@@ -100,6 +103,10 @@ HsaRsrcFactory::HsaRsrcFactory() {
// Destructor of the class
// Frees every entry of the CPU and GPU agent lists, prints a shutdown message
// and shuts the HSA runtime down, checking the returned status.
HsaRsrcFactory::~HsaRsrcFactory() {
  // The lists hold pointers to const; strip constness only to free them.
  const auto release = [](const AgentInfo* info) { free(const_cast<AgentInfo*>(info)); };
  for (const AgentInfo* info : cpu_list_) release(info);
  for (const AgentInfo* info : gpu_list_) release(info);

  printf("HSA shutdown\n");
  hsa_status_t status = hsa_shut_down();
  CHECK_STATUS("Error in hsa_shut_down", status);
}
@@ -162,12 +169,15 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) {
hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name);
strncpy(agent_info->gfxip, agent_info->name, 4);
agent_info->gfxip[4] = '\0';
agent_info->max_wave_size = 0;
hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size);
agent_info->max_queue_size = 0;
hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size);
agent_info->profile = hsa_profile_t(108);
hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile);
agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? true : false;
hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), &agent_info->cu_num);
hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), &agent_info->waves_per_cu);
hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), &agent_info->simds_per_cu);
hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &agent_info->se_num);
hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), &agent_info->shader_arrays_per_se);
// Initialize memory regions to zero
agent_info->kernarg_region.handle = 0;
@@ -349,8 +359,8 @@ bool HsaRsrcFactory::TransferData(void* dest_buff, void* src_buff, uint32_t leng
//
// @return bool true if successful, false otherwise
//
bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
char* kernel_name, hsa_executable_symbol_t* code_desc) {
void* HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
const char* kernel_name, hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc) {
// Finalize the Hsail object into code object
hsa_status_t status;
hsa_code_object_t code_object;
@@ -364,52 +374,52 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br
if (!codeStream) {
std::cerr << "Error: failed to load " << filename << std::endl;
assert(false);
return false;
return NULL;
}
// Allocate memory to read in code object from file
size_t size = std::string::size_type(codeStream.tellg());
char* codeBuff = (char*)AllocateSysMemory(agent_info, size);
if (!codeBuff) {
char* code_buf = (char*)AllocateSysMemory(agent_info, size);
if (!code_buf) {
std::cerr << "Error: failed to allocate memory for code object." << std::endl;
assert(false);
return false;
return NULL;
}
// Read the code object into allocated memory
codeStream.seekg(0, std::ios::beg);
std::copy(std::istreambuf_iterator<char>(codeStream), std::istreambuf_iterator<char>(), codeBuff);
std::copy(std::istreambuf_iterator<char>(codeStream), std::istreambuf_iterator<char>(), code_buf);
// De-Serialize the code object that has been read into memory
status = hsa_code_object_deserialize(codeBuff, size, NULL, &code_object);
status = hsa_code_object_deserialize(code_buf, size, NULL, &code_object);
if (status != HSA_STATUS_SUCCESS) {
std::cerr << "Failed to deserialize code object" << std::endl;
return false;
if (code_buf) hsa_memory_free(code_buf);
return NULL;
}
// Create executable.
hsa_executable_t hsaExecutable;
status =
hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &hsaExecutable);
hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", hsa_exec);
CHECK_STATUS("Error in creating executable object", status);
// Load code object.
status = hsa_executable_load_code_object(hsaExecutable, agent_info->dev_id, code_object, "");
status = hsa_executable_load_code_object(*hsa_exec, agent_info->dev_id, code_object, "");
CHECK_STATUS("Error in loading executable object", status);
// Freeze executable.
status = hsa_executable_freeze(hsaExecutable, "");
status = hsa_executable_freeze(*hsa_exec, "");
CHECK_STATUS("Error in freezing executable object", status);
// Get symbol handle.
hsa_executable_symbol_t kernelSymbol;
status = hsa_executable_get_symbol(hsaExecutable, NULL, kernel_name, agent_info->dev_id, 0,
status = hsa_executable_get_symbol(*hsa_exec, NULL, kernel_name, agent_info->dev_id, 0,
&kernelSymbol);
CHECK_STATUS("Error in looking up kernel symbol", status);
// Update output parameter
*code_desc = kernelSymbol;
return true;
return code_buf;
}
// Print the various fields of Hsa Gpu Agents
@@ -423,13 +433,47 @@ bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) {
std::clog << "> agent[" << idx << "] :" << std::endl;
std::clog << ">> Name : " << agent_info->name << std::endl;
std::clog << ">> APU : " << agent_info->is_apu << std::endl;
std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl;
std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl;
std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl;
std::clog << ">> Kernarg Region Id : " << agent_info->coarse_region.handle << std::endl;
std::clog << ">> CU number : " << agent_info->cu_num << std::endl;
std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl;
std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl;
std::clog << ">> SE number : " << agent_info->se_num << std::endl;
std::clog << ">> Shader Arrays per SE : " << agent_info->shader_arrays_per_se << std::endl;
}
return true;
}
// Submits one packet to a queue.
//
// @param queue   destination queue
// @param packet  packet image; slot_size_b (0x40) bytes are read from it
// @return        the write index of the slot the packet was placed in
//
// The slot is reserved with a single atomic add on the write index, so two
// callers submitting to the same queue can never be handed the same slot
// (a separate load followed by a store allows exactly that).
uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, void* packet) {
  const uint32_t slot_size_b = 0x40;

  // Advance the command queue: reserve one slot and get its index.
  const uint64_t write_idx = hsa_queue_add_write_index_relaxed(queue, 1);
  // Wait until the reserved slot is no longer ahead of the read index by a
  // full queue, i.e. until the queue has room for it.
  while ((write_idx - hsa_queue_load_read_index_relaxed(queue)) >= queue->size) {
    sched_yield();
  }

  const uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
  // Widen before multiplying so the byte offset cannot wrap in 32 bits.
  const uintptr_t slot_offset = (uintptr_t)slot_idx * slot_size_b;
  uint32_t* queue_slot = (uint32_t*)((uintptr_t)(queue->base_address) + slot_offset);
  uint32_t* slot_data = (uint32_t*)packet;

  // Copy buffered commands into the queue slot.
  // Overwrite the AQL invalid header (first dword) last.
  // This prevents the slot from being read until it's fully written.
  memcpy(&queue_slot[1], &slot_data[1], slot_size_b - sizeof(uint32_t));
  std::atomic<uint32_t>* header_atomic_ptr = reinterpret_cast<std::atomic<uint32_t>*>(&queue_slot[0]);
  header_atomic_ptr->store(slot_data[0], std::memory_order_release);

  // Ring the doorbell.
  hsa_signal_store_relaxed(queue->doorbell_signal, write_idx);

  return write_idx;
}
HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL;
HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_;
+24 -3
Просмотреть файл
@@ -67,6 +67,9 @@ struct AgentInfo {
// Agent type - Cpu = 0, Gpu = 1 or Dsp = 2
uint32_t dev_type;
// APU flag
bool is_apu;
// Agent system index
uint32_t dev_index;
@@ -90,6 +93,21 @@ struct AgentInfo {
// Memory region supporting kernel arguments
hsa_region_t kernarg_region;
// The number of compute unit available in the agent.
uint32_t cu_num;
// Maximum number of waves possible in a Compute Unit.
uint32_t waves_per_cu;
// Number of SIMD's per compute unit CU
uint32_t simds_per_cu;
// Number of Shader Engines (SE) in Gpu
uint32_t se_num;
// Number of Shader Arrays Per Shader Engines in Gpu
uint32_t shader_arrays_per_se;
};
class HsaRsrcFactory {
@@ -213,14 +231,17 @@ class HsaRsrcFactory {
// @param code_desc Handle of finalized Code Descriptor that could
// be used to submit for execution
//
// @return bool true if successful, false otherwise
// @return code buffer, non NULL if successful, NULL otherwise
//
bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, char* kernel_name,
hsa_executable_symbol_t* code_desc);
void* LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc);
// Print the various fields of Hsa Gpu Agents
bool PrintGpuAgents(const std::string& header);
// Submit AQL packet to given queue
static uint64_t Submit(hsa_queue_t* queue, void* packet);
// Return AqlProfile API table
typedef hsa_ven_amd_aqlprofile_1_00_pfn_t aqlprofile_pfn_t;
const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; }
+106 -1
Просмотреть файл
@@ -24,6 +24,7 @@ class div_zero_exception_t : public exception_t {
};
typedef uint64_t args_t;
static const args_t ARGS_MAX = UINT64_MAX;
typedef std::map<std::string, args_t> args_map_t;
class Expr;
@@ -177,14 +178,19 @@ class Expr {
bool SubCheck() const { return (sub_count_ == 0); }
unsigned FindOp() const {
unsigned i = pos_;
unsigned open_n = 0;
while (i < expr_.length()) {
switch (Symb(i)) {
case '*':
case '/':
case '+':
case '-':
goto end;
case '(':
++open_n;
break;
case ')':
if (open_n != 0) i += 1;
goto end;
}
++i;
@@ -263,6 +269,74 @@ class var_expr_t : public bin_expr_t {
const std::string name_;
};
// Base class for aggregate functions of the form fname(vname,vnum).
//
// The function operates on the indexed variables vname[0] .. vname[vnum-1];
// one var_expr_t is created for each of them. Derived classes supply Eval().
class fun_expr_t : public bin_expr_t {
 public:
  typedef std::vector<var_expr_t> vvect_t;

  // @param fname  function name used by Symbol()
  // @param vname  base name of the indexed variables
  // @param vnum   number of indexed variables to create
  fun_expr_t(const std::string& fname, const std::string& vname, const uint32_t& vnum)
      : fname_(fname), vname_(vname) {
    for (uint32_t i = 0; i < vnum; ++i) {
      std::ostringstream var_full_name;
      var_full_name << vname << "[" << i << "]";
      vvect.push_back(var_expr_t(var_full_name.str()));
    }
  }

  // The indexed variables this function aggregates over.
  const vvect_t& GetVars() const { return vvect; }

  // Textual form "fname(vname,count)".
  // The base name is kept as a member rather than being recovered from the
  // first variable, so this is well defined for an empty variable list too.
  std::string Symbol() const {
    std::ostringstream oss;
    oss << fname_ << "(" << vname_ << "," << vvect.size() << ")";
    return oss.str();
  }

 private:
  const std::string fname_;
  const std::string vname_;
  vvect_t vvect;
};
// sum(vname,vnum): adds up the values of all indexed variables.
class sum_expr_t : public fun_expr_t {
 public:
  sum_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("sum", vname, vnum) {}

  // Returns the sum of Eval(args) over every variable; 0 when there are none.
  args_t Eval(const args_cache_t& args) const {
    const vvect_t& vars = GetVars();
    args_t total = 0;
    for (vvect_t::const_iterator it = vars.begin(); it != vars.end(); ++it) {
      total += it->Eval(args);
    }
    return total;
  }
};
// avr(vname,vnum): integer average of the values of all indexed variables.
class avr_expr_t : public fun_expr_t {
 public:
  avr_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("avr", vname, vnum) {}

  // Returns the sum of Eval(args) over every variable divided by the number
  // of variables (integer division). With no variables the result is 0
  // instead of an integer division by zero.
  args_t Eval(const args_cache_t& args) const {
    const vvect_t& vars = GetVars();
    if (vars.empty()) return 0;

    args_t result = 0;
    for (const auto& var : vars) result += var.Eval(args);
    return result / vars.size();
  }
};
// min(vname,vnum): smallest value among all indexed variables.
class min_expr_t : public fun_expr_t {
 public:
  min_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("min", vname, vnum) {}

  // Returns the minimum of Eval(args) over every variable; ARGS_MAX when
  // there are none.
  args_t Eval(const args_cache_t& args) const {
    args_t lowest = ARGS_MAX;
    for (const auto& var : GetVars()) {
      const args_t current = var.Eval(args);
      if (current < lowest) lowest = current;
    }
    return lowest;
  }
};
// max(vname,vnum): largest value among all indexed variables.
class max_expr_t : public fun_expr_t {
 public:
  max_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("max", vname, vnum) {}

  // Returns the maximum of Eval(args) over every variable; 0 when there are
  // none.
  args_t Eval(const args_cache_t& args) const {
    args_t highest = 0;
    for (const auto& var : GetVars()) {
      const args_t current = var.Eval(args);
      if (current > highest) highest = current;
    }
    return highest;
  }
};
inline const bin_expr_t* bin_expr_t::CreateExpr(const bin_expr_t* arg1, const bin_expr_t* arg2,
const char op) {
const bin_expr_t* expr = NULL;
@@ -285,11 +359,41 @@ inline const bin_expr_t* bin_expr_t::CreateExpr(const bin_expr_t* arg1, const bi
inline const bin_expr_t* bin_expr_t::CreateArg(Expr* obj, const std::string str) {
const bin_expr_t* arg = NULL;
const unsigned i = strspn(str.c_str(), "1234567890");
if (i == str.length()) {
const unsigned value = atoi(str.c_str());
arg = new const_expr_t(value);
} else {
}
if (arg == NULL) {
const std::size_t pos = str.find('(');
if (pos != std::string::npos) {
char* fname = NULL;
char* vname = NULL;
int vnum = 0;
int ret = sscanf(str.c_str(), "%m[a-zA-Z_](%m[0-9a-zA-Z_],%d)", &fname, &vname, &vnum);
if (ret == 3) {
const std::string fun_name(fname);
const fun_expr_t* farg = NULL;
if (fun_name == "sum") {
farg = new sum_expr_t(vname, vnum);
} else if (fun_name == "avr") {
farg = new avr_expr_t(vname, vnum);
} else if (fun_name == "min") {
farg = new min_expr_t(vname, vnum);
} else if (fun_name == "max") {
farg = new max_expr_t(vname, vnum);
}
if (farg) for (const auto& var : farg->GetVars()) obj->AddVar(var.Symbol());
arg = farg;
}
free(fname);
free(vname);
}
}
if (arg == NULL) {
const std::string sub_expr = obj->Lookup(str);
if (sub_expr.empty()) {
arg = new var_expr_t(str);
@@ -299,6 +403,7 @@ inline const bin_expr_t* bin_expr_t::CreateArg(Expr* obj, const std::string str)
arg = expr->GetTree();
}
}
return arg;
}
+63 -8
Просмотреть файл
@@ -36,11 +36,29 @@ class Xml {
return xml;
}
void AddExpr(const std::string& full_tag, const std::string& name, const std::string& expr) {
const std::size_t pos = full_tag.rfind('.');
const std::size_t pos1 = (pos == std::string::npos) ? 0 : pos + 1;
const std::string level_tag = full_tag.substr(pos1);
level_t* level = new level_t;
map_[full_tag].push_back(level);
level->tag = level_tag;
level->opts["name"] = name;
level->opts["expr"] = expr;
}
// Adds a constant as an expression node: 'val' is formatted as decimal text
// and passed to AddExpr() as the expression.
//
// @param full_tag  dot-separated tag path, forwarded to AddExpr()
// @param name      constant name, forwarded to AddExpr()
// @param val       constant value
void AddConst(const std::string& full_tag, const std::string& name, const uint64_t& val) {
  std::ostringstream value_text;
  value_text << val;
  const std::string expr = value_text.str();
  AddExpr(full_tag, name, expr);
}
static void Destroy(Xml *xml) { delete xml; }
std::vector<level_t*> GetNodes(std::string global_tag) { return map_[global_tag]; }
void Print() const {
std::cout << "XML file '" << file_name_ << "':" << std::endl;
for (auto& elem : map_) {
for (auto node : elem.second) {
if (node->opts.size()) {
@@ -74,13 +92,14 @@ class Xml {
while (1) {
token_t token = (remainder.size()) ? remainder : NextToken();
remainder.clear();
// token_t token1 = token;
// token1.push_back('\0');
// std::cout << "> " << &token1[0] << std::endl;
// End of file
if (token.size() == 0) break;
// token_t token1 = token;
// token1.push_back('\0');
// std::cout << "> " << &token1[0] << std::endl;
switch (state_) {
case BODY_STATE:
if (token[0] == '<') {
@@ -146,6 +165,11 @@ class Xml {
~Xml() {}
// Reports whether the character at the current buffer position is horizontal
// whitespace. Both a blank and a tab count; the tab is written as an escape
// so the two alternatives cannot be confused with each other.
bool SpaceCheck() const {
  const char symb = buffer_[index_];
  return (symb == ' ') || (symb == '\t');
}
bool LineEndCheck() {
bool found = false;
if (buffer_[index_] == '\n') {
@@ -162,24 +186,55 @@ class Xml {
token_t NextToken() {
token_t token;
bool in_string = false;
bool special_symb = false;
while (1) {
if (data_size_ == 0) {
data_size_ = read(fd_, buffer_, buf_size_);
if (data_size_ <= 0) break;
}
if (token.empty())
while ((index_ < data_size_) && ((buffer_[index_] == ' ') || LineEndCheck())) {
while ((index_ < data_size_) && (SpaceCheck() || LineEndCheck())) {
++index_;
}
while ((index_ < data_size_) && (buffer_[index_] != ' ') && !LineEndCheck()) {
token.push_back(buffer_[index_++]);
while ((index_ < data_size_) && (in_string || !(SpaceCheck() || LineEndCheck()))) {
const char symb = buffer_[index_];
bool skip_symb = false;
switch (symb) {
case '\\':
if (special_symb) special_symb = false;
else {
special_symb = true;
skip_symb = true;
}
break;
case '"':
if (special_symb) special_symb = false;
else {
in_string = !in_string;
if (!in_string) {
buffer_[index_] = ' ';
--index_;
}
skip_symb = true;
}
break;
}
if (!skip_symb) token.push_back(symb);
++index_;
}
if (index_ == data_size_) {
index_ = 0;
data_size_ = 0;
} else
} else {
if (special_symb || in_string) BadFormat(token);
break;
}
}
return token;
+1 -1
Просмотреть файл
@@ -42,7 +42,7 @@ class TestAql {
}
TestAql* Test() { return test_; }
virtual AgentInfo* GetAgentInfo() { return (test_) ? test_->GetAgentInfo() : 0; }
virtual const AgentInfo* GetAgentInfo() { return (test_) ? test_->GetAgentInfo() : 0; }
virtual hsa_queue_t* GetQueue() { return (test_) ? test_->GetQueue() : 0; }
virtual HsaRsrcFactory* GetRsrcFactory() { return (test_) ? test_->GetRsrcFactory() : 0; }
+2 -2
Просмотреть файл
@@ -34,7 +34,7 @@ OF THE POSSIBILITY OF SUCH DAMAGE.
#include "util/hsa_rsrc_factory.h"
HsaRsrcFactory* TestHsa::hsa_rsrc_ = NULL;
AgentInfo* TestHsa::agent_info_ = NULL;
const AgentInfo* TestHsa::agent_info_ = NULL;
hsa_queue_t* TestHsa::hsa_queue_ = NULL;
uint32_t TestHsa::agent_id_ = 0;
@@ -43,7 +43,7 @@ HsaRsrcFactory* TestHsa::HsaInstantiate(const uint32_t agent_ind) {
if (hsa_rsrc_ == NULL) {
agent_id_ = agent_ind;
hsa_rsrc_ = HsaRsrcFactory::Create();
hsa_rsrc_ = HsaRsrcFactory::CreateInstance();
// Print properties of the agents
hsa_rsrc_->PrintGpuAgents("> GPU agents");
+2 -2
Просмотреть файл
@@ -52,7 +52,7 @@ class TestHsa : public TestAql {
}
// Get methods for Agent Info, HSA queue, HSA Resource Manager
AgentInfo* GetAgentInfo() { return agent_info_; }
const AgentInfo* GetAgentInfo() { return agent_info_; }
hsa_queue_t* GetQueue() { return hsa_queue_; }
HsaRsrcFactory* GetRsrcFactory() { return hsa_rsrc_; }
@@ -115,7 +115,7 @@ class TestHsa : public TestAql {
static uint32_t agent_id_;
// Handle to an Hsa Gpu Agent
static AgentInfo* agent_info_;
static const AgentInfo* agent_info_;
// Handle to an Hsa Queue
static hsa_queue_t* hsa_queue_;
+1
Просмотреть файл
@@ -371,6 +371,7 @@ CONSTRUCTOR_API void constructor()
fprintf(stderr, "Input file not found '%s'\n", xml_name);
exit(1);
}
xml->Print();
// Getting metrics
auto metrics_list = xml->GetNodes("top.metric");
+203 -66
Просмотреть файл
@@ -8,11 +8,23 @@
<metric name=SQ_INSTS_VALU block=SQ event=26 ></metric>
<metric name=SQ_INSTS_VMEM_WR block=SQ event=27 ></metric>
<metric name=SQ_INSTS_VMEM_RD block=SQ event=28 ></metric>
<metric name=SQ_INSTS_SALU block=SQ event=30 ></metric>
<metric name=SQ_INSTS_SMEM block=SQ event=31 ></metric>
<metric name=SQ_INSTS_FLAT block=SQ event=32 ></metric>
<metric name=SQ_INSTS_FLAT_LDS_ONLY block=SQ event=33 ></metric>
<metric name=SQ_INSTS_LDS block=SQ event=34 ></metric>
<metric name=SQ_INSTS_GDS block=SQ event=35 ></metric>
<metric name=SQ_WAVE_READY block=SQ event=47 ></metric>
<metric name=SQ_WAIT_INST_LDS block=SQ event=61 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
<metric name=SQ_ACTIVE_INST_VALU block=SQ event=69 descr="Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic)"></metric>
<metric name=SQ_INST_CYCLES_SALU block=SQ event=86 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated)"></metric>
<metric name=SQ_THREAD_CYCLES_VALU block=SQ event=89 ></metric>
<metric name=SQ_THREAD_CYCLES_VALU_MAX block=SQ event=90 ></metric>
<metric name=SQ_LDS_BANK_CONFLICT block=SQ event=97 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"></metric>
<metric name=TA_BUSY block=TA event=15 ></metric>
<metric name=TA_FLAT_READ_WAVEFRONTS block=TA event=101 ></metric>
<metric name=TA_FLAT_WRITE_WAVEFRONTS block=TA event=102 ></metric>
@@ -22,37 +34,32 @@
<metric name=TCC_HIT block=TCC event=18 ></metric>
<metric name=TCC_MISS block=TCC event=19 ></metric>
<metric name=TCC_WRITEBACK block=TCC event=22 ></metric>
<metric name=TCC_EA_WRREQ block=TCC event=26 ></metric>
<metric name=TCC_EA_WRREQ_64B block=TCC event=27 ></metric>
<metric name=TCC_EA_WRREQ_STALL block=TCC event=30 ></metric>
<metric name=TCC_MC_RDREQ block=TCC event=35 ></metric>
<metric name=TCP_TA_DATA_STALL_CYCLES block=TCP event=3 descr="TCP stalls TA data interface. Now Windowed."></metric>
<metric name=CPC_ALWAYS_COUNT block=CPC event=0 ></metric>
<metric name=CPC_ME1_STALL_WAIT_ON_RCIU_READ block=CPC event=8 ></metric>
# average for (16 instances x 4 shader engines)
<metric
name=TA_BUSY_avr
expr=(TA_BUSY[0]+TA_BUSY[1]+TA_BUSY[2]+TA_BUSY[3]+TA_BUSY[4]+TA_BUSY[5]+TA_BUSY[6]+TA_BUSY[7]+TA_BUSY[8]+TA_BUSY[9]+TA_BUSY[10]+TA_BUSY[11]+TA_BUSY[12]+TA_BUSY[13]+TA_BUSY[14]+TA_BUSY[15])/(16*4)
></metric>
# sun for 16 instances
<metric
name=TA_FLAT_WRITE_WAVEFRONTS_sum
expr=TA_FLAT_WRITE_WAVEFRONTS[0]+TA_FLAT_WRITE_WAVEFRONTS[1]+TA_FLAT_WRITE_WAVEFRONTS[2]+TA_FLAT_WRITE_WAVEFRONTS[3]+TA_FLAT_WRITE_WAVEFRONTS[4]+TA_FLAT_WRITE_WAVEFRONTS[5]+TA_FLAT_WRITE_WAVEFRONTS[6]+TA_FLAT_WRITE_WAVEFRONTS[7]+TA_FLAT_WRITE_WAVEFRONTS[8]+TA_FLAT_WRITE_WAVEFRONTS[9]+TA_FLAT_WRITE_WAVEFRONTS[10]+TA_FLAT_WRITE_WAVEFRONTS[11]+TA_FLAT_WRITE_WAVEFRONTS[12]+TA_FLAT_WRITE_WAVEFRONTS[13]+TA_FLAT_WRITE_WAVEFRONTS[14]+TA_FLAT_WRITE_WAVEFRONTS[15]
></metric>
<metric
name=TCC_HIT_sum
expr=TCC_HIT[0]+TCC_HIT[1]+TCC_HIT[2]+TCC_HIT[3]+TCC_HIT[4]+TCC_HIT[5]+TCC_HIT[6]+TCC_HIT[7]+TCC_HIT[8]+TCC_HIT[9]+TCC_HIT[10]+TCC_HIT[11]+TCC_HIT[12]+TCC_HIT[13]+TCC_HIT[14]+TCC_HIT[15]
></metric>
<metric
name=TCC_MISS_sum
expr=TCC_MISS[0]+TCC_MISS[1]+TCC_MISS[2]+TCC_MISS[3]+TCC_MISS[4]+TCC_MISS[5]+TCC_MISS[6]+TCC_MISS[7]+TCC_MISS[8]+TCC_MISS[9]+TCC_MISS[10]+TCC_MISS[11]+TCC_MISS[12]+TCC_MISS[13]+TCC_MISS[14]+TCC_MISS[15]
></metric>
<metric
name=TCC_MC_RDREQ_sum
expr=TCC_MC_RDREQ[0]+TCC_MC_RDREQ[1]+TCC_MC_RDREQ[2]+TCC_MC_RDREQ[3]+TCC_MC_RDREQ[4]+TCC_MC_RDREQ[5]+TCC_MC_RDREQ[6]+TCC_MC_RDREQ[7]+TCC_MC_RDREQ[8]+TCC_MC_RDREQ[9]+TCC_MC_RDREQ[10]+TCC_MC_RDREQ[11]+TCC_MC_RDREQ[12]+TCC_MC_RDREQ[13]+TCC_MC_RDREQ[14]+TCC_MC_RDREQ[15]
></metric>
<metric name="TA_BUSY_avr" expr=avr(TA_BUSY,16)/4 ></metric>
# sum for 16 instances
<metric name="TA_FLAT_READ_WAVEFRONTS_sum" expr=sum(TA_FLAT_READ_WAVEFRONTS,16) ></metric>
<metric name="TA_FLAT_WRITE_WAVEFRONTS_sum" expr=sum(TA_FLAT_WRITE_WAVEFRONTS,16) ></metric>
<metric name="TCC_HIT_sum" expr=sum(TCC_HIT,16) ></metric>
<metric name="TCC_MISS_sum" expr=sum(TCC_MISS,16) ></metric>
<metric name="TCC_MC_RDREQ_sum" expr=sum(TCC_MC_RDREQ,16) ></metric>
# FETCH_SIZE, kilobytes
# The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
<metric name=FETCH_SIZE expr=(TCC_MC_RDREQ_sum*32)/1024 ></metric>
<metric
name="FETCH_SIZE"
expr=(TCC_MC_RDREQ_sum*32)/1024
descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
></metric>
</gfx8>
<gfx9>
@@ -63,13 +70,25 @@
<metric name=SQ_WAVES block=SQ event=4 ></metric>
<metric name=SQ_ITEMS block=SQ event=14 ></metric>
<metric name=SQ_INSTS_VALU block=SQ event=26 ></metric>
<metric name=SQ_INSTS_VMEM_WR block=SQ event=30 ></metric>
<metric name=SQ_INSTS_VMEM_RD block=SQ event=31 ></metric>
<metric name=SQ_INSTS_VMEM_WR block=SQ event=27 ></metric>
<metric name=SQ_INSTS_VMEM_RD block=SQ event=28 ></metric>
<metric name=SQ_INSTS_SALU block=SQ event=30 ></metric>
<metric name=SQ_INSTS_SMEM block=SQ event=31 ></metric>
<metric name=SQ_INSTS_FLAT block=SQ event=32 ></metric>
<metric name=SQ_INSTS_FLAT_LDS_ONLY block=SQ event=33 ></metric>
<metric name=SQ_INSTS_LDS block=SQ event=34 ></metric>
<metric name=SQ_INSTS_GDS block=SQ event=35 ></metric>
<metric name=SQ_WAVE_READY block=SQ event=47 ></metric>
<metric name=SQ_WAIT_INST_LDS block=SQ event=63 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
<metric name=SQ_ACTIVE_INST_VALU block=SQ event=72 descr="regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic)"></metric>
<metric name=SQ_INST_CYCLES_SALU block=SQ event=84 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated)"></metric>
<metric name=SQ_THREAD_CYCLES_VALU block=SQ event=85 ></metric>
<metric name=SQ_THREAD_CYCLES_VALU_MAX block=SQ event=86 ></metric>
<metric name=SQ_LDS_BANK_CONFLICT block=SQ event=93 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"></metric>
<metric name=TA_BUSY block=TA event=15 ></metric>
<metric name=TA_FLAT_READ_WAVEFRONTS block=TA event=101 ></metric>
<metric name=TA_FLAT_WRITE_WAVEFRONTS block=TA event=102 ></metric>
@@ -86,60 +105,178 @@
<metric name=CPC_ME1_STALL_WAIT_ON_RCIU_READ block=CPC event=8 ></metric>
# average for (16 instances x 4 shader engines)
<metric
name=TA_BUSY_avr
expr=(TA_BUSY[0]+TA_BUSY[1]+TA_BUSY[2]+TA_BUSY[3]+TA_BUSY[4]+TA_BUSY[5]+TA_BUSY[6]+TA_BUSY[7]+TA_BUSY[8]+TA_BUSY[9]+TA_BUSY[10]+TA_BUSY[11]+TA_BUSY[12]+TA_BUSY[13]+TA_BUSY[14]+TA_BUSY[15])/(16*4)
></metric>
<metric name="TA_BUSY_avr" expr=avr(TA_BUSY,16)/4 ></metric>
# sum for 16 instances
<metric
name=TA_FLAT_WRITE_WAVEFRONTS_sum
expr=TA_FLAT_WRITE_WAVEFRONTS[0]+TA_FLAT_WRITE_WAVEFRONTS[1]+TA_FLAT_WRITE_WAVEFRONTS[2]+TA_FLAT_WRITE_WAVEFRONTS[3]+TA_FLAT_WRITE_WAVEFRONTS[4]+TA_FLAT_WRITE_WAVEFRONTS[5]+TA_FLAT_WRITE_WAVEFRONTS[6]+TA_FLAT_WRITE_WAVEFRONTS[7]+TA_FLAT_WRITE_WAVEFRONTS[8]+TA_FLAT_WRITE_WAVEFRONTS[9]+TA_FLAT_WRITE_WAVEFRONTS[10]+TA_FLAT_WRITE_WAVEFRONTS[11]+TA_FLAT_WRITE_WAVEFRONTS[12]+TA_FLAT_WRITE_WAVEFRONTS[13]+TA_FLAT_WRITE_WAVEFRONTS[14]+TA_FLAT_WRITE_WAVEFRONTS[15]
></metric>
<metric
name=TCC_HIT_sum
expr=TCC_HIT[0]+TCC_HIT[1]+TCC_HIT[2]+TCC_HIT[3]+TCC_HIT[4]+TCC_HIT[5]+TCC_HIT[6]+TCC_HIT[7]+TCC_HIT[8]+TCC_HIT[9]+TCC_HIT[10]+TCC_HIT[11]+TCC_HIT[12]+TCC_HIT[13]+TCC_HIT[14]+TCC_HIT[15]
></metric>
<metric
name=TCC_MISS_sum
expr=TCC_MISS[0]+TCC_MISS[1]+TCC_MISS[2]+TCC_MISS[3]+TCC_MISS[4]+TCC_MISS[5]+TCC_MISS[6]+TCC_MISS[7]+TCC_MISS[8]+TCC_MISS[9]+TCC_MISS[10]+TCC_MISS[11]+TCC_MISS[12]+TCC_MISS[13]+TCC_MISS[14]+TCC_MISS[15]
></metric>
<metric
name=TCC_EA_RDREQ_sum
expr=TCC_EA_RDREQ[0]+TCC_EA_RDREQ[1]+TCC_EA_RDREQ[2]+TCC_EA_RDREQ[3]+TCC_EA_RDREQ[4]+TCC_EA_RDREQ[5]+TCC_EA_RDREQ[6]+TCC_EA_RDREQ[7]+TCC_EA_RDREQ[8]+TCC_EA_RDREQ[9]+TCC_EA_RDREQ[10]+TCC_EA_RDREQ[11]+TCC_EA_RDREQ[12]+TCC_EA_RDREQ[13]+TCC_EA_RDREQ[14]+TCC_EA_RDREQ[15]
></metric>
<metric
name=TCC_EA_RDREQ_32B_sum
expr=TCC_EA_RDREQ_32B[0]+TCC_EA_RDREQ_32B[1]+TCC_EA_RDREQ_32B[2]+TCC_EA_RDREQ_32B[3]+TCC_EA_RDREQ_32B[4]+TCC_EA_RDREQ_32B[5]+TCC_EA_RDREQ_32B[6]+TCC_EA_RDREQ_32B[7]+TCC_EA_RDREQ_32B[8]+TCC_EA_RDREQ_32B[9]+TCC_EA_RDREQ_32B[10]+TCC_EA_RDREQ_32B[11]+TCC_EA_RDREQ_32B[12]+TCC_EA_RDREQ_32B[13]+TCC_EA_RDREQ_32B[14]+TCC_EA_RDREQ_32B[15]
></metric>
<metric name="TA_FLAT_READ_WAVEFRONTS_sum" expr=sum(TA_FLAT_READ_WAVEFRONTS,16) ></metric>
<metric name="TA_FLAT_WRITE_WAVEFRONTS_sum" expr=sum(TA_FLAT_WRITE_WAVEFRONTS,16) ></metric>
<metric name=TCC_HIT_sum expr=sum(TCC_HIT,16) ></metric>
<metric name=TCC_MISS_sum expr=sum(TCC_MISS,16) ></metric>
<metric name=TCC_EA_RDREQ_sum expr=sum(TCC_EA_RDREQ,16) ></metric>
<metric name=TCC_EA_RDREQ_32B_sum expr=sum(TCC_EA_RDREQ_32B,16) ></metric>
# FETCH_SIZE, kilobytes
# The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
<metric name=FETCH_SIZE expr=((TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64+TCC_EA_RDREQ_32B_sum*32)/1024 ></metric>
<metric
name="FETCH_SIZE"
expr=((TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64+TCC_EA_RDREQ_32B_sum*32)/1024
descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
></metric>
</gfx9>
<global>
# GPU_BUSY, percentage
# The percentage of time GPU was busy.
<metric name=GPU_BUSY expr=100*GRBM_GUI_ACTIVE/GRBM_COUNT ></metric>
<metric
name=GPU_BUSY
expr=100*GRBM_GUI_ACTIVE/GRBM_COUNT
descr="The percentage of time GPU was busy."
></metric>
# MEM_BUSY, percentage
# The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound).
<metric name=MEM_BUSY expr=100*TA_BUSY_avr/GRBM_GUI_ACTIVE ></metric>
# Wavefronts Total wavefronts.
<metric
name="Wavefronts"
expr=SQ_WAVES
descr="Total wavefronts."
></metric>
# VWRITE_INSTS
# The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory.
<metric name=VWRITE_INSTS expr=(SQ_INSTS_VMEM_WR-TA_FLAT_WRITE_WAVEFRONTS_sum)/SQ_WAVES ></metric>
# VALUInsts The average number of vector ALU instructions executed per work-item (affected by flow control).
<metric
name="VALUInsts"
expr=SQ_INSTS_VALU/SQ_WAVES
descr="The average number of vector ALU instructions executed per work-item (affected by flow control)."
></metric>
# SFETCH_INSTS
# The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control).
<metric name=SFETCH_INSTS expr=SQ_INSTS_SMEM/SQ_WAVES ></metric>
# SALUInsts The average number of scalar ALU instructions executed per work-item (affected by flow control).
<metric
name="SALUInsts"
expr=SQ_INSTS_SALU/SQ_WAVES
descr="The average number of scalar ALU instructions executed per work-item (affected by flow control)."
></metric>
# VFetchInsts The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory.
<metric
name="VFetchInsts"
expr=(SQ_INSTS_VMEM_RD-TA_FLAT_READ_WAVEFRONTS_sum)/SQ_WAVES
descr="The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory."
></metric>
# VALU_INSTS
# The average number of vector ALU instructions executed per work-item (affected by flow control).
<metric name=VALU_INSTS expr=SQ_INSTS_VALU/SQ_WAVES ></metric>
# SFetchInsts The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control).
<metric
name="SFetchInsts"
expr=SQ_INSTS_SMEM/SQ_WAVES
descr="The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control)."
></metric>
# VWriteInsts The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory.
<metric
name=VWriteInsts
expr=(SQ_INSTS_VMEM_WR-TA_FLAT_WRITE_WAVEFRONTS_sum)/SQ_WAVES
descr="The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory."
></metric>
# FlatVMemInsts The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch.
<metric
name="FlatVMemInsts"
expr=(SQ_INSTS_FLAT-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES
descr="The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch."
></metric>
# LDSInsts The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS.
<metric
name="LDSInsts"
expr=(SQ_INSTS_LDS-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES
descr="The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS."
></metric>
# FlatLDSInsts The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control).
<metric
name="FlatLDSInsts"
expr=SQ_INSTS_FLAT_LDS_ONLY/SQ_WAVES
descr="The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control)."
></metric>
# GDSInsts The average number of GDS read or GDS write instructions executed per work item (affected by flow control).
<metric
name="GDSInsts"
expr=SQ_INSTS_GDS/SQ_WAVES
descr="The average number of GDS read or GDS write instructions executed per work item (affected by flow control)."
></metric>
# VALUUtilization The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence).
<metric
name="VALUUtilization"
expr=100*SQ_THREAD_CYCLES_VALU/(SQ_ACTIVE_INST_VALU*64)
descr="The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence)."
></metric>
# VALUBusy The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal).
<metric
name="VALUBusy"
expr=100*SQ_ACTIVE_INST_VALU*4/NUM_SIMDS/GRBM_GUI_ACTIVE
descr="The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."
></metric>
# SALUBusy The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal).
<metric
name="SALUBusy"
expr=100*SQ_INST_CYCLES_SALU*4/(NUM_SIMDS/NUM_SHADER_ENGINES)/GRBM_GUI_ACTIVE
descr="The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."
></metric>
# FetchSize The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
<metric
name="FetchSize"
expr=FETCH_SIZE
descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
></metric>
# WriteSize The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
<metric
name="WriteSize"
expr=((sum(TCC_EA_WRREQ,16)-sum(TCC_EA_WRREQ_64B,16))*32+sum(TCC_EA_WRREQ_64B,16)*64)/1024
descr="The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
></metric>
# L2CacheHit The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal).
<metric
name="L2CacheHit"
expr=100*sum(TCC_HIT,16)/(sum(TCC_HIT,16)+sum(TCC_MISS,16))
descr="The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal)."
></metric>
# MemUnitBusy The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound).
<metric
name="MemUnitBusy"
expr=100*max(TA_BUSY,16)/GRBM_GUI_ACTIVE/NUM_SHADER_ENGINES
descr="The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound)."
></metric>
# MemUnitStalled The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad).
<metric
name="MemUnitStalled"
expr=100*max(TCP_TA_DATA_STALL_CYCLES,16)/GRBM_GUI_ACTIVE/NUM_SHADER_ENGINES
descr="The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad)."
></metric>
# WriteUnitStalled The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad).
<metric
name="WriteUnitStalled"
expr=100*max(TCC_EA_WRREQ_STALL,16)/GRBM_GUI_ACTIVE
descr="The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad)."
></metric>
# ALUStalledByLDS The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad).
<metric
name="ALUStalledByLDS"
expr=100*SQ_WAIT_INST_LDS/SQ_WAVES/GRBM_GUI_ACTIVE/NUM_SHADER_ENGINES
descr="The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad)."
></metric>
# LDSBankConflict The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad).
<metric
name="LDSBankConflict"
expr=100*SQ_LDS_BANK_CONFLICT/GRBM_GUI_ACTIVE/NUM_SIMDS
descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."
></metric>
# L2CACHE_HIT, percentage
# The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal).
<metric name=L2CACHE_HIT expr=100*TCC_HIT_sum/(TCC_HIT_sum+TCC_MISS_sum) ></metric>
</global>
+1 -1
Просмотреть файл
@@ -21,7 +21,7 @@ export ROCP_METRICS=metrics.xml
export ROCP_INPUT=input.xml
# output directory for the tool library, for metrics results file 'results.txt'
# and SQTT trace files 'thread_trace.se<n>.out'
#export ROCP_OUTPUT_DIR=./
export ROCP_OUTPUT_DIR=./RESULTS
if [ -n "$1" ] ; then
tbin="$*"
+94 -67
Просмотреть файл
@@ -26,6 +26,7 @@ POSSIBILITY OF SUCH DAMAGE.
#include <dlfcn.h>
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <hsa_ext_finalize.h>
#include <stdint.h>
#include <stdio.h>
@@ -39,8 +40,17 @@ POSSIBILITY OF SUCH DAMAGE.
#include <string>
#include <vector>
// Agent-iteration callback: registers one system agent with the factory.
//
// @param agent Agent handle being visited
// @param data  Opaque callback argument, cast back to the HsaRsrcFactory that owns the agent lists
//
// @return HSA_STATUS_SUCCESS so that the traversal continues with the next agent,
//         HSA_STATUS_ERROR if no factory was supplied
hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) {
  HsaRsrcFactory* hsa_rsrc = reinterpret_cast<HsaRsrcFactory*>(data);
  if (hsa_rsrc == NULL) return HSA_STATUS_ERROR;

  // AddAgentInfo() yields NULL for agents that are neither CPU nor GPU (e.g. DSP).
  // Those agents are simply not tracked; reporting an error for them would abort
  // the agent traversal and hide every agent that would have been visited later.
  hsa_rsrc->AddAgentInfo(agent);
  return HSA_STATUS_SUCCESS;
}
// Callback function to find and bind kernarg region of an agent
static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data) {
hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* data) {
hsa_region_global_flag_t flags;
hsa_region_segment_t segment_id;
@@ -62,53 +72,6 @@ static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data) {
return HSA_STATUS_SUCCESS;
}
// Agent-iteration callback: builds an AgentInfo record for the visited agent and
// hands it to the HsaRsrcFactory passed through 'data'.
//
// @param agent Agent handle being visited
// @param data  Opaque callback argument, cast back to HsaRsrcFactory*
//
// @return HSA_STATUS_SUCCESS on every path that returns; a failed device-type query is
//         reported through CHECK_STATUS
static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data) {
// Recover the factory from the opaque callback argument
HsaRsrcFactory* rsrcFactory = reinterpret_cast<HsaRsrcFactory*>(data);
// Query the device type of the agent
hsa_status_t status;
hsa_device_type_t type;
status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
CHECK_STATUS("Error Calling hsa_agent_get_info", status);
// DSP agents are not tracked: skip them without stopping the traversal
if (type == HSA_DEVICE_TYPE_DSP) {
return HSA_STATUS_SUCCESS;
}
// Cpu agent: only the handle and the type are recorded. The record comes from malloc(),
// so every other AgentInfo field is left uninitialized. The malloc() result is not checked.
if (type == HSA_DEVICE_TYPE_CPU) {
AgentInfo* agent_info = reinterpret_cast<AgentInfo*>(malloc(sizeof(AgentInfo)));
agent_info->dev_id = agent;
agent_info->dev_type = HSA_DEVICE_TYPE_CPU;
rsrcFactory->AddAgentInfo(agent_info, false);
return HSA_STATUS_SUCCESS;
}
// Any remaining device type is treated as a Gpu agent, build an instance of AgentInfo
// (the malloc() result is not checked here either)
AgentInfo* agent_info = reinterpret_cast<AgentInfo*>(malloc(sizeof(AgentInfo)));
agent_info->dev_id = agent;
agent_info->dev_type = HSA_DEVICE_TYPE_GPU;
// The statuses returned by the attribute queries below are ignored
hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name);
// Wave and queue sizes are preset to 0 before being queried
agent_info->max_wave_size = 0;
hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size);
agent_info->max_queue_size = 0;
hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size);
// NOTE(review): 108 looks like a placeholder that the query below is expected to
// overwrite - confirm the intent of this particular value
agent_info->profile = hsa_profile_t(108);
hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile);
// Initialize memory regions to zero
agent_info->kernarg_region.handle = 0;
agent_info->coarse_region.handle = 0;
// Find and Bind Memory regions of the Gpu agent; the callback receives the record being filled
hsa_agent_iterate_regions(agent, FindMemRegionsCallback, agent_info);
// Save the instance of AgentInfo in the factory's Gpu list
rsrcFactory->AddAgentInfo(agent_info, true);
return HSA_STATUS_SUCCESS;
}
// Constructor of the class
HsaRsrcFactory::HsaRsrcFactory() {
// Initialize the Hsa Runtime
@@ -128,12 +91,17 @@ HsaRsrcFactory::HsaRsrcFactory() {
status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, 1, 0, &aqlprofile_api_);
#endif
CHECK_STATUS("aqlprofile API table load failed", status);
// Get Loader API table
loader_api_ = {0};
status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_);
CHECK_STATUS("loader API table query failed", status);
}
// Destructor of the class
HsaRsrcFactory::~HsaRsrcFactory() {
for (auto p : cpu_list_) free(p);
for (auto p : gpu_list_) free(p);
for (auto p : cpu_list_) free(const_cast<AgentInfo*>(p));
for (auto p : gpu_list_) free(const_cast<AgentInfo*>(p));
printf("HSA shutdown\n");
hsa_status_t status = hsa_shut_down();
@@ -173,6 +141,68 @@ hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) {
return HSA_STATUS_SUCCESS;
}
// Add system agent info
//
// Queries the device type of 'agent' and, for CPU and GPU agents, creates an
// AgentInfo record, appends it to the matching list and registers it in the
// agent map under the agent handle. GPU records are additionally filled with
// the agent name, GFXIP prefix, wave/queue limits, profile, compute topology
// and memory regions.
//
// @param agent HSA agent handle to register
//
// @return pointer to the stored record, NULL if the agent is neither a CPU nor a GPU
const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) {
  hsa_status_t status;
  hsa_device_type_t type;
  status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
  CHECK_STATUS("Error Calling hsa_agent_get_info", status);

  // Only CPU and GPU agents are tracked
  if ((type != HSA_DEVICE_TYPE_CPU) && (type != HSA_DEVICE_TYPE_GPU)) return NULL;

  // The destructor releases the records with free(), so they have to come from the
  // malloc family rather than from operator new. calloc() also zero-fills the record,
  // which keeps every field that is not set below at 0.
  AgentInfo* agent_info = reinterpret_cast<AgentInfo*>(calloc(1, sizeof(AgentInfo)));
  status = (agent_info != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  CHECK_STATUS("AgentInfo allocation failed", status);
  if (agent_info == NULL) return NULL;

  agent_info->dev_id = agent;

  if (type == HSA_DEVICE_TYPE_CPU) {
    agent_info->dev_type = HSA_DEVICE_TYPE_CPU;
    // Set CPU index
    agent_info->dev_index = cpu_list_.size();
    cpu_list_.push_back(agent_info);
  } else {
    agent_info->dev_type = HSA_DEVICE_TYPE_GPU;
    hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name);
    // GFXIP name is the first four characters of the agent name
    strncpy(agent_info->gfxip, agent_info->name, 4);
    agent_info->gfxip[4] = '\0';
    hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size);
    hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size);
    hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile);
    // An agent reporting the full profile is flagged as an APU
    agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL);
    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT),
                       &agent_info->cu_num);
    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU),
                       &agent_info->waves_per_cu);
    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU),
                       &agent_info->simds_per_cu);
    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES),
                       &agent_info->se_num);
    hsa_agent_get_info(agent,
                       static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE),
                       &agent_info->shader_arrays_per_se);

    // Initialize memory regions to zero
    agent_info->kernarg_region.handle = 0;
    agent_info->coarse_region.handle = 0;

    // Find and Bind Memory regions of the Gpu agent
    hsa_agent_iterate_regions(agent, FindMemRegionsCallback, agent_info);

    // Set GPU index
    agent_info->dev_index = gpu_list_.size();
    gpu_list_.push_back(agent_info);
  }

  // Make the record reachable by agent handle
  agent_map_[agent.handle] = agent_info;

  return agent_info;
}
// Return system agent info
//
// @param agent HSA agent handle to look up
//
// @return record stored in the agent map for this handle, NULL if there is none
const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) {
  const auto found = agent_map_.find(agent.handle);
  return (found != agent_map_.end()) ? found->second : NULL;
}
// Get the count of Hsa Gpu Agents available on the platform
//
// @return uint32_t Number of Gpu agents on platform
@@ -193,7 +223,7 @@ uint32_t HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size(
//
// @return bool true if successful, false otherwise
//
bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info) {
bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) {
// Determine if request is valid
uint32_t size = uint32_t(gpu_list_.size());
if (idx >= size) {
@@ -202,6 +232,7 @@ bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info) {
// Copy AgentInfo from specified index
*agent_info = gpu_list_[idx];
return true;
}
@@ -213,7 +244,7 @@ bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info) {
//
// @return bool true if successful, false otherwise
//
bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info) {
bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) {
// Determine if request is valid
uint32_t size = uint32_t(cpu_list_.size());
if (idx >= size) {
@@ -236,7 +267,8 @@ bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info) {
//
// @return bool true if successful, false otherwise
//
bool HsaRsrcFactory::CreateQueue(AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) {
bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts,
hsa_queue_t** queue) {
hsa_status_t status;
status = hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
UINT32_MAX, UINT32_MAX, queue);
@@ -324,7 +356,7 @@ bool HsaRsrcFactory::TransferData(void* dest_buff, void* src_buff, uint32_t leng
//
// @return bool true if successful, false otherwise
//
void* HsaRsrcFactory::LoadAndFinalize(AgentInfo* agent_info, const char* brig_path,
void* HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
const char* kernel_name, hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc) {
// Finalize the Hsail object into code object
hsa_status_t status;
@@ -387,32 +419,27 @@ void* HsaRsrcFactory::LoadAndFinalize(AgentInfo* agent_info, const char* brig_pa
return code_buf;
}
// Store an AgentInfo record: in the Gpu list when 'gpu' is true, in the Cpu list otherwise
void HsaRsrcFactory::AddAgentInfo(AgentInfo* agent_info, bool gpu) {
  auto& target_list = gpu ? gpu_list_ : cpu_list_;
  target_list.push_back(agent_info);
}
// Print the various fields of Hsa Gpu Agents
//
// Writes 'header' followed by one attribute dump per entry of the Gpu list to std::clog.
//
// @param header Title line written before the per-agent dump
//
// @return bool always true
bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) {
  std::clog << header << " :" << std::endl;

  const uint32_t size = uint32_t(gpu_list_.size());
  for (uint32_t idx = 0; idx < size; idx++) {
    const AgentInfo* agent_info = gpu_list_[idx];

    std::clog << "> agent[" << idx << "] :" << std::endl;
    std::clog << ">> Name : " << agent_info->name << std::endl;
    std::clog << ">> APU : " << agent_info->is_apu << std::endl;
    std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl;
    std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl;
    std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl;
    // The label announces the kernarg region, so print that region's handle
    std::clog << ">> Kernarg Region Id : " << agent_info->kernarg_region.handle << std::endl;
    std::clog << ">> CU number : " << agent_info->cu_num << std::endl;
    std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl;
    std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl;
    std::clog << ">> SE number : " << agent_info->se_num << std::endl;
    std::clog << ">> Shader Arrays per SE : " << agent_info->shader_arrays_per_se << std::endl;
  }
  return true;
}
+65 -17
Просмотреть файл
@@ -28,6 +28,7 @@ POSSIBILITY OF SUCH DAMAGE.
#include <hsa.h>
#include <hsa_ext_finalize.h>
#include <hsa_ven_amd_aqlprofile.h>
#include <hsa_ven_amd_loader.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
@@ -35,6 +36,7 @@ POSSIBILITY OF SUCH DAMAGE.
#include <iostream>
#include <mutex>
#include <map>
#include <string>
#include <vector>
@@ -52,6 +54,7 @@ POSSIBILITY OF SUCH DAMAGE.
static const unsigned MEM_PAGE_BYTES = 0x1000;
static const unsigned MEM_PAGE_MASK = MEM_PAGE_BYTES - 1;
typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t;
// Encapsulates information about a Hsa Agent such as its
// handle, name, max queue size, max wavefront size, etc.
@@ -62,6 +65,15 @@ struct AgentInfo {
// Agent type - Cpu = 0, Gpu = 1 or Dsp = 2
uint32_t dev_type;
// APU flag
bool is_apu;
// Agent system index
uint32_t dev_index;
// GFXIP name
char gfxip[64];
// Name of Agent whose length is less than 64
char name[64];
@@ -79,31 +91,52 @@ struct AgentInfo {
// Memory region supporting kernel arguments
hsa_region_t kernarg_region;
// The number of compute unit available in the agent.
uint32_t cu_num;
// Maximum number of waves possible in a Compute Unit.
uint32_t waves_per_cu;
// Number of SIMD's per compute unit CU
uint32_t simds_per_cu;
// Number of Shader Engines (SE) in Gpu
uint32_t se_num;
// Number of Shader Arrays Per Shader Engines in Gpu
uint32_t shader_arrays_per_se;
};
class HsaRsrcFactory {
public:
typedef std::recursive_mutex mutex_t;
static HsaRsrcFactory* Create() {
static HsaRsrcFactory* Create() { return NULL; }
static HsaRsrcFactory* CreateInstance() {
std::lock_guard<mutex_t> lck(mutex_);
if (HsaRsrcFactory::instance_ == NULL) {
HsaRsrcFactory::instance_ = new HsaRsrcFactory();
if (instance_ == NULL) {
instance_ = new HsaRsrcFactory();
}
return instance_;
}
// Return the singleton, creating it on first use.
// A missing instance is reported through CHECK_STATUS before the pointer is dereferenced.
static HsaRsrcFactory& Instance() {
  CreateInstance();
  hsa_status_t status = HSA_STATUS_SUCCESS;
  if (instance_ == NULL) status = HSA_STATUS_ERROR;
  CHECK_STATUS("HsaRsrcFactory::Instance() is not found", status);
  return *instance_;
}
// Delete the singleton, if any, and reset the instance pointer.
// Runs under the same mutex CreateInstance() takes.
static void Destroy() {
  std::lock_guard<mutex_t> guard(mutex_);
  delete instance_;  // deleting a null pointer is a no-op
  instance_ = NULL;
}
// Return the singleton; it is not created here, so a missing instance is
// reported through CHECK_STATUS before the pointer is dereferenced.
static HsaRsrcFactory& Instance() {
  hsa_status_t status = HSA_STATUS_SUCCESS;
  if (instance_ == NULL) status = HSA_STATUS_ERROR;
  CHECK_STATUS("HsaRsrcFactory::Instance()", status);
  return *instance_;
}
// Return system agent info
const AgentInfo* GetAgentInfo(const hsa_agent_t agent);
// Get the count of Hsa Gpu Agents available on the platform
//
@@ -125,7 +158,7 @@ class HsaRsrcFactory {
//
// @return bool true if successful, false otherwise
//
bool GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info);
bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info);
// Get the AgentInfo handle of a Cpu device
//
@@ -135,7 +168,7 @@ class HsaRsrcFactory {
//
// @return bool true if successful, false otherwise
//
bool GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info);
bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info);
// Create a Queue object and return its handle. The queue object is expected
// to support user requested number of Aql dispatch packets.
@@ -148,7 +181,7 @@ class HsaRsrcFactory {
//
// @return bool true if successful, false otherwise
//
bool CreateQueue(AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue);
bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue);
// Create a Signal object and return its handle.
//
@@ -198,12 +231,9 @@ class HsaRsrcFactory {
//
// @return code buffer, non NULL if successful, NULL otherwise
//
void* LoadAndFinalize(AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
void* LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc);
// Add an instance of AgentInfo representing a Hsa Gpu agent
void AddAgentInfo(AgentInfo* agent_info, bool gpu);
// Print the various fields of Hsa Gpu Agents
bool PrintGpuAgents(const std::string& header);
@@ -214,7 +244,16 @@ class HsaRsrcFactory {
typedef hsa_ven_amd_aqlprofile_1_00_pfn_t aqlprofile_pfn_t;
const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; }
// Return Loader API table
const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; }
private:
// System agents iterating callback
static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data);
// Callback function to find and bind kernarg region of an agent
static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data);
// Load AQL profile HSA extension library directly
static hsa_status_t LoadAqlProfileLib(aqlprofile_pfn_t* api);
@@ -225,17 +264,26 @@ class HsaRsrcFactory {
// Destructor of the class
~HsaRsrcFactory();
// Add an instance of AgentInfo representing a Hsa Gpu agent
const AgentInfo* AddAgentInfo(const hsa_agent_t agent);
static HsaRsrcFactory* instance_;
static mutex_t mutex_;
// Used to maintain a list of Hsa Gpu Agent Info
std::vector<AgentInfo*> gpu_list_;
std::vector<const AgentInfo*> gpu_list_;
// Used to maintain a list of Hsa Cpu Agent Info
std::vector<AgentInfo*> cpu_list_;
std::vector<const AgentInfo*> cpu_list_;
// System agents map
std::map<hsa_agent_handle_t, const AgentInfo*> agent_map_;
// AqlProfile API table
aqlprofile_pfn_t aqlprofile_api_;
// Loader API table
hsa_ven_amd_loader_1_00_pfn_t loader_api_;
};
#endif // TEST_UTIL_HSA_RSRC_FACTORY_H_
+46 -8
Просмотреть файл
@@ -41,6 +41,7 @@ class Xml {
std::vector<level_t*> GetNodes(std::string global_tag) { return map_[global_tag]; }
void Print() const {
std::cout << "XML file '" << file_name_ << "':" << std::endl;
for (auto& elem : map_) {
for (auto node : elem.second) {
if (node->opts.size()) {
@@ -74,13 +75,14 @@ class Xml {
while (1) {
token_t token = (remainder.size()) ? remainder : NextToken();
remainder.clear();
// token_t token1 = token;
// token1.push_back('\0');
// std::cout << "> " << &token1[0] << std::endl;
// End of file
if (token.size() == 0) break;
// token_t token1 = token;
// token1.push_back('\0');
// std::cout << "> " << &token1[0] << std::endl;
switch (state_) {
case BODY_STATE:
if (token[0] == '<') {
@@ -146,6 +148,11 @@ class Xml {
~Xml() {}
// Tell whether the character at the current read position is horizontal
// whitespace (a space or a tab). Line ends are tested separately by LineEndCheck().
//
// @return bool true for ' ' or '\t', false otherwise
bool SpaceCheck() const {
  const char symb = buffer_[index_];
  return (symb == ' ') || (symb == '\t');
}
bool LineEndCheck() {
bool found = false;
if (buffer_[index_] == '\n') {
@@ -162,24 +169,55 @@ class Xml {
token_t NextToken() {
token_t token;
bool in_string = false;
bool special_symb = false;
while (1) {
if (data_size_ == 0) {
data_size_ = read(fd_, buffer_, buf_size_);
if (data_size_ <= 0) break;
}
if (token.empty())
while ((index_ < data_size_) && ((buffer_[index_] == ' ') || LineEndCheck())) {
while ((index_ < data_size_) && (SpaceCheck() || LineEndCheck())) {
++index_;
}
while ((index_ < data_size_) && (buffer_[index_] != ' ') && !LineEndCheck()) {
token.push_back(buffer_[index_++]);
while ((index_ < data_size_) && (in_string || !(SpaceCheck() || LineEndCheck()))) {
const char symb = buffer_[index_];
bool skip_symb = false;
switch (symb) {
case '\\':
if (special_symb) special_symb = false;
else {
special_symb = true;
skip_symb = true;
}
break;
case '"':
if (special_symb) special_symb = false;
else {
in_string = !in_string;
if (!in_string) {
buffer_[index_] = ' ';
--index_;
}
skip_symb = true;
}
break;
}
if (!skip_symb) token.push_back(symb);
++index_;
}
if (index_ == data_size_) {
index_ = 0;
data_size_ = 0;
} else
} else {
if (special_symb || in_string) BadFormat(token);
break;
}
}
return token;