Fixing Multiple Profiler Issues:
1- Fixing Output Buffer Issues
2- Metrics Instances created for all GPUs in Init
3- Multi CPU/GPU/KernArg Pools are initialized for every Agent in Init
4- Lowering overhead in the Packet WriteInterceptor in the ProxyQueue for both Profile Counting & ATT
Change-Id: Ic3d78156af8405bb134d01584574c339237d265f
[ROCm/rocprofiler commit: 00ecca25c7]
This commit is contained in:
@@ -99,10 +99,10 @@ bool metrics::ExtractMetricEvents(
|
||||
const Metric* metric = metrics_dict->Get(metric_names[i]);
|
||||
if (metric == nullptr) {
|
||||
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
|
||||
fatal("input metric'%s' not supported on this hardware: %s ", metric_names[i].c_str(),
|
||||
fatal("input metric'%s' not supported on this hardware: %s ", metric_names[i].c_str(),
|
||||
agentInfo.getName().data());
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// adding result object for derived metric
|
||||
std::lock_guard<std::mutex> lock(extract_metric_events_lock);
|
||||
@@ -185,7 +185,7 @@ bool metrics::ExtractMetricEvents(
|
||||
}
|
||||
|
||||
|
||||
bool metrics::GetCounterData(hsa_ven_amd_aqlprofile_profile_t* profile, hsa_agent_t gpu_agent,
|
||||
bool metrics::GetCounterData(hsa_ven_amd_aqlprofile_profile_t* profile, hsa_agent_t gpu_agent,
|
||||
std::vector<results_t*>& results_list) {
|
||||
uint32_t xcc_count = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle).getXccCount();
|
||||
uint32_t single_xcc_buff_size = profile->output_buffer.size /(sizeof(uint64_t) * xcc_count);
|
||||
|
||||
@@ -27,7 +27,7 @@
|
||||
if ((status) != HSA_STATUS_SUCCESS) { \
|
||||
const char* emsg = 0; \
|
||||
hsa_status_string(status, &emsg); \
|
||||
throw(ROCPROFILER_STATUS_ERROR_HSA_SUPPORT, \
|
||||
throw(ROCPROFILER_STATUS_ERROR_HSA_SUPPORT, \
|
||||
"Error: " << msg << ": " << emsg ? emsg : "<unknown error>"); \
|
||||
} \
|
||||
} while (0)
|
||||
@@ -62,32 +62,27 @@ AgentInfo::AgentInfo(const hsa_agent_t agent, ::CoreApiTable* table) : handle_(a
|
||||
table->hsa_agent_get_info_fn(
|
||||
agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &se_num_);
|
||||
|
||||
if (table->hsa_agent_get_info_fn(
|
||||
agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE,
|
||||
&shader_arrays_per_se_) != HSA_STATUS_SUCCESS ||
|
||||
table->hsa_agent_get_info_fn(
|
||||
agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
|
||||
&waves_per_cu_) != HSA_STATUS_SUCCESS)
|
||||
{
|
||||
rocmtools::fatal("hsa_agent_get_info for gfxip hardware configuration failed");
|
||||
if (table->hsa_agent_get_info_fn(agent,
|
||||
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE,
|
||||
&shader_arrays_per_se_) != HSA_STATUS_SUCCESS ||
|
||||
table->hsa_agent_get_info_fn(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
|
||||
&waves_per_cu_) != HSA_STATUS_SUCCESS) {
|
||||
rocmtools::fatal("hsa_agent_get_info for gfxip hardware configuration failed");
|
||||
}
|
||||
|
||||
compute_units_per_sh_ = cu_num_ / (se_num_ * shader_arrays_per_se_);
|
||||
wave_slots_per_simd_ = waves_per_cu_ / simds_per_cu_;
|
||||
|
||||
if (table->hsa_agent_get_info_fn(
|
||||
agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_DOMAIN,
|
||||
&pci_domain_) != HSA_STATUS_SUCCESS ||
|
||||
table->hsa_agent_get_info_fn(
|
||||
agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID,
|
||||
&pci_location_id_) != HSA_STATUS_SUCCESS)
|
||||
{
|
||||
rocmtools::fatal("hsa_agent_get_info for PCI info failed");
|
||||
if (table->hsa_agent_get_info_fn(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_DOMAIN,
|
||||
&pci_domain_) != HSA_STATUS_SUCCESS ||
|
||||
table->hsa_agent_get_info_fn(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID,
|
||||
&pci_location_id_) != HSA_STATUS_SUCCESS) {
|
||||
rocmtools::fatal("hsa_agent_get_info for PCI info failed");
|
||||
}
|
||||
|
||||
// TODO: (sauverma) use hsa_agent_get_info_fn(HSA_AMD_AGENT_INFO_NUM_XCC)
|
||||
// TODO: (sauverma) use hsa_agent_get_info_fn(HSA_AMD_AGENT_INFO_NUM_XCC)
|
||||
// to get xcc_num once hsa headers are updated from rocr/hsa
|
||||
std::string gpu_name = std::string(name_).substr(0,6);
|
||||
std::string gpu_name = std::string(name_).substr(0, 6);
|
||||
if (gpu_name == "gfx940")
|
||||
xcc_num_ = 6;
|
||||
else
|
||||
@@ -117,6 +112,12 @@ void AgentInfo::setType(hsa_device_type_t type) { type_ = type; }
|
||||
void AgentInfo::setHandle(uint64_t handle) { handle_ = handle; }
|
||||
void AgentInfo::setName(const std::string& name) { strcpy(name_, name.c_str()); }
|
||||
|
||||
void AgentInfo::setNumaNode(uint32_t numa_node) { numa_node_ = numa_node; }
|
||||
uint32_t AgentInfo::getNumaNode() { return numa_node_; }
|
||||
|
||||
void AgentInfo::setNearCpuAgent(hsa_agent_t near_cpu_agent) { near_cpu_agent_ = near_cpu_agent; }
|
||||
hsa_agent_t AgentInfo::getNearCpuAgent() { return near_cpu_agent_; }
|
||||
|
||||
// CounterHardwareInfo Class
|
||||
|
||||
CounterHardwareInfo::CounterHardwareInfo(uint64_t event_id, const char* block_id)
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_api_trace.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <map>
|
||||
@@ -66,6 +67,16 @@ class AgentInfo {
|
||||
void setHandle(uint64_t handle);
|
||||
void setName(const std::string& name);
|
||||
|
||||
void setNumaNode(uint32_t numa_node);
|
||||
uint32_t getNumaNode();
|
||||
|
||||
void setNearCpuAgent(hsa_agent_t near_cpu_agent);
|
||||
hsa_agent_t getNearCpuAgent();
|
||||
|
||||
hsa_amd_memory_pool_t cpu_pool;
|
||||
hsa_amd_memory_pool_t kernarg_pool;
|
||||
hsa_amd_memory_pool_t gpu_pool;
|
||||
|
||||
private:
|
||||
int index_;
|
||||
hsa_device_type_t type_; // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2
|
||||
@@ -87,6 +98,9 @@ class AgentInfo {
|
||||
|
||||
uint32_t pci_domain_;
|
||||
uint32_t pci_location_id_;
|
||||
|
||||
uint32_t numa_node_;
|
||||
hsa_agent_t near_cpu_agent_;
|
||||
};
|
||||
|
||||
// XXX TODO: This should be moved somewhere else so this file can be deleted
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include <mutex>
|
||||
#include <optional>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <string>
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
@@ -40,6 +41,7 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "core/hardware/hsa_info.h"
|
||||
#include "src/core/session/tracer/src/correlation_id.h"
|
||||
#include "src/core/session/tracer/src/exception.h"
|
||||
#include "src/core/session/tracer/src/roctracer.h"
|
||||
@@ -48,6 +50,9 @@
|
||||
#include "src/core/hsa/queues/queue.h"
|
||||
#include "src/api/rocmtool.h"
|
||||
|
||||
#include <experimental/filesystem>
|
||||
namespace fs = std::experimental::filesystem;
|
||||
|
||||
namespace {
|
||||
|
||||
hsa_status_t hsa_executable_iteration_callback(hsa_executable_t executable, hsa_agent_t agent,
|
||||
@@ -761,11 +766,74 @@ hsa_status_t QueueDestroyInterceptor(hsa_queue_t* hsa_queue) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
std::unordered_map<uint32_t, hsa_agent_t> numa_node_to_cpu_agent;
|
||||
std::unordered_map<long long, long long> gpu_numa_nodes_near_cpu;
|
||||
std::vector<hsa_agent_t> gpu_agents;
|
||||
|
||||
void Initialize(HsaApiTable* table) {
|
||||
InitKsymbols();
|
||||
// Save the HSA core api and amd_ext api.
|
||||
long long gpu_numa_nodes_start = 0;
|
||||
|
||||
SetCoreApiTable(*table->core_);
|
||||
SetAmdExtTable(table->amd_ext_);
|
||||
|
||||
// TODO(aelwazir): FIXME, this is a workaround for the issue of allocating buffers on KernArg
|
||||
// Pools that are nearest to the GPU which is not NUMA local to the CPU. This should be remove
|
||||
// once ROCR provides such API.
|
||||
std::string path = "/sys/class/kfd/kfd/topology/nodes";
|
||||
for (const auto& entry : fs::directory_iterator(path)) {
|
||||
long long node_id = std::stoll(entry.path().filename().c_str());
|
||||
std::ifstream gpu_id_file;
|
||||
std::string gpu_path = entry.path().c_str();
|
||||
gpu_path += "/gpu_id";
|
||||
gpu_id_file.open(gpu_path);
|
||||
std::string gpu_id_str;
|
||||
if (gpu_id_file.is_open()) {
|
||||
gpu_id_file >> gpu_id_str;
|
||||
long long gpu_id = std::stoll(gpu_id_str);
|
||||
if (gpu_id > 0) {
|
||||
gpu_numa_nodes_start = (gpu_numa_nodes_start > node_id || gpu_numa_nodes_start == 0)
|
||||
? node_id
|
||||
: gpu_numa_nodes_start;
|
||||
}
|
||||
}
|
||||
gpu_id_file.close();
|
||||
}
|
||||
path = "/sys/class/kfd/kfd/topology/nodes";
|
||||
for (const auto& entry : fs::directory_iterator(path)) {
|
||||
long long node_id = std::stoll(entry.path().filename().c_str());
|
||||
std::string numa_node_path = entry.path().c_str();
|
||||
long long agent_id = std::stoll(entry.path().filename().c_str());
|
||||
if (agent_id >= gpu_numa_nodes_start) {
|
||||
numa_node_path += "/io_links";
|
||||
for (const auto& numa_node_entry : fs::directory_iterator(numa_node_path)) {
|
||||
std::string numa_node_entry_properties_path = numa_node_entry.path().c_str();
|
||||
numa_node_entry_properties_path += "/properties";
|
||||
std::ifstream gpu_properties_file;
|
||||
gpu_properties_file.open(numa_node_entry_properties_path);
|
||||
std::string gpu_properties_file_line;
|
||||
if (gpu_properties_file.is_open()) {
|
||||
while (gpu_properties_file) {
|
||||
std::getline(gpu_properties_file, gpu_properties_file_line);
|
||||
std::string delimiter = " ";
|
||||
std::stringstream ss(gpu_properties_file_line);
|
||||
std::string word;
|
||||
ss >> word;
|
||||
if (word.compare("node_to") == 0) {
|
||||
ss >> word;
|
||||
long long near_cpu_node_id = std::stoll(word);
|
||||
if (near_cpu_node_id < gpu_numa_nodes_start) {
|
||||
gpu_numa_nodes_near_cpu[node_id] = near_cpu_node_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
gpu_properties_file.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Enumerate the agents.
|
||||
if (GetCoreApiTable().hsa_iterate_agents_fn(
|
||||
[](hsa_agent_t agent, void* data) {
|
||||
@@ -777,10 +845,16 @@ void Initialize(HsaApiTable* table) {
|
||||
case HSA_DEVICE_TYPE_CPU:
|
||||
agent_info.setIndex(cpu_agent_count++);
|
||||
cpu_agent = agent;
|
||||
rocmtools::queue::InitializePools(cpu_agent);
|
||||
rocmtools::queue::InitializePools(cpu_agent, &agent_info);
|
||||
uint32_t cpu_numa_node_id;
|
||||
if (GetCoreApiTable().hsa_agent_get_info_fn(
|
||||
agent, HSA_AGENT_INFO_NODE, &cpu_numa_node_id) != HSA_STATUS_SUCCESS)
|
||||
rocmtools::fatal("hsa_agent_get_info(HSA_AGENT_INFO_NODE) failed");
|
||||
agent_info.setNumaNode(cpu_numa_node_id);
|
||||
numa_node_to_cpu_agent[cpu_numa_node_id] = agent;
|
||||
break;
|
||||
case HSA_DEVICE_TYPE_GPU:
|
||||
// XXX FIXME: When multiple ranks are used, each rank's first
|
||||
// TODO(FIXME): When multiple ranks are used, each rank's first
|
||||
// logical device always has GPU ID 0, regardless of which
|
||||
// physical device is selected with CUDA_VISIBLE_DEVICES.
|
||||
// Because of this, when merging traces from multiple ranks,
|
||||
@@ -794,6 +868,15 @@ void Initialize(HsaApiTable* table) {
|
||||
// is currently doing as well as the roctracer compatibility
|
||||
// code earlier in this file.
|
||||
agent_info.setIndex(gpu_agent_count++);
|
||||
uint32_t gpu_cpu_numa_node_id;
|
||||
if (GetCoreApiTable().hsa_agent_get_info_fn(
|
||||
agent, HSA_AGENT_INFO_NODE, &gpu_cpu_numa_node_id) != HSA_STATUS_SUCCESS)
|
||||
rocmtools::fatal("hsa_agent_get_info(HSA_AGENT_INFO_NODE) failed");
|
||||
agent_info.setNumaNode(gpu_cpu_numa_node_id);
|
||||
agent_info.setNearCpuAgent(
|
||||
numa_node_to_cpu_agent[gpu_numa_nodes_near_cpu[gpu_cpu_numa_node_id]]);
|
||||
rocmtools::queue::InitializeGPUPool(agent, &agent_info);
|
||||
gpu_agents.push_back(agent);
|
||||
break;
|
||||
default:
|
||||
agent_info.setIndex(other_agent_count++);
|
||||
@@ -805,6 +888,19 @@ void Initialize(HsaApiTable* table) {
|
||||
nullptr) != HSA_STATUS_SUCCESS)
|
||||
rocmtools::fatal("hsa_iterate_agents failed");
|
||||
|
||||
for (auto& agent : gpu_agents) {
|
||||
GetAgentInfo(agent.handle).cpu_pool =
|
||||
GetAgentInfo(GetAgentInfo(agent.handle).getNearCpuAgent().handle).cpu_pool;
|
||||
GetAgentInfo(agent.handle).kernarg_pool =
|
||||
GetAgentInfo(GetAgentInfo(agent.handle).getNearCpuAgent().handle).kernarg_pool;
|
||||
}
|
||||
|
||||
rocmtools::queue::CheckPacketReqiurements(gpu_agents);
|
||||
|
||||
gpu_agents.clear();
|
||||
numa_node_to_cpu_agent.clear();
|
||||
gpu_numa_nodes_near_cpu.clear();
|
||||
|
||||
SetHSALoaderApi();
|
||||
|
||||
roctracer::hsa_support::Initialize_roctracer(table);
|
||||
|
||||
@@ -40,6 +40,7 @@
|
||||
#include "src/core/hsa/hsa_common.h"
|
||||
|
||||
#include "src/core/counters/metrics/metrics.h"
|
||||
#include "src/core/hardware/hsa_info.h"
|
||||
|
||||
|
||||
#define ASSERTM(exp, msg) assert(((void)msg, exp))
|
||||
@@ -58,11 +59,11 @@ namespace Packet {
|
||||
|
||||
static const size_t MEM_PAGE_BYTES = 0x1000;
|
||||
static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1;
|
||||
hsa_amd_memory_pool_t command_pool;
|
||||
hsa_amd_memory_pool_t output_pool;
|
||||
// hsa_amd_memory_pool_t command_pool;
|
||||
// hsa_amd_memory_pool_t output_pool;
|
||||
|
||||
hsa_amd_memory_pool_t& GetCommandPool() { return command_pool; }
|
||||
hsa_amd_memory_pool_t& GetOutputPool() { return output_pool; }
|
||||
// hsa_amd_memory_pool_t& GetCommandPool() { return command_pool; }
|
||||
// hsa_amd_memory_pool_t& GetOutputPool() { return output_pool; }
|
||||
|
||||
// This function checks to see if the provided
|
||||
// pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true,
|
||||
@@ -111,22 +112,38 @@ hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) {
|
||||
return FindGlobalPool(pool, data, true);
|
||||
}
|
||||
|
||||
void InitializePools(hsa_agent_t cpu_agent) {
|
||||
void InitializePools(hsa_agent_t cpu_agent, Agent::AgentInfo* agent_info) {
|
||||
hsa_status_t status =
|
||||
rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(
|
||||
cpu_agent, FindStandardPool, &command_pool);
|
||||
cpu_agent, FindStandardPool, &(agent_info->cpu_pool));
|
||||
if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Command Buffer Pool is not initialized\n");
|
||||
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(
|
||||
cpu_agent, FindKernArgPool, &output_pool);
|
||||
cpu_agent, FindKernArgPool, &(agent_info->kernarg_pool));
|
||||
if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Output Buffer Pool is not initialized\n");
|
||||
}
|
||||
|
||||
void InitializeGPUPool(hsa_agent_t gpu_agent, Agent::AgentInfo* agent_info) {
|
||||
hsa_status_t status =
|
||||
hsa_amd_agent_iterate_memory_pools(gpu_agent, FindStandardPool, &(agent_info->gpu_pool));
|
||||
CHECK_HSA_STATUS("hsa_amd_agent_iterate_memory_pools(gpu_pool)", status);
|
||||
}
|
||||
|
||||
struct block_des_t {
|
||||
uint32_t id;
|
||||
uint32_t index;
|
||||
};
|
||||
|
||||
std::map<uint32_t, rocmtools::MetricsDict*> metricsDict;
|
||||
|
||||
void CheckPacketReqiurements(std::vector<hsa_agent_t>& gpu_agents) {
|
||||
for (auto& gpu_agent : gpu_agents) {
|
||||
// get the instance of MetricsDict
|
||||
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
|
||||
metricsDict[gpu_agent.handle] = rocmtools::MetricsDict::Create(&agentInfo);
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize the PM4 commands with having the CPU&GPU agents, the counters,
|
||||
// counters count to output three packets which are start, stop and read
|
||||
// packets
|
||||
@@ -136,8 +153,6 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
|
||||
hsa_status_t status = HSA_STATUS_SUCCESS;
|
||||
|
||||
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
|
||||
// get the instance of MetricsDict
|
||||
rocmtools::MetricsDict* metricsDict = rocmtools::MetricsDict::Create(&agentInfo);
|
||||
std::map<std::string, rocmtools::results_t*> results_map;
|
||||
std::vector<rocmtools::event_t> events_list;
|
||||
std::vector<rocmtools::results_t*> results_list;
|
||||
@@ -158,9 +173,9 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
|
||||
counters_count++;
|
||||
}
|
||||
|
||||
rocmtools::metrics::ExtractMetricEvents(counter_names, gpu_agent, metricsDict, results_map,
|
||||
events_list, results_list, event_to_max_block_count,
|
||||
metrics_counters);
|
||||
rocmtools::metrics::ExtractMetricEvents(counter_names, gpu_agent, metricsDict[gpu_agent.handle],
|
||||
results_map, events_list, results_list,
|
||||
event_to_max_block_count, metrics_counters);
|
||||
|
||||
// TODO: validate needs to be called on each events_list[i]
|
||||
// Validating the events array for the specified gpu agent
|
||||
@@ -171,8 +186,6 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
|
||||
throw("Error: Events are not valid for the current gpu agent");
|
||||
}
|
||||
|
||||
// std::cout << "Max Block Counters: " << max_block_counters << std::endl;
|
||||
|
||||
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>*
|
||||
profiles = new std::vector<
|
||||
std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>();
|
||||
@@ -197,27 +210,9 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
|
||||
counter_val_iteration++;
|
||||
block_max_events_count[std::make_pair<uint32_t, uint32_t>(
|
||||
static_cast<uint32_t>(event->block_name), static_cast<uint32_t>(event->block_index))]++;
|
||||
// std::cout << "Block Name: " << event->block_name << " Block Index: " <<
|
||||
// event->block_index
|
||||
// << " Current Count: "
|
||||
// << block_max_events_count[std::make_pair<uint32_t, uint32_t>(
|
||||
// static_cast<uint32_t>(event->block_name),
|
||||
// static_cast<uint32_t>(event->block_index))]
|
||||
// << std::endl;
|
||||
// std::cout << "Counter Taken: " << event->block_index << ", " << event->counter_id << " "
|
||||
// << block_max_events_count[std::make_pair<uint32_t, uint32_t>(
|
||||
// static_cast<uint32_t>(event->block_name),
|
||||
// static_cast<uint32_t>(event->block_index))]
|
||||
// << ":"
|
||||
// << event_to_max_block_count[std::make_pair<uint32_t, uint32_t>(
|
||||
// static_cast<uint32_t>(event->block_name),
|
||||
// static_cast<uint32_t>(event->block_index))]
|
||||
// << std::endl;
|
||||
results_list.erase(result);
|
||||
events_list.erase(event);
|
||||
} else {
|
||||
// std::cout << "Counter Left: " << event->block_index << ", " << event->counter_id
|
||||
// << std::endl;
|
||||
event++;
|
||||
result++;
|
||||
}
|
||||
@@ -234,7 +229,7 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
|
||||
counter_names.end()) {
|
||||
// std::cout << "Counter from Result List: " << result->name << std::endl;
|
||||
counters_taken.insert(result->name);
|
||||
metric = const_cast<rocmtools::Metric*>(metricsDict->Get(result->name));
|
||||
metric = const_cast<rocmtools::Metric*>(metricsDict[gpu_agent.handle]->Get(result->name));
|
||||
if (metric == nullptr) std::cout << result->name << " not found in metricsDict\n";
|
||||
context->metrics_list.push_back(metric);
|
||||
} else {
|
||||
@@ -257,7 +252,6 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
|
||||
}
|
||||
}
|
||||
if (flag) metrics_taken.insert(result.first);
|
||||
// std::cout << "Metric to be checked from map: " << result.first << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -267,8 +261,6 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
|
||||
for (auto metric_counter_name : metrics_counters.at(metric_name)) {
|
||||
if (metrics_counters_taken.find(metric_counter_name) == metrics_counters_taken.end() &&
|
||||
counters_taken.find(metric_counter_name) == counters_taken.end()) {
|
||||
// std::cout << metric_counter_name << " for " << metric_name << " is not found!"
|
||||
// << std::endl;
|
||||
flag = false;
|
||||
continue;
|
||||
}
|
||||
@@ -276,14 +268,15 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
|
||||
if (flag) {
|
||||
// std::cout << "Counter from Result Map: " << metric_name << std::endl;
|
||||
counters_taken.insert(metric_name);
|
||||
rocmtools::Metric* metric = const_cast<rocmtools::Metric*>(metricsDict->Get(metric_name));
|
||||
rocmtools::Metric* metric =
|
||||
const_cast<rocmtools::Metric*>(metricsDict[gpu_agent.handle]->Get(metric_name));
|
||||
if (metric == nullptr) std::cout << metric_name << " not found in metricsDict\n";
|
||||
context->metrics_list.push_back(metric);
|
||||
}
|
||||
}
|
||||
|
||||
context->results_map = results_map;
|
||||
context->metrics_dict = metricsDict;
|
||||
context->metrics_dict = metricsDict[gpu_agent.handle];
|
||||
|
||||
hsa_ven_amd_aqlprofile_parameter_t* params = {};
|
||||
|
||||
@@ -323,7 +316,7 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
|
||||
size_t size = profile->command_buffer.size;
|
||||
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
|
||||
command_pool, size, 0, reinterpret_cast<void**>(&(profile->command_buffer.ptr)));
|
||||
agentInfo.cpu_pool, size, 0, reinterpret_cast<void**>(&(profile->command_buffer.ptr)));
|
||||
|
||||
// Both the CPU and GPU can access the memory
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
@@ -349,7 +342,7 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
|
||||
size = profile->output_buffer.size;
|
||||
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
|
||||
output_pool, size, 0, reinterpret_cast<void**>(&profile->output_buffer.ptr));
|
||||
agentInfo.kernarg_pool, size, 0, reinterpret_cast<void**>(&profile->output_buffer.ptr));
|
||||
|
||||
if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
|
||||
printf("Error: Ran out of GPU memory to allocate Output Buffer\n");
|
||||
@@ -435,20 +428,14 @@ hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets(
|
||||
// Preparing an Getting the size of the command and output buffers
|
||||
status = hsa_ven_amd_aqlprofile_start(profile, NULL);
|
||||
|
||||
// Preparing and Initializing bool of buffers for command and output buffers
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(
|
||||
cpu_agent, FindStandardPool, &command_pool);
|
||||
if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Command Buffer Pool is not initialized\n");
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(
|
||||
cpu_agent, FindKernArgPool, &output_pool);
|
||||
if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Output Buffer Pool is not initialized\n");
|
||||
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
|
||||
|
||||
// Allocating Command Buffer
|
||||
status = HSA_STATUS_ERROR;
|
||||
size_t size = profile->command_buffer.size;
|
||||
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
|
||||
command_pool, size, 0, reinterpret_cast<void**>(&command_buffer));
|
||||
agentInfo.cpu_pool, size, 0, reinterpret_cast<void**>(&command_buffer));
|
||||
// Both the CPU and GPU can access the memory
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
hsa_agent_t ag_list[1] = {gpu_agent};
|
||||
@@ -463,7 +450,7 @@ hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets(
|
||||
size = profile->output_buffer.size;
|
||||
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
|
||||
output_pool, size, 0, reinterpret_cast<void**>(&output_buffer));
|
||||
agentInfo.kernarg_pool, size, 0, reinterpret_cast<void**>(&output_buffer));
|
||||
// Both the CPU and GPU can access the kernel arguments
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
hsa_agent_t ag_list[1] = {gpu_agent};
|
||||
@@ -517,14 +504,14 @@ uint8_t* AllocateLocalMemory(size_t size, hsa_amd_memory_pool_t* gpu_pool) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
hsa_status_t Allocate(hsa_agent_t gpu_agent, hsa_ven_amd_aqlprofile_profile_t* profile,
|
||||
hsa_amd_memory_pool_t* cpu_pool, hsa_amd_memory_pool_t* gpu_pool) {
|
||||
hsa_status_t Allocate(hsa_agent_t gpu_agent, hsa_ven_amd_aqlprofile_profile_t* profile) {
|
||||
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
|
||||
profile->command_buffer.ptr =
|
||||
AllocateSysMemory(gpu_agent, profile->command_buffer.size, cpu_pool);
|
||||
AllocateSysMemory(gpu_agent, profile->command_buffer.size, &agentInfo.cpu_pool);
|
||||
profile->output_buffer.size = g_output_buffer_size;
|
||||
profile->output_buffer.ptr = (g_output_buffer_local)
|
||||
? AllocateLocalMemory(profile->output_buffer.size, gpu_pool)
|
||||
: AllocateSysMemory(gpu_agent, profile->output_buffer.size, cpu_pool);
|
||||
? AllocateLocalMemory(profile->output_buffer.size, &agentInfo.gpu_pool)
|
||||
: AllocateSysMemory(gpu_agent, profile->output_buffer.size, &agentInfo.cpu_pool);
|
||||
return (profile->command_buffer.ptr && profile->output_buffer.ptr) ? HSA_STATUS_SUCCESS
|
||||
: HSA_STATUS_ERROR;
|
||||
}
|
||||
@@ -574,49 +561,27 @@ hsa_ven_amd_aqlprofile_profile_t* GenerateATTPackets(
|
||||
hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
|
||||
std::vector<hsa_ven_amd_aqlprofile_parameter_t>& att_params, packet_t* start_packet,
|
||||
packet_t* stop_packet) {
|
||||
att_memory_pools_t* att_mem_pools = NULL;
|
||||
auto it = GetAttMemPoolsMap()->find(gpu_agent.handle);
|
||||
if (it == GetAttMemPoolsMap()->end()) {
|
||||
att_mem_pools = new att_memory_pools_t;
|
||||
|
||||
// Allocate memory pools for cpu and gpu
|
||||
AllocateMemoryPools(cpu_agent, gpu_agent, &att_mem_pools->cpu_mem_pool,
|
||||
&att_mem_pools->gpu_mem_pool);
|
||||
|
||||
GetAttMemPoolsMap()->emplace(gpu_agent.handle, att_mem_pools);
|
||||
} else
|
||||
att_mem_pools = it->second;
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wconversion-null"
|
||||
// Preparing the profile structure to get the packets
|
||||
hsa_ven_amd_aqlprofile_profile_t* profile =
|
||||
new hsa_ven_amd_aqlprofile_profile_t{gpu_agent,
|
||||
HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE,
|
||||
nullptr,
|
||||
0,
|
||||
&att_params[0],
|
||||
(uint32_t)att_params.size(),
|
||||
NULL,
|
||||
NULL};
|
||||
new hsa_ven_amd_aqlprofile_profile_t{gpu_agent, HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE,
|
||||
nullptr, 0,
|
||||
&att_params[0], (uint32_t)att_params.size(),
|
||||
NULL, NULL};
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
// Check the profile buffer sizes
|
||||
hsa_status_t status = hsa_ven_amd_aqlprofile_start(profile, NULL);
|
||||
if (status != HSA_STATUS_SUCCESS) printf("Error: aqlprofile_start(NULL)");
|
||||
// // Double output buffer size if concurrent
|
||||
// if (is_concurrent) profile.output_buffer.size *= 2;
|
||||
|
||||
// TODO: create a separate class for memory allocations
|
||||
// Maintain pools per device
|
||||
// handle allocation and resource cleanup
|
||||
|
||||
|
||||
// Allocate command and output buffers
|
||||
// command buffer -> from CPU memory pool
|
||||
// output buffer -> from GPU memory pool
|
||||
status =
|
||||
Allocate(gpu_agent, profile, &att_mem_pools->cpu_mem_pool, &att_mem_pools->gpu_mem_pool);
|
||||
status = Allocate(gpu_agent, profile);
|
||||
if (status != HSA_STATUS_SUCCESS) printf("Error: Allocate()");
|
||||
|
||||
// Generate start/stop/read profiling packets
|
||||
|
||||
@@ -37,8 +37,6 @@
|
||||
|
||||
namespace Packet {
|
||||
|
||||
static std::mutex pool_lock;
|
||||
|
||||
typedef hsa_ext_amd_aql_pm4_packet_t packet_t;
|
||||
|
||||
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>*
|
||||
@@ -47,7 +45,8 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
|
||||
uint8_t* AllocateSysMemory(hsa_agent_t gpu_agent, size_t size, hsa_amd_memory_pool_t* cpu_pool);
|
||||
void GetCommandBufferMap(std::map<size_t, uint8_t*>);
|
||||
void GetOutputBufferMap(std::map<size_t, uint8_t*>);
|
||||
void InitializePools(hsa_agent_t cpu_agent);
|
||||
void InitializePools(hsa_agent_t cpu_agent, Agent::AgentInfo* agent_info);
|
||||
void InitializeGPUPool(hsa_agent_t gpu_agent, Agent::AgentInfo* agent_info);
|
||||
hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets(
|
||||
hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, hsa_ven_amd_aqlprofile_event_t* events,
|
||||
uint32_t event_count, packet_t* start_packet, packet_t* stop_packet, packet_t* read_packet);
|
||||
@@ -63,9 +62,10 @@ hsa_ven_amd_aqlprofile_profile_t* GenerateATTPackets(
|
||||
|
||||
uint8_t* AllocateSysMemory(hsa_agent_t gpu_agent, size_t size, hsa_amd_memory_pool_t* cpu_pool);
|
||||
|
||||
void get_command_buffer_map(std::map<size_t, uint8_t*> );
|
||||
void get_outbuffer_map(std::map<size_t, uint8_t*> );
|
||||
void get_command_buffer_map(std::map<size_t, uint8_t*>);
|
||||
void get_outbuffer_map(std::map<size_t, uint8_t*>);
|
||||
void initialize_pools(hsa_agent_t cpu_agent);
|
||||
void CheckPacketReqiurements(std::vector<hsa_agent_t>& gpu_agents);
|
||||
|
||||
typedef struct {
|
||||
hsa_amd_memory_pool_t cpu_mem_pool;
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include <utility>
|
||||
#include <algorithm>
|
||||
|
||||
#include "rocprofiler.h"
|
||||
#include "src/api/rocmtool.h"
|
||||
#include "src/core/hsa/packets/packets_generator.h"
|
||||
#include "src/core/hsa/hsa_support.h"
|
||||
@@ -284,7 +285,7 @@ hsa_status_t attTraceDataCallback(hsa_ven_amd_aqlprofile_info_type_t info_type,
|
||||
att_trace_callback_data_t* passed_data = reinterpret_cast<att_trace_callback_data_t*>(data);
|
||||
passed_data->push_back(*info_data);
|
||||
// TODO: clear output buffers after copying
|
||||
// either copy here or in AddattRecord
|
||||
// either copy here or in ::AddAttRecord
|
||||
|
||||
return status;
|
||||
}
|
||||
@@ -314,12 +315,10 @@ void AddRecordCounters(rocprofiler_record_profiler_t* record, const pending_sign
|
||||
|
||||
void AddAttRecord(rocprofiler_record_att_tracer_t* record, hsa_agent_t gpu_agent,
|
||||
att_pending_signal_t& pending) {
|
||||
Agent::AgentInfo agent_info = hsa_support::GetAgentInfo(gpu_agent.handle);
|
||||
att_trace_callback_data_t data;
|
||||
hsa_ven_amd_aqlprofile_iterate_data(pending.profile, attTraceDataCallback, &data);
|
||||
|
||||
// Get CPU and GPU memory pools
|
||||
Packet::att_memory_pools_t* att_mem_pools = Packet::GetAttMemPools(gpu_agent);
|
||||
|
||||
// Allocate memory for shader_engine_data
|
||||
record->shader_engine_data = static_cast<rocprofiler_record_se_att_data_t*>(
|
||||
calloc(data.size(), sizeof(rocprofiler_record_se_att_data_t)));
|
||||
@@ -336,7 +335,7 @@ void AddAttRecord(rocprofiler_record_att_tracer_t* record, hsa_agent_t gpu_agent
|
||||
void* buffer = NULL;
|
||||
if (data_size != 0) {
|
||||
// Allocate buffer on CPU to copy out trace data
|
||||
buffer = Packet::AllocateSysMemory(gpu_agent, data_size, &att_mem_pools->cpu_mem_pool);
|
||||
buffer = Packet::AllocateSysMemory(gpu_agent, data_size, &agent_info.cpu_pool);
|
||||
if (buffer == NULL) fatal("Trace data buffer allocation failed");
|
||||
|
||||
auto status =
|
||||
@@ -353,83 +352,6 @@ void AddAttRecord(rocprofiler_record_att_tracer_t* record, hsa_agent_t gpu_agent
|
||||
record->shader_engine_data_count = data.size();
|
||||
}
|
||||
|
||||
// static const size_t MEM_PAGE_BYTES = 0x1000;
|
||||
// static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1;
|
||||
// static std::mutex begin_signal_lock;
|
||||
|
||||
// bool BeginSignalHandler(hsa_signal_value_t signal_value, void* data) {
|
||||
// std::lock_guard<std::mutex> lock(begin_signal_lock);
|
||||
// auto profiling_context =
|
||||
// static_cast<std::pair<rocmtools::profiling_context_t*,
|
||||
// hsa_ven_amd_aqlprofile_profile_t*>*>(
|
||||
// data);
|
||||
// if (!profiling_context->first->begin_completed.load(std::memory_order_relaxed)) {
|
||||
// std::cout << "BeginSignalHandler is called" << std::endl;
|
||||
// hsa_status_t status = HSA_STATUS_ERROR;
|
||||
// size_t size = profiling_context->second->command_buffer.size;
|
||||
// size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
|
||||
// status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
|
||||
// Packet::GetCommandPool(), size, 0,
|
||||
// reinterpret_cast<void**>(&(profiling_context->second->command_buffer.ptr)));
|
||||
|
||||
// // Both the CPU and GPU can access the memory
|
||||
// if (status == HSA_STATUS_SUCCESS) {
|
||||
// hsa_agent_t ag_list[1] = {profiling_context->first->gpu_agent};
|
||||
// status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
|
||||
// 1, ag_list, NULL, profiling_context->second->command_buffer.ptr);
|
||||
|
||||
// if (status != HSA_STATUS_SUCCESS) {
|
||||
// printf("Error: Can't allow access for both agents to Command Buffer\n");
|
||||
// }
|
||||
// } else if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
|
||||
// printf("Error: Ran out of GPU memory to allocate Command Buffer\n");
|
||||
// } else {
|
||||
// const char* hsa_err_str = NULL;
|
||||
// if (hsa_status_string(status, &hsa_err_str) != HSA_STATUS_SUCCESS) hsa_err_str = "Unknown";
|
||||
// printf("Error: Allocating command Buffer (Size=%lu) (%s)\n", size, hsa_err_str);
|
||||
// }
|
||||
|
||||
// status = HSA_STATUS_ERROR;
|
||||
// size = profiling_context->second->output_buffer.size;
|
||||
// size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
|
||||
// status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
|
||||
// Packet::GetOutputPool(), size, 0,
|
||||
// reinterpret_cast<void**>(&profiling_context->second->output_buffer.ptr));
|
||||
|
||||
// if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
|
||||
// printf("Error: Ran out of GPU memory to allocate Output Buffer\n");
|
||||
// }
|
||||
|
||||
// if (status == HSA_STATUS_SUCCESS) {
|
||||
// hsa_agent_t ag_list[1] = {profiling_context->first->gpu_agent};
|
||||
// status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
|
||||
// 1, ag_list, NULL, profiling_context->second->output_buffer.ptr);
|
||||
|
||||
// if (status == HSA_STATUS_SUCCESS) {
|
||||
// memset(profiling_context->second->output_buffer.ptr, 0x0,
|
||||
// profiling_context->second->output_buffer.size);
|
||||
|
||||
// // Creating the start/stop/read packets
|
||||
// status = hsa_ven_amd_aqlprofile_start(profiling_context->second,
|
||||
// profiling_context->first->start_packet);
|
||||
// status = hsa_ven_amd_aqlprofile_stop(profiling_context->second,
|
||||
// profiling_context->first->stop_packet);
|
||||
// status = hsa_ven_amd_aqlprofile_read(profiling_context->second,
|
||||
// profiling_context->first->read_packet);
|
||||
// } else {
|
||||
// printf("Error: Can't allow access for both agents to output Buffer\n");
|
||||
// }
|
||||
// } else {
|
||||
// const char* hsa_err_str = NULL;
|
||||
// if (hsa_status_string(status, &hsa_err_str) != HSA_STATUS_SUCCESS) hsa_err_str = "Unknown";
|
||||
// printf("Error: Allocating output Buffer (%s)\n", hsa_err_str);
|
||||
// }
|
||||
|
||||
// profiling_context->first->begin_completed.exchange(true, std::memory_order_relaxed);
|
||||
// }
|
||||
// return true;
|
||||
// }
|
||||
|
||||
bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) {
|
||||
auto queue_info_session = static_cast<queue_info_session_t*>(data);
|
||||
if (!queue_info_session || !GetROCMToolObj() ||
|
||||
@@ -647,6 +569,75 @@ template <typename Integral> constexpr Integral bit_extract(Integral x, int firs
|
||||
return (x >> first) & bit_mask<Integral>(0, last - first);
|
||||
}
|
||||
|
||||
rocprofiler_session_id_t session_id = rocprofiler_session_id_t{0};
|
||||
// Counter Names declaration
|
||||
std::vector<std::string> session_data;
|
||||
|
||||
rocprofiler_buffer_id_t buffer_id;
|
||||
|
||||
uint64_t session_data_count = 0;
|
||||
|
||||
bool is_counter_collection_mode = false;
|
||||
bool is_timestamp_collection_mode = false;
|
||||
bool is_att_collection_mode = false;
|
||||
bool is_pc_sampling_collection_mode = false;
|
||||
std::vector<rocprofiler_att_parameter_t> att_parameters_data;
|
||||
uint32_t replay_mode_count = 0;
|
||||
std::vector<std::string> kernel_profile_names;
|
||||
std::vector<std::string> att_counters_names;
|
||||
|
||||
rocmtools::Session* session = nullptr;
|
||||
|
||||
// Sets the cached session id back to handle 0. CheckNeededProfileConfigs()
// treats handle 0 as "nothing cached" and re-reads the session on its next call.
void ResetSessionID() {
  session_id = rocprofiler_session_id_t{0};
}
|
||||
|
||||
void CheckNeededProfileConfigs() {
|
||||
rocprofiler_session_id_t internal_session_id;
|
||||
if (GetROCMToolObj())
|
||||
// Getting Session ID
|
||||
internal_session_id = GetROCMToolObj()->GetCurrentSessionId();
|
||||
else
|
||||
internal_session_id = {0};
|
||||
|
||||
if (session_id.handle == 0 || internal_session_id.handle != session_id.handle) {
|
||||
session_id = internal_session_id;
|
||||
// Getting Counters count from the Session
|
||||
if (session_id.handle > 0 && GetROCMToolObj()) {
|
||||
session = GetROCMToolObj()->GetSession(session_id);
|
||||
if (session && session->FindFilterWithKind(ROCPROFILER_COUNTERS_COLLECTION)) {
|
||||
rocprofiler_filter_id_t filter_id =
|
||||
session->GetFilterIdWithKind(ROCPROFILER_COUNTERS_COLLECTION);
|
||||
rocmtools::Filter* filter = session->GetFilter(filter_id);
|
||||
session_data = filter->GetCounterData();
|
||||
is_counter_collection_mode = true;
|
||||
session_data_count = session_data.size();
|
||||
buffer_id = filter->GetBufferId();
|
||||
} else if (session &&
|
||||
session->FindFilterWithKind(ROCPROFILER_DISPATCH_TIMESTAMPS_COLLECTION)) {
|
||||
is_timestamp_collection_mode = true;
|
||||
rocprofiler_filter_id_t filter_id =
|
||||
session->GetFilterIdWithKind(ROCPROFILER_DISPATCH_TIMESTAMPS_COLLECTION);
|
||||
rocmtools::Filter* filter = session->GetFilter(filter_id);
|
||||
buffer_id = filter->GetBufferId();
|
||||
} else if (session && session->FindFilterWithKind(ROCPROFILER_ATT_TRACE_COLLECTION)) {
|
||||
rocprofiler_filter_id_t filter_id =
|
||||
session->GetFilterIdWithKind(ROCPROFILER_ATT_TRACE_COLLECTION);
|
||||
rocmtools::Filter* filter = session->GetFilter(filter_id);
|
||||
att_parameters_data = filter->GetAttParametersData();
|
||||
is_att_collection_mode = true;
|
||||
buffer_id =
|
||||
session->GetFilter(session->GetFilterIdWithKind(ROCPROFILER_ATT_TRACE_COLLECTION))
|
||||
->GetBufferId();
|
||||
|
||||
att_counters_names = filter->GetCounterData();
|
||||
kernel_profile_names = std::get<std::vector<std::string>>(
|
||||
filter->GetProperty(ROCPROFILER_FILTER_KERNEL_NAMES));
|
||||
} else if (session && session->FindFilterWithKind(ROCPROFILER_PC_SAMPLING_COLLECTION)) {
|
||||
is_pc_sampling_collection_mode = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE(review): presumably a running count of intercepted kernel dispatches;
// its use is outside the visible code -- confirm.
static int KernelInterceptCount{0};

// NOTE(review): atomic id counter starting at 0; presumably hands out writer
// ids -- confirm against its users.
std::atomic<uint32_t> WRITER_ID(0);
|
||||
/**
|
||||
@@ -660,64 +651,8 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt
|
||||
hsa_amd_queue_intercept_packet_writer writer) {
|
||||
const Packet::packet_t* packets_arr = reinterpret_cast<const Packet::packet_t*>(packets);
|
||||
std::vector<Packet::packet_t> transformed_packets;
|
||||
rocprofiler_session_id_t session_id;
|
||||
if (GetROCMToolObj())
|
||||
// Getting Session ID
|
||||
session_id = GetROCMToolObj()->GetCurrentSessionId();
|
||||
else
|
||||
session_id = {0};
|
||||
|
||||
// Counter Names declaration
|
||||
std::vector<std::string> session_data;
|
||||
|
||||
rocprofiler_buffer_id_t buffer_id;
|
||||
|
||||
uint64_t session_data_count = 0;
|
||||
|
||||
bool is_counter_collection_mode = false;
|
||||
bool is_timestamp_collection_mode = false;
|
||||
bool is_att_collection_mode = false;
|
||||
bool is_pc_sampling_collection_mode = false;
|
||||
std::vector<rocprofiler_att_parameter_t> att_parameters_data;
|
||||
uint32_t replay_mode_count = 0;
|
||||
std::vector<std::string> kernel_profile_names;
|
||||
std::vector<std::string> att_counters_names;
|
||||
|
||||
rocmtools::Session* session = nullptr;
|
||||
|
||||
// Getting Counters count from the Session
|
||||
if (session_id.handle > 0 && GetROCMToolObj()) {
|
||||
session = GetROCMToolObj()->GetSession(session_id);
|
||||
if (session && session->FindFilterWithKind(ROCPROFILER_COUNTERS_COLLECTION)) {
|
||||
rocprofiler_filter_id_t filter_id =
|
||||
session->GetFilterIdWithKind(ROCPROFILER_COUNTERS_COLLECTION);
|
||||
rocmtools::Filter* filter = session->GetFilter(filter_id);
|
||||
session_data = filter->GetCounterData();
|
||||
is_counter_collection_mode = true;
|
||||
session_data_count = session_data.size();
|
||||
buffer_id = filter->GetBufferId();
|
||||
} else if (session && session->FindFilterWithKind(ROCPROFILER_DISPATCH_TIMESTAMPS_COLLECTION)) {
|
||||
is_timestamp_collection_mode = true;
|
||||
rocprofiler_filter_id_t filter_id =
|
||||
session->GetFilterIdWithKind(ROCPROFILER_DISPATCH_TIMESTAMPS_COLLECTION);
|
||||
rocmtools::Filter* filter = session->GetFilter(filter_id);
|
||||
buffer_id = filter->GetBufferId();
|
||||
} else if (session && session->FindFilterWithKind(ROCPROFILER_ATT_TRACE_COLLECTION)) {
|
||||
rocprofiler_filter_id_t filter_id =
|
||||
session->GetFilterIdWithKind(ROCPROFILER_ATT_TRACE_COLLECTION);
|
||||
rocmtools::Filter* filter = session->GetFilter(filter_id);
|
||||
att_parameters_data = filter->GetAttParametersData();
|
||||
is_att_collection_mode = true;
|
||||
buffer_id = session->GetFilter(session->GetFilterIdWithKind(ROCPROFILER_ATT_TRACE_COLLECTION))
|
||||
->GetBufferId();
|
||||
|
||||
att_counters_names = filter->GetCounterData();
|
||||
kernel_profile_names =
|
||||
std::get<std::vector<std::string>>(filter->GetProperty(ROCPROFILER_FILTER_KERNEL_NAMES));
|
||||
} else if (session && session->FindFilterWithKind(ROCPROFILER_PC_SAMPLING_COLLECTION)) {
|
||||
is_pc_sampling_collection_mode = true;
|
||||
}
|
||||
}
|
||||
CheckNeededProfileConfigs();
|
||||
|
||||
if (session_id.handle > 0 && pkt_count > 0 &&
|
||||
(is_counter_collection_mode || is_timestamp_collection_mode ||
|
||||
@@ -764,21 +699,6 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt
|
||||
|
||||
if (session_data_count > 0 && is_counter_collection_mode && profiles &&
|
||||
replay_mode_count > 0) {
|
||||
// hsa_signal_t begin_signal{};
|
||||
// CreateSignal(0, &begin_signal);
|
||||
// hsa_barrier_and_packet_t barrier{0};
|
||||
// barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
|
||||
// CreateSignal(0, &barrier.completion_signal);
|
||||
// barrier.dep_signal[0] = hsa_signal_t{};
|
||||
// Packet::packet_t* __attribute__((__may_alias__)) pkt =
|
||||
// (reinterpret_cast<Packet::packet_t*>(&barrier));
|
||||
// transformed_packets.emplace_back(*pkt);
|
||||
// hsa_status_t status = hsa_support::GetAmdExtTable().hsa_amd_signal_async_handler_fn(
|
||||
// barrier.completion_signal, HSA_SIGNAL_CONDITION_GTE, 1, BeginSignalHandler,
|
||||
// &profiles->at(profile_id));
|
||||
// if (status != HSA_STATUS_SUCCESS)
|
||||
// fatal("hsa_amd_signal_async_handler failed for begin signal");
|
||||
|
||||
// Adding start packet and its barrier with a dummy signal
|
||||
hsa_signal_t dummy_signal{};
|
||||
dummy_signal.handle = 0;
|
||||
@@ -1087,8 +1007,6 @@ Queue::Queue(const hsa_agent_t& cpu_agent, const hsa_agent_t& gpu_agent, uint32_
|
||||
*queue = intercept_queue_;
|
||||
}
|
||||
|
||||
// Queue::~Queue() { std::lock_guard<std::mutex> lk(mutex_); }
|
||||
|
||||
// Returns the intercept queue pointer stored in this Queue.
hsa_queue_t* Queue::GetCurrentInterceptQueue() {
  return intercept_queue_;
}
|
||||
|
||||
// Returns the GPU agent handle stored in this Queue.
hsa_agent_t Queue::GetGPUAgent() {
  return gpu_agent_;
}
|
||||
@@ -1097,7 +1015,15 @@ hsa_agent_t Queue::GetCPUAgent() { return cpu_agent_; }
|
||||
|
||||
// Returns the `id` field of the stored intercept queue.
// intercept_queue_ is dereferenced without a null check.
uint64_t Queue::GetQueueID() {
  return intercept_queue_->id;
}
|
||||
|
||||
// Queue-namespace entry point that forwards to Packet::InitializePools for
// the given CPU agent.
void InitializePools(hsa_agent_t cpu_agent) {
  Packet::InitializePools(cpu_agent);
}
|
||||
// Queue-namespace entry point that forwards the CPU agent and the per-agent
// info object to Packet::InitializePools.
void InitializePools(hsa_agent_t cpu_agent, Agent::AgentInfo* agent_info) {
  Packet::InitializePools(cpu_agent, agent_info);
}
|
||||
// Queue-namespace entry point that forwards the GPU agent and the per-agent
// info object to Packet::InitializeGPUPool.
void InitializeGPUPool(hsa_agent_t gpu_agent, Agent::AgentInfo* agent_info) {
  Packet::InitializeGPUPool(gpu_agent, agent_info);
}
|
||||
// Queue-namespace entry point that forwards the list of GPU agents to
// Packet::CheckPacketReqiurements. The spelling of the name is kept as-is
// because callers and the Packet function use it.
void CheckPacketReqiurements(std::vector<hsa_agent_t>& gpu_agents) {
  Packet::CheckPacketReqiurements(gpu_agents);
}
|
||||
|
||||
} // namespace queue
|
||||
} // namespace rocmtools
|
||||
|
||||
@@ -86,7 +86,11 @@ struct queue_info_session_t {
|
||||
|
||||
void AddRecordCounters(rocprofiler_record_profiler_t* record, const pending_signal_t& pending);
|
||||
|
||||
void InitializePools(hsa_agent_t cpu_agent);
|
||||
void InitializePools(hsa_agent_t cpu_agent, Agent::AgentInfo* agent_info);
|
||||
void InitializeGPUPool(hsa_agent_t gpu_agent, Agent::AgentInfo* agent_info);
|
||||
void CheckPacketReqiurements(std::vector<hsa_agent_t>& gpu_agents);
|
||||
|
||||
void ResetSessionID();
|
||||
|
||||
} // namespace queue
|
||||
} // namespace rocmtools
|
||||
|
||||
@@ -47,6 +47,7 @@ Session::~Session() {
|
||||
while (GetCurrentActiveInterruptSignalsCount() > 0) {
|
||||
}
|
||||
if (profiler_started_.load(std::memory_order_release)) {
|
||||
rocmtools::queue::ResetSessionID();
|
||||
delete profiler_;
|
||||
profiler_started_.exchange(false, std::memory_order_release);
|
||||
}
|
||||
@@ -77,7 +78,8 @@ void Session::DisableTools(rocprofiler_buffer_id_t buffer_id) {
|
||||
}
|
||||
}
|
||||
if (FindFilterWithKind(ROCPROFILER_API_TRACE) &&
|
||||
GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetBufferId().value == buffer_id.value) {
|
||||
GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetBufferId().value ==
|
||||
buffer_id.value) {
|
||||
if (tracer_started_.load(std::memory_order_release)) {
|
||||
tracer_->DisableRoctracer();
|
||||
}
|
||||
@@ -131,9 +133,10 @@ void Session::Start() {
|
||||
GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetTraceData();
|
||||
if (!tracer_started_.load(std::memory_order_release)) {
|
||||
tracer_ = new tracer::Tracer(
|
||||
session_id_, (GetFilter(
|
||||
GetFilterIdWithKind(ROCPROFILER_API_TRACE))->HasCallback() ? GetFilter(
|
||||
GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetCallback() : nullptr),
|
||||
session_id_,
|
||||
(GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->HasCallback()
|
||||
? GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetCallback()
|
||||
: nullptr),
|
||||
GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetBufferId(), domains);
|
||||
tracer_started_.exchange(true, std::memory_order_release);
|
||||
}
|
||||
@@ -144,7 +147,8 @@ void Session::Start() {
|
||||
if (!pc_sampler_started_.load(std::memory_order_release)) {
|
||||
pc_sampler_ = new pc_sampler::PCSampler(
|
||||
GetFilter(GetFilterIdWithKind(ROCPROFILER_PC_SAMPLING_COLLECTION))->GetBufferId(),
|
||||
GetFilter(GetFilterIdWithKind(ROCPROFILER_PC_SAMPLING_COLLECTION))->GetId(), session_id_);
|
||||
GetFilter(GetFilterIdWithKind(ROCPROFILER_PC_SAMPLING_COLLECTION))->GetId(),
|
||||
session_id_);
|
||||
pc_sampler_started_.exchange(true, std::memory_order_release);
|
||||
}
|
||||
pc_sampler_->Start();
|
||||
@@ -167,6 +171,7 @@ void Session::Start() {
|
||||
|
||||
void Session::Terminate() {
|
||||
if (is_active_) {
|
||||
rocmtools::queue::ResetSessionID();
|
||||
std::lock_guard<std::mutex> lock(session_lock_);
|
||||
if (FindFilterWithKind(ROCPROFILER_SPM_COLLECTION)) {
|
||||
{
|
||||
@@ -214,9 +219,9 @@ pc_sampler::PCSampler* Session::GetPCSampler() { return pc_sampler_; }
|
||||
CountersSampler* Session::GetCountersSampler() { return counters_sampler_; }
|
||||
|
||||
rocprofiler_filter_id_t Session::CreateFilter(rocprofiler_filter_kind_t filter_kind,
|
||||
rocprofiler_filter_data_t filter_data,
|
||||
uint64_t data_count,
|
||||
rocprofiler_filter_property_t property) {
|
||||
rocprofiler_filter_data_t filter_data,
|
||||
uint64_t data_count,
|
||||
rocprofiler_filter_property_t property) {
|
||||
rocprofiler_filter_id_t id =
|
||||
rocprofiler_filter_id_t{filters_counter_.fetch_add(1, std::memory_order_release)};
|
||||
{
|
||||
@@ -287,7 +292,7 @@ rocprofiler_filter_id_t Session::GetFilterIdWithKind(rocprofiler_filter_kind_t k
|
||||
bool Session::HasBuffer() { return buffers_.size() > 0; }
|
||||
|
||||
rocprofiler_buffer_id_t Session::CreateBuffer(rocprofiler_buffer_callback_t buffer_callback,
|
||||
size_t buffer_size) {
|
||||
size_t buffer_size) {
|
||||
rocprofiler_buffer_id_t id =
|
||||
rocprofiler_buffer_id_t{buffers_counter_.fetch_add(1, std::memory_order_release)};
|
||||
{
|
||||
|
||||
@@ -489,7 +489,7 @@ uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t
|
||||
}
|
||||
|
||||
// Allocate memory to pass kernel parameters.
|
||||
// Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter.
|
||||
// Memory is allocated accessible for all CPU agents and for GPU given by AgentInfo parameter.
|
||||
// @param agent_info Agent from whose memory region to allocate
|
||||
// @param size Size of memory in terms of bytes
|
||||
// @return uint8_t* Pointer to buffer, null if allocation fails.
|
||||
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele