Fixing Multiple Profiler Issues:

1- Fixing Output Buffer Issues
2- Metrics Instances created for all GPUs in Init
3- Multi CPU/GPU/KernArg Pools are initialized for every Agent in Init
4- Lowering Overhead in the Packet WriteInterceptor in the ProxyQueue for both Profile Counting & ATT

Change-Id: Ic3d78156af8405bb134d01584574c339237d265f


[ROCm/rocprofiler commit: 00ecca25c7]
Tento commit je obsažen v:
Ammar ELWazir
2023-05-26 05:38:49 +00:00
odevzdal Ammar ELWazir
rodič c7e51f8abc
revize 935b40b837
10 změnil soubory, kde provedl 289 přidání a 278 odebrání
+3 -3
Zobrazit soubor
@@ -99,10 +99,10 @@ bool metrics::ExtractMetricEvents(
const Metric* metric = metrics_dict->Get(metric_names[i]);
if (metric == nullptr) {
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
fatal("input metric'%s' not supported on this hardware: %s ", metric_names[i].c_str(),
fatal("input metric'%s' not supported on this hardware: %s ", metric_names[i].c_str(),
agentInfo.getName().data());
}
}
// adding result object for derived metric
std::lock_guard<std::mutex> lock(extract_metric_events_lock);
@@ -185,7 +185,7 @@ bool metrics::ExtractMetricEvents(
}
bool metrics::GetCounterData(hsa_ven_amd_aqlprofile_profile_t* profile, hsa_agent_t gpu_agent,
bool metrics::GetCounterData(hsa_ven_amd_aqlprofile_profile_t* profile, hsa_agent_t gpu_agent,
std::vector<results_t*>& results_list) {
uint32_t xcc_count = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle).getXccCount();
uint32_t single_xcc_buff_size = profile->output_buffer.size /(sizeof(uint64_t) * xcc_count);
+20 -19
Zobrazit soubor
@@ -27,7 +27,7 @@
if ((status) != HSA_STATUS_SUCCESS) { \
const char* emsg = 0; \
hsa_status_string(status, &emsg); \
throw(ROCPROFILER_STATUS_ERROR_HSA_SUPPORT, \
throw(ROCPROFILER_STATUS_ERROR_HSA_SUPPORT, \
"Error: " << msg << ": " << emsg ? emsg : "<unknown error>"); \
} \
} while (0)
@@ -62,32 +62,27 @@ AgentInfo::AgentInfo(const hsa_agent_t agent, ::CoreApiTable* table) : handle_(a
table->hsa_agent_get_info_fn(
agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &se_num_);
if (table->hsa_agent_get_info_fn(
agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE,
&shader_arrays_per_se_) != HSA_STATUS_SUCCESS ||
table->hsa_agent_get_info_fn(
agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
&waves_per_cu_) != HSA_STATUS_SUCCESS)
{
rocmtools::fatal("hsa_agent_get_info for gfxip hardware configuration failed");
if (table->hsa_agent_get_info_fn(agent,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE,
&shader_arrays_per_se_) != HSA_STATUS_SUCCESS ||
table->hsa_agent_get_info_fn(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
&waves_per_cu_) != HSA_STATUS_SUCCESS) {
rocmtools::fatal("hsa_agent_get_info for gfxip hardware configuration failed");
}
compute_units_per_sh_ = cu_num_ / (se_num_ * shader_arrays_per_se_);
wave_slots_per_simd_ = waves_per_cu_ / simds_per_cu_;
if (table->hsa_agent_get_info_fn(
agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_DOMAIN,
&pci_domain_) != HSA_STATUS_SUCCESS ||
table->hsa_agent_get_info_fn(
agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID,
&pci_location_id_) != HSA_STATUS_SUCCESS)
{
rocmtools::fatal("hsa_agent_get_info for PCI info failed");
if (table->hsa_agent_get_info_fn(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_DOMAIN,
&pci_domain_) != HSA_STATUS_SUCCESS ||
table->hsa_agent_get_info_fn(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID,
&pci_location_id_) != HSA_STATUS_SUCCESS) {
rocmtools::fatal("hsa_agent_get_info for PCI info failed");
}
// TODO: (sauverma) use hsa_agent_get_info_fn(HSA_AMD_AGENT_INFO_NUM_XCC)
// TODO: (sauverma) use hsa_agent_get_info_fn(HSA_AMD_AGENT_INFO_NUM_XCC)
// to get xcc_num once hsa headers are updated from rocr/hsa
std::string gpu_name = std::string(name_).substr(0,6);
std::string gpu_name = std::string(name_).substr(0, 6);
if (gpu_name == "gfx940")
xcc_num_ = 6;
else
@@ -117,6 +112,12 @@ void AgentInfo::setType(hsa_device_type_t type) { type_ = type; }
// Overwrites the stored agent handle.
void AgentInfo::setHandle(uint64_t handle) { handle_ = handle; }
// Copies `name` (including its terminating NUL) into the name_ buffer.
// NOTE(review): strcpy performs no bounds check and the capacity of name_ is
// not visible here — confirm callers never pass a name longer than the buffer.
void AgentInfo::setName(const std::string& name) { strcpy(name_, name.c_str()); }
// Stores the NUMA node id associated with this agent.
void AgentInfo::setNumaNode(uint32_t numa_node) { numa_node_ = numa_node; }
// Returns the NUMA node id last stored by setNumaNode().
uint32_t AgentInfo::getNumaNode() { return numa_node_; }
// Stores the CPU agent that was selected as the near CPU agent for this agent.
void AgentInfo::setNearCpuAgent(hsa_agent_t near_cpu_agent) { near_cpu_agent_ = near_cpu_agent; }
// Returns the CPU agent last stored by setNearCpuAgent().
hsa_agent_t AgentInfo::getNearCpuAgent() { return near_cpu_agent_; }
// CounterHardwareInfo Class
CounterHardwareInfo::CounterHardwareInfo(uint64_t event_id, const char* block_id)
+14
Zobrazit soubor
@@ -23,6 +23,7 @@
#include <hsa/hsa.h>
#include <hsa/hsa_api_trace.h>
#include <hsa/hsa_ext_amd.h>
#include <atomic>
#include <map>
@@ -66,6 +67,16 @@ class AgentInfo {
void setHandle(uint64_t handle);
void setName(const std::string& name);
void setNumaNode(uint32_t numa_node);
uint32_t getNumaNode();
void setNearCpuAgent(hsa_agent_t near_cpu_agent);
hsa_agent_t getNearCpuAgent();
hsa_amd_memory_pool_t cpu_pool;
hsa_amd_memory_pool_t kernarg_pool;
hsa_amd_memory_pool_t gpu_pool;
private:
int index_;
hsa_device_type_t type_; // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2
@@ -87,6 +98,9 @@ class AgentInfo {
uint32_t pci_domain_;
uint32_t pci_location_id_;
uint32_t numa_node_;
hsa_agent_t near_cpu_agent_;
};
// XXX TODO: This should be moved somewhere else so this file can be deleted
+98 -2
Zobrazit soubor
@@ -27,6 +27,7 @@
#include <mutex>
#include <optional>
#include <unordered_map>
#include <unordered_set>
#include <string>
#include <atomic>
#include <cstdint>
@@ -40,6 +41,7 @@
#include <utility>
#include <vector>
#include "core/hardware/hsa_info.h"
#include "src/core/session/tracer/src/correlation_id.h"
#include "src/core/session/tracer/src/exception.h"
#include "src/core/session/tracer/src/roctracer.h"
@@ -48,6 +50,9 @@
#include "src/core/hsa/queues/queue.h"
#include "src/api/rocmtool.h"
#include <experimental/filesystem>
namespace fs = std::experimental::filesystem;
namespace {
hsa_status_t hsa_executable_iteration_callback(hsa_executable_t executable, hsa_agent_t agent,
@@ -761,11 +766,74 @@ hsa_status_t QueueDestroyInterceptor(hsa_queue_t* hsa_queue) {
return HSA_STATUS_SUCCESS;
}
std::unordered_map<uint32_t, hsa_agent_t> numa_node_to_cpu_agent;
std::unordered_map<long long, long long> gpu_numa_nodes_near_cpu;
std::vector<hsa_agent_t> gpu_agents;
void Initialize(HsaApiTable* table) {
InitKsymbols();
// Save the HSA core api and amd_ext api.
long long gpu_numa_nodes_start = 0;
SetCoreApiTable(*table->core_);
SetAmdExtTable(table->amd_ext_);
// TODO(aelwazir): FIXME, this is a workaround for the issue of allocating buffers on KernArg
// pools that are nearest to a GPU which is not NUMA-local to the CPU. This should be removed
// once ROCR provides such an API.
std::string path = "/sys/class/kfd/kfd/topology/nodes";
for (const auto& entry : fs::directory_iterator(path)) {
long long node_id = std::stoll(entry.path().filename().c_str());
std::ifstream gpu_id_file;
std::string gpu_path = entry.path().c_str();
gpu_path += "/gpu_id";
gpu_id_file.open(gpu_path);
std::string gpu_id_str;
if (gpu_id_file.is_open()) {
gpu_id_file >> gpu_id_str;
long long gpu_id = std::stoll(gpu_id_str);
if (gpu_id > 0) {
gpu_numa_nodes_start = (gpu_numa_nodes_start > node_id || gpu_numa_nodes_start == 0)
? node_id
: gpu_numa_nodes_start;
}
}
gpu_id_file.close();
}
path = "/sys/class/kfd/kfd/topology/nodes";
for (const auto& entry : fs::directory_iterator(path)) {
long long node_id = std::stoll(entry.path().filename().c_str());
std::string numa_node_path = entry.path().c_str();
long long agent_id = std::stoll(entry.path().filename().c_str());
if (agent_id >= gpu_numa_nodes_start) {
numa_node_path += "/io_links";
for (const auto& numa_node_entry : fs::directory_iterator(numa_node_path)) {
std::string numa_node_entry_properties_path = numa_node_entry.path().c_str();
numa_node_entry_properties_path += "/properties";
std::ifstream gpu_properties_file;
gpu_properties_file.open(numa_node_entry_properties_path);
std::string gpu_properties_file_line;
if (gpu_properties_file.is_open()) {
while (gpu_properties_file) {
std::getline(gpu_properties_file, gpu_properties_file_line);
std::string delimiter = " ";
std::stringstream ss(gpu_properties_file_line);
std::string word;
ss >> word;
if (word.compare("node_to") == 0) {
ss >> word;
long long near_cpu_node_id = std::stoll(word);
if (near_cpu_node_id < gpu_numa_nodes_start) {
gpu_numa_nodes_near_cpu[node_id] = near_cpu_node_id;
}
}
}
}
gpu_properties_file.close();
}
}
}
// Enumerate the agents.
if (GetCoreApiTable().hsa_iterate_agents_fn(
[](hsa_agent_t agent, void* data) {
@@ -777,10 +845,16 @@ void Initialize(HsaApiTable* table) {
case HSA_DEVICE_TYPE_CPU:
agent_info.setIndex(cpu_agent_count++);
cpu_agent = agent;
rocmtools::queue::InitializePools(cpu_agent);
rocmtools::queue::InitializePools(cpu_agent, &agent_info);
uint32_t cpu_numa_node_id;
if (GetCoreApiTable().hsa_agent_get_info_fn(
agent, HSA_AGENT_INFO_NODE, &cpu_numa_node_id) != HSA_STATUS_SUCCESS)
rocmtools::fatal("hsa_agent_get_info(HSA_AGENT_INFO_NODE) failed");
agent_info.setNumaNode(cpu_numa_node_id);
numa_node_to_cpu_agent[cpu_numa_node_id] = agent;
break;
case HSA_DEVICE_TYPE_GPU:
// XXX FIXME: When multiple ranks are used, each rank's first
// TODO(FIXME): When multiple ranks are used, each rank's first
// logical device always has GPU ID 0, regardless of which
// physical device is selected with CUDA_VISIBLE_DEVICES.
// Because of this, when merging traces from multiple ranks,
@@ -794,6 +868,15 @@ void Initialize(HsaApiTable* table) {
// is currently doing as well as the roctracer compatibility
// code earlier in this file.
agent_info.setIndex(gpu_agent_count++);
uint32_t gpu_cpu_numa_node_id;
if (GetCoreApiTable().hsa_agent_get_info_fn(
agent, HSA_AGENT_INFO_NODE, &gpu_cpu_numa_node_id) != HSA_STATUS_SUCCESS)
rocmtools::fatal("hsa_agent_get_info(HSA_AGENT_INFO_NODE) failed");
agent_info.setNumaNode(gpu_cpu_numa_node_id);
agent_info.setNearCpuAgent(
numa_node_to_cpu_agent[gpu_numa_nodes_near_cpu[gpu_cpu_numa_node_id]]);
rocmtools::queue::InitializeGPUPool(agent, &agent_info);
gpu_agents.push_back(agent);
break;
default:
agent_info.setIndex(other_agent_count++);
@@ -805,6 +888,19 @@ void Initialize(HsaApiTable* table) {
nullptr) != HSA_STATUS_SUCCESS)
rocmtools::fatal("hsa_iterate_agents failed");
for (auto& agent : gpu_agents) {
GetAgentInfo(agent.handle).cpu_pool =
GetAgentInfo(GetAgentInfo(agent.handle).getNearCpuAgent().handle).cpu_pool;
GetAgentInfo(agent.handle).kernarg_pool =
GetAgentInfo(GetAgentInfo(agent.handle).getNearCpuAgent().handle).kernarg_pool;
}
rocmtools::queue::CheckPacketReqiurements(gpu_agents);
gpu_agents.clear();
numa_node_to_cpu_agent.clear();
gpu_numa_nodes_near_cpu.clear();
SetHSALoaderApi();
roctracer::hsa_support::Initialize_roctracer(table);
+46 -81
Zobrazit soubor
@@ -40,6 +40,7 @@
#include "src/core/hsa/hsa_common.h"
#include "src/core/counters/metrics/metrics.h"
#include "src/core/hardware/hsa_info.h"
#define ASSERTM(exp, msg) assert(((void)msg, exp))
@@ -58,11 +59,11 @@ namespace Packet {
static const size_t MEM_PAGE_BYTES = 0x1000;
static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1;
hsa_amd_memory_pool_t command_pool;
hsa_amd_memory_pool_t output_pool;
// hsa_amd_memory_pool_t command_pool;
// hsa_amd_memory_pool_t output_pool;
hsa_amd_memory_pool_t& GetCommandPool() { return command_pool; }
hsa_amd_memory_pool_t& GetOutputPool() { return output_pool; }
// hsa_amd_memory_pool_t& GetCommandPool() { return command_pool; }
// hsa_amd_memory_pool_t& GetOutputPool() { return output_pool; }
// This function checks to see if the provided
// pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true,
@@ -111,22 +112,38 @@ hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) {
return FindGlobalPool(pool, data, true);
}
void InitializePools(hsa_agent_t cpu_agent) {
void InitializePools(hsa_agent_t cpu_agent, Agent::AgentInfo* agent_info) {
hsa_status_t status =
rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(
cpu_agent, FindStandardPool, &command_pool);
cpu_agent, FindStandardPool, &(agent_info->cpu_pool));
if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Command Buffer Pool is not initialized\n");
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(
cpu_agent, FindKernArgPool, &output_pool);
cpu_agent, FindKernArgPool, &(agent_info->kernarg_pool));
if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Output Buffer Pool is not initialized\n");
}
// Walks the memory pools of `gpu_agent` with the FindStandardPool callback and
// lets that callback record its pick in agent_info->gpu_pool.
void InitializeGPUPool(hsa_agent_t gpu_agent, Agent::AgentInfo* agent_info) {
  // Destination handed to the callback as its user-data pointer.
  hsa_amd_memory_pool_t* const gpu_pool_out = &agent_info->gpu_pool;

  hsa_status_t status =
      hsa_amd_agent_iterate_memory_pools(gpu_agent, FindStandardPool, gpu_pool_out);

  // Result handling is delegated to the file's CHECK_HSA_STATUS macro.
  CHECK_HSA_STATUS("hsa_amd_agent_iterate_memory_pools(gpu_pool)", status);
}
// Pair of 32-bit values describing a hardware block.
// NOTE(review): presumably `id` is the block type and `index` the instance of
// that block — confirm against the code that uses this struct.
struct block_des_t {
uint32_t id;
uint32_t index;
};
std::map<uint32_t, rocmtools::MetricsDict*> metricsDict;
void CheckPacketReqiurements(std::vector<hsa_agent_t>& gpu_agents) {
for (auto& gpu_agent : gpu_agents) {
// get the instance of MetricsDict
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
metricsDict[gpu_agent.handle] = rocmtools::MetricsDict::Create(&agentInfo);
}
}
// Initialize the PM4 commands with having the CPU&GPU agents, the counters,
// counters count to output three packets which are start, stop and read
// packets
@@ -136,8 +153,6 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
hsa_status_t status = HSA_STATUS_SUCCESS;
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
// get the instance of MetricsDict
rocmtools::MetricsDict* metricsDict = rocmtools::MetricsDict::Create(&agentInfo);
std::map<std::string, rocmtools::results_t*> results_map;
std::vector<rocmtools::event_t> events_list;
std::vector<rocmtools::results_t*> results_list;
@@ -158,9 +173,9 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
counters_count++;
}
rocmtools::metrics::ExtractMetricEvents(counter_names, gpu_agent, metricsDict, results_map,
events_list, results_list, event_to_max_block_count,
metrics_counters);
rocmtools::metrics::ExtractMetricEvents(counter_names, gpu_agent, metricsDict[gpu_agent.handle],
results_map, events_list, results_list,
event_to_max_block_count, metrics_counters);
// TODO: validate needs to be called on each events_list[i]
// Validating the events array for the specified gpu agent
@@ -171,8 +186,6 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
throw("Error: Events are not valid for the current gpu agent");
}
// std::cout << "Max Block Counters: " << max_block_counters << std::endl;
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>*
profiles = new std::vector<
std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>();
@@ -197,27 +210,9 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
counter_val_iteration++;
block_max_events_count[std::make_pair<uint32_t, uint32_t>(
static_cast<uint32_t>(event->block_name), static_cast<uint32_t>(event->block_index))]++;
// std::cout << "Block Name: " << event->block_name << " Block Index: " <<
// event->block_index
// << " Current Count: "
// << block_max_events_count[std::make_pair<uint32_t, uint32_t>(
// static_cast<uint32_t>(event->block_name),
// static_cast<uint32_t>(event->block_index))]
// << std::endl;
// std::cout << "Counter Taken: " << event->block_index << ", " << event->counter_id << " "
// << block_max_events_count[std::make_pair<uint32_t, uint32_t>(
// static_cast<uint32_t>(event->block_name),
// static_cast<uint32_t>(event->block_index))]
// << ":"
// << event_to_max_block_count[std::make_pair<uint32_t, uint32_t>(
// static_cast<uint32_t>(event->block_name),
// static_cast<uint32_t>(event->block_index))]
// << std::endl;
results_list.erase(result);
events_list.erase(event);
} else {
// std::cout << "Counter Left: " << event->block_index << ", " << event->counter_id
// << std::endl;
event++;
result++;
}
@@ -234,7 +229,7 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
counter_names.end()) {
// std::cout << "Counter from Result List: " << result->name << std::endl;
counters_taken.insert(result->name);
metric = const_cast<rocmtools::Metric*>(metricsDict->Get(result->name));
metric = const_cast<rocmtools::Metric*>(metricsDict[gpu_agent.handle]->Get(result->name));
if (metric == nullptr) std::cout << result->name << " not found in metricsDict\n";
context->metrics_list.push_back(metric);
} else {
@@ -257,7 +252,6 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
}
}
if (flag) metrics_taken.insert(result.first);
// std::cout << "Metric to be checked from map: " << result.first << std::endl;
}
}
@@ -267,8 +261,6 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
for (auto metric_counter_name : metrics_counters.at(metric_name)) {
if (metrics_counters_taken.find(metric_counter_name) == metrics_counters_taken.end() &&
counters_taken.find(metric_counter_name) == counters_taken.end()) {
// std::cout << metric_counter_name << " for " << metric_name << " is not found!"
// << std::endl;
flag = false;
continue;
}
@@ -276,14 +268,15 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
if (flag) {
// std::cout << "Counter from Result Map: " << metric_name << std::endl;
counters_taken.insert(metric_name);
rocmtools::Metric* metric = const_cast<rocmtools::Metric*>(metricsDict->Get(metric_name));
rocmtools::Metric* metric =
const_cast<rocmtools::Metric*>(metricsDict[gpu_agent.handle]->Get(metric_name));
if (metric == nullptr) std::cout << metric_name << " not found in metricsDict\n";
context->metrics_list.push_back(metric);
}
}
context->results_map = results_map;
context->metrics_dict = metricsDict;
context->metrics_dict = metricsDict[gpu_agent.handle];
hsa_ven_amd_aqlprofile_parameter_t* params = {};
@@ -323,7 +316,7 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
size_t size = profile->command_buffer.size;
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
command_pool, size, 0, reinterpret_cast<void**>(&(profile->command_buffer.ptr)));
agentInfo.cpu_pool, size, 0, reinterpret_cast<void**>(&(profile->command_buffer.ptr)));
// Both the CPU and GPU can access the memory
if (status == HSA_STATUS_SUCCESS) {
@@ -349,7 +342,7 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
size = profile->output_buffer.size;
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
output_pool, size, 0, reinterpret_cast<void**>(&profile->output_buffer.ptr));
agentInfo.kernarg_pool, size, 0, reinterpret_cast<void**>(&profile->output_buffer.ptr));
if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
printf("Error: Ran out of GPU memory to allocate Output Buffer\n");
@@ -435,20 +428,14 @@ hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets(
// Preparing and getting the size of the command and output buffers
status = hsa_ven_amd_aqlprofile_start(profile, NULL);
// Preparing and Initializing bool of buffers for command and output buffers
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(
cpu_agent, FindStandardPool, &command_pool);
if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Command Buffer Pool is not initialized\n");
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(
cpu_agent, FindKernArgPool, &output_pool);
if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Output Buffer Pool is not initialized\n");
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
// Allocating Command Buffer
status = HSA_STATUS_ERROR;
size_t size = profile->command_buffer.size;
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
command_pool, size, 0, reinterpret_cast<void**>(&command_buffer));
agentInfo.cpu_pool, size, 0, reinterpret_cast<void**>(&command_buffer));
// Both the CPU and GPU can access the memory
if (status == HSA_STATUS_SUCCESS) {
hsa_agent_t ag_list[1] = {gpu_agent};
@@ -463,7 +450,7 @@ hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets(
size = profile->output_buffer.size;
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
output_pool, size, 0, reinterpret_cast<void**>(&output_buffer));
agentInfo.kernarg_pool, size, 0, reinterpret_cast<void**>(&output_buffer));
// Both the CPU and GPU can access the kernel arguments
if (status == HSA_STATUS_SUCCESS) {
hsa_agent_t ag_list[1] = {gpu_agent};
@@ -517,14 +504,14 @@ uint8_t* AllocateLocalMemory(size_t size, hsa_amd_memory_pool_t* gpu_pool) {
return ptr;
}
hsa_status_t Allocate(hsa_agent_t gpu_agent, hsa_ven_amd_aqlprofile_profile_t* profile,
hsa_amd_memory_pool_t* cpu_pool, hsa_amd_memory_pool_t* gpu_pool) {
hsa_status_t Allocate(hsa_agent_t gpu_agent, hsa_ven_amd_aqlprofile_profile_t* profile) {
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
profile->command_buffer.ptr =
AllocateSysMemory(gpu_agent, profile->command_buffer.size, cpu_pool);
AllocateSysMemory(gpu_agent, profile->command_buffer.size, &agentInfo.cpu_pool);
profile->output_buffer.size = g_output_buffer_size;
profile->output_buffer.ptr = (g_output_buffer_local)
? AllocateLocalMemory(profile->output_buffer.size, gpu_pool)
: AllocateSysMemory(gpu_agent, profile->output_buffer.size, cpu_pool);
? AllocateLocalMemory(profile->output_buffer.size, &agentInfo.gpu_pool)
: AllocateSysMemory(gpu_agent, profile->output_buffer.size, &agentInfo.cpu_pool);
return (profile->command_buffer.ptr && profile->output_buffer.ptr) ? HSA_STATUS_SUCCESS
: HSA_STATUS_ERROR;
}
@@ -574,49 +561,27 @@ hsa_ven_amd_aqlprofile_profile_t* GenerateATTPackets(
hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
std::vector<hsa_ven_amd_aqlprofile_parameter_t>& att_params, packet_t* start_packet,
packet_t* stop_packet) {
att_memory_pools_t* att_mem_pools = NULL;
auto it = GetAttMemPoolsMap()->find(gpu_agent.handle);
if (it == GetAttMemPoolsMap()->end()) {
att_mem_pools = new att_memory_pools_t;
// Allocate memory pools for cpu and gpu
AllocateMemoryPools(cpu_agent, gpu_agent, &att_mem_pools->cpu_mem_pool,
&att_mem_pools->gpu_mem_pool);
GetAttMemPoolsMap()->emplace(gpu_agent.handle, att_mem_pools);
} else
att_mem_pools = it->second;
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion-null"
// Preparing the profile structure to get the packets
hsa_ven_amd_aqlprofile_profile_t* profile =
new hsa_ven_amd_aqlprofile_profile_t{gpu_agent,
HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE,
nullptr,
0,
&att_params[0],
(uint32_t)att_params.size(),
NULL,
NULL};
new hsa_ven_amd_aqlprofile_profile_t{gpu_agent, HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE,
nullptr, 0,
&att_params[0], (uint32_t)att_params.size(),
NULL, NULL};
#pragma GCC diagnostic pop
// Check the profile buffer sizes
hsa_status_t status = hsa_ven_amd_aqlprofile_start(profile, NULL);
if (status != HSA_STATUS_SUCCESS) printf("Error: aqlprofile_start(NULL)");
// // Double output buffer size if concurrent
// if (is_concurrent) profile.output_buffer.size *= 2;
// TODO: create a separate class for memory allocations
// Maintain pools per device
// handle allocation and resource cleanup
// Allocate command and output buffers
// command buffer -> from CPU memory pool
// output buffer -> from GPU memory pool
status =
Allocate(gpu_agent, profile, &att_mem_pools->cpu_mem_pool, &att_mem_pools->gpu_mem_pool);
status = Allocate(gpu_agent, profile);
if (status != HSA_STATUS_SUCCESS) printf("Error: Allocate()");
// Generate start/stop/read profiling packets
+5 -5
Zobrazit soubor
@@ -37,8 +37,6 @@
namespace Packet {
static std::mutex pool_lock;
typedef hsa_ext_amd_aql_pm4_packet_t packet_t;
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>*
@@ -47,7 +45,8 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
uint8_t* AllocateSysMemory(hsa_agent_t gpu_agent, size_t size, hsa_amd_memory_pool_t* cpu_pool);
void GetCommandBufferMap(std::map<size_t, uint8_t*>);
void GetOutputBufferMap(std::map<size_t, uint8_t*>);
void InitializePools(hsa_agent_t cpu_agent);
void InitializePools(hsa_agent_t cpu_agent, Agent::AgentInfo* agent_info);
void InitializeGPUPool(hsa_agent_t gpu_agent, Agent::AgentInfo* agent_info);
hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets(
hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, hsa_ven_amd_aqlprofile_event_t* events,
uint32_t event_count, packet_t* start_packet, packet_t* stop_packet, packet_t* read_packet);
@@ -63,9 +62,10 @@ hsa_ven_amd_aqlprofile_profile_t* GenerateATTPackets(
uint8_t* AllocateSysMemory(hsa_agent_t gpu_agent, size_t size, hsa_amd_memory_pool_t* cpu_pool);
void get_command_buffer_map(std::map<size_t, uint8_t*> );
void get_outbuffer_map(std::map<size_t, uint8_t*> );
void get_command_buffer_map(std::map<size_t, uint8_t*>);
void get_outbuffer_map(std::map<size_t, uint8_t*>);
void initialize_pools(hsa_agent_t cpu_agent);
void CheckPacketReqiurements(std::vector<hsa_agent_t>& gpu_agents);
typedef struct {
hsa_amd_memory_pool_t cpu_mem_pool;
+83 -157
Zobrazit soubor
@@ -27,6 +27,7 @@
#include <utility>
#include <algorithm>
#include "rocprofiler.h"
#include "src/api/rocmtool.h"
#include "src/core/hsa/packets/packets_generator.h"
#include "src/core/hsa/hsa_support.h"
@@ -284,7 +285,7 @@ hsa_status_t attTraceDataCallback(hsa_ven_amd_aqlprofile_info_type_t info_type,
att_trace_callback_data_t* passed_data = reinterpret_cast<att_trace_callback_data_t*>(data);
passed_data->push_back(*info_data);
// TODO: clear output buffers after copying
// either copy here or in AddattRecord
// either copy here or in ::AddAttRecord
return status;
}
@@ -314,12 +315,10 @@ void AddRecordCounters(rocprofiler_record_profiler_t* record, const pending_sign
void AddAttRecord(rocprofiler_record_att_tracer_t* record, hsa_agent_t gpu_agent,
att_pending_signal_t& pending) {
Agent::AgentInfo agent_info = hsa_support::GetAgentInfo(gpu_agent.handle);
att_trace_callback_data_t data;
hsa_ven_amd_aqlprofile_iterate_data(pending.profile, attTraceDataCallback, &data);
// Get CPU and GPU memory pools
Packet::att_memory_pools_t* att_mem_pools = Packet::GetAttMemPools(gpu_agent);
// Allocate memory for shader_engine_data
record->shader_engine_data = static_cast<rocprofiler_record_se_att_data_t*>(
calloc(data.size(), sizeof(rocprofiler_record_se_att_data_t)));
@@ -336,7 +335,7 @@ void AddAttRecord(rocprofiler_record_att_tracer_t* record, hsa_agent_t gpu_agent
void* buffer = NULL;
if (data_size != 0) {
// Allocate buffer on CPU to copy out trace data
buffer = Packet::AllocateSysMemory(gpu_agent, data_size, &att_mem_pools->cpu_mem_pool);
buffer = Packet::AllocateSysMemory(gpu_agent, data_size, &agent_info.cpu_pool);
if (buffer == NULL) fatal("Trace data buffer allocation failed");
auto status =
@@ -353,83 +352,6 @@ void AddAttRecord(rocprofiler_record_att_tracer_t* record, hsa_agent_t gpu_agent
record->shader_engine_data_count = data.size();
}
// static const size_t MEM_PAGE_BYTES = 0x1000;
// static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1;
// static std::mutex begin_signal_lock;
// bool BeginSignalHandler(hsa_signal_value_t signal_value, void* data) {
// std::lock_guard<std::mutex> lock(begin_signal_lock);
// auto profiling_context =
// static_cast<std::pair<rocmtools::profiling_context_t*,
// hsa_ven_amd_aqlprofile_profile_t*>*>(
// data);
// if (!profiling_context->first->begin_completed.load(std::memory_order_relaxed)) {
// std::cout << "BeginSignalHandler is called" << std::endl;
// hsa_status_t status = HSA_STATUS_ERROR;
// size_t size = profiling_context->second->command_buffer.size;
// size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
// status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
// Packet::GetCommandPool(), size, 0,
// reinterpret_cast<void**>(&(profiling_context->second->command_buffer.ptr)));
// // Both the CPU and GPU can access the memory
// if (status == HSA_STATUS_SUCCESS) {
// hsa_agent_t ag_list[1] = {profiling_context->first->gpu_agent};
// status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
// 1, ag_list, NULL, profiling_context->second->command_buffer.ptr);
// if (status != HSA_STATUS_SUCCESS) {
// printf("Error: Can't allow access for both agents to Command Buffer\n");
// }
// } else if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
// printf("Error: Ran out of GPU memory to allocate Command Buffer\n");
// } else {
// const char* hsa_err_str = NULL;
// if (hsa_status_string(status, &hsa_err_str) != HSA_STATUS_SUCCESS) hsa_err_str = "Unknown";
// printf("Error: Allocating command Buffer (Size=%lu) (%s)\n", size, hsa_err_str);
// }
// status = HSA_STATUS_ERROR;
// size = profiling_context->second->output_buffer.size;
// size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
// status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
// Packet::GetOutputPool(), size, 0,
// reinterpret_cast<void**>(&profiling_context->second->output_buffer.ptr));
// if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
// printf("Error: Ran out of GPU memory to allocate Output Buffer\n");
// }
// if (status == HSA_STATUS_SUCCESS) {
// hsa_agent_t ag_list[1] = {profiling_context->first->gpu_agent};
// status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
// 1, ag_list, NULL, profiling_context->second->output_buffer.ptr);
// if (status == HSA_STATUS_SUCCESS) {
// memset(profiling_context->second->output_buffer.ptr, 0x0,
// profiling_context->second->output_buffer.size);
// // Creating the start/stop/read packets
// status = hsa_ven_amd_aqlprofile_start(profiling_context->second,
// profiling_context->first->start_packet);
// status = hsa_ven_amd_aqlprofile_stop(profiling_context->second,
// profiling_context->first->stop_packet);
// status = hsa_ven_amd_aqlprofile_read(profiling_context->second,
// profiling_context->first->read_packet);
// } else {
// printf("Error: Can't allow access for both agents to output Buffer\n");
// }
// } else {
// const char* hsa_err_str = NULL;
// if (hsa_status_string(status, &hsa_err_str) != HSA_STATUS_SUCCESS) hsa_err_str = "Unknown";
// printf("Error: Allocating output Buffer (%s)\n", hsa_err_str);
// }
// profiling_context->first->begin_completed.exchange(true, std::memory_order_relaxed);
// }
// return true;
// }
bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) {
auto queue_info_session = static_cast<queue_info_session_t*>(data);
if (!queue_info_session || !GetROCMToolObj() ||
@@ -647,6 +569,75 @@ template <typename Integral> constexpr Integral bit_extract(Integral x, int firs
return (x >> first) & bit_mask<Integral>(0, last - first);
}
// Id of the session whose configuration is cached in the variables below.
// A handle of 0 makes CheckNeededProfileConfigs() re-evaluate the current session.
rocprofiler_session_id_t session_id = rocprofiler_session_id_t{0};
// Counter names read from the counters-collection filter of the cached session.
std::vector<std::string> session_data;
// Buffer id taken from the filter selected by CheckNeededProfileConfigs().
rocprofiler_buffer_id_t buffer_id;
// Number of entries in session_data.
uint64_t session_data_count = 0;
// Collection-mode flags: CheckNeededProfileConfigs() sets one of them to true
// when the session has a filter of the matching kind.
bool is_counter_collection_mode = false;
bool is_timestamp_collection_mode = false;
bool is_att_collection_mode = false;
bool is_pc_sampling_collection_mode = false;
// ATT parameters read from the ATT-trace filter.
std::vector<rocprofiler_att_parameter_t> att_parameters_data;
// NOTE(review): not written by any code visible in this section — confirm its users.
uint32_t replay_mode_count = 0;
// Kernel names from the ATT filter's ROCPROFILER_FILTER_KERNEL_NAMES property.
std::vector<std::string> kernel_profile_names;
// Counter names read from the ATT-trace filter.
std::vector<std::string> att_counters_names;
// Session object looked up for session_id; nullptr until one has been looked up.
rocmtools::Session* session = nullptr;
// Sets the cached session id back to 0 so the next CheckNeededProfileConfigs()
// call re-evaluates the current session.
void ResetSessionID() { session_id = rocprofiler_session_id_t{0}; }
// Refreshes the file-level profiling configuration cache (session_id, session,
// the is_*_collection_mode flags, counter names, ATT parameters, buffer_id)
// whenever the tool's current session differs from the cached one or the
// cached id is 0.
//
// Exactly one collection mode is selected per session, checked in this order:
// counters collection, dispatch timestamps, ATT trace, PC sampling.
//
// Everything derived from the previously cached session is cleared before the
// new session is inspected; otherwise a mode flag or counter list from an
// earlier session would stay active after switching to a session of another
// kind (or to no session at all).
void CheckNeededProfileConfigs() {
  // Current session id as reported by the tool object; 0 when there is no tool object.
  rocprofiler_session_id_t internal_session_id{0};
  if (GetROCMToolObj()) internal_session_id = GetROCMToolObj()->GetCurrentSessionId();

  // Cached configuration is still valid: nothing to do.
  if (session_id.handle != 0 && internal_session_id.handle == session_id.handle) return;

  session_id = internal_session_id;

  // Drop the state that belonged to the previous session.
  is_counter_collection_mode = false;
  is_timestamp_collection_mode = false;
  is_att_collection_mode = false;
  is_pc_sampling_collection_mode = false;
  session_data.clear();
  session_data_count = 0;
  att_parameters_data.clear();
  att_counters_names.clear();
  kernel_profile_names.clear();

  // No active session (or no tool object): leave every mode disabled.
  if (session_id.handle == 0 || !GetROCMToolObj()) return;

  session = GetROCMToolObj()->GetSession(session_id);
  if (!session) return;

  if (session->FindFilterWithKind(ROCPROFILER_COUNTERS_COLLECTION)) {
    rocprofiler_filter_id_t filter_id =
        session->GetFilterIdWithKind(ROCPROFILER_COUNTERS_COLLECTION);
    rocmtools::Filter* filter = session->GetFilter(filter_id);
    session_data = filter->GetCounterData();
    is_counter_collection_mode = true;
    session_data_count = session_data.size();
    buffer_id = filter->GetBufferId();
  } else if (session->FindFilterWithKind(ROCPROFILER_DISPATCH_TIMESTAMPS_COLLECTION)) {
    is_timestamp_collection_mode = true;
    rocprofiler_filter_id_t filter_id =
        session->GetFilterIdWithKind(ROCPROFILER_DISPATCH_TIMESTAMPS_COLLECTION);
    rocmtools::Filter* filter = session->GetFilter(filter_id);
    buffer_id = filter->GetBufferId();
  } else if (session->FindFilterWithKind(ROCPROFILER_ATT_TRACE_COLLECTION)) {
    rocprofiler_filter_id_t filter_id =
        session->GetFilterIdWithKind(ROCPROFILER_ATT_TRACE_COLLECTION);
    rocmtools::Filter* filter = session->GetFilter(filter_id);
    att_parameters_data = filter->GetAttParametersData();
    is_att_collection_mode = true;
    // Same filter as above, so its buffer id can be read directly.
    buffer_id = filter->GetBufferId();
    att_counters_names = filter->GetCounterData();
    kernel_profile_names = std::get<std::vector<std::string>>(
        filter->GetProperty(ROCPROFILER_FILTER_KERNEL_NAMES));
  } else if (session->FindFilterWithKind(ROCPROFILER_PC_SAMPLING_COLLECTION)) {
    is_pc_sampling_collection_mode = true;
  }
}
// File-local counter, starts at 0.
// NOTE(review): presumably counts intercepted kernel dispatches; its uses are
// outside this view — confirm.
static int KernelInterceptCount = 0;
// Atomic 32-bit id source, starts at 0.
// NOTE(review): presumably hands out writer ids; its uses are outside this
// view — confirm.
std::atomic<uint32_t> WRITER_ID{0};
/**
@@ -660,64 +651,8 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt
hsa_amd_queue_intercept_packet_writer writer) {
const Packet::packet_t* packets_arr = reinterpret_cast<const Packet::packet_t*>(packets);
std::vector<Packet::packet_t> transformed_packets;
rocprofiler_session_id_t session_id;
if (GetROCMToolObj())
// Getting Session ID
session_id = GetROCMToolObj()->GetCurrentSessionId();
else
session_id = {0};
// Counter Names declaration
std::vector<std::string> session_data;
rocprofiler_buffer_id_t buffer_id;
uint64_t session_data_count = 0;
bool is_counter_collection_mode = false;
bool is_timestamp_collection_mode = false;
bool is_att_collection_mode = false;
bool is_pc_sampling_collection_mode = false;
std::vector<rocprofiler_att_parameter_t> att_parameters_data;
uint32_t replay_mode_count = 0;
std::vector<std::string> kernel_profile_names;
std::vector<std::string> att_counters_names;
rocmtools::Session* session = nullptr;
// Getting Counters count from the Session
if (session_id.handle > 0 && GetROCMToolObj()) {
session = GetROCMToolObj()->GetSession(session_id);
if (session && session->FindFilterWithKind(ROCPROFILER_COUNTERS_COLLECTION)) {
rocprofiler_filter_id_t filter_id =
session->GetFilterIdWithKind(ROCPROFILER_COUNTERS_COLLECTION);
rocmtools::Filter* filter = session->GetFilter(filter_id);
session_data = filter->GetCounterData();
is_counter_collection_mode = true;
session_data_count = session_data.size();
buffer_id = filter->GetBufferId();
} else if (session && session->FindFilterWithKind(ROCPROFILER_DISPATCH_TIMESTAMPS_COLLECTION)) {
is_timestamp_collection_mode = true;
rocprofiler_filter_id_t filter_id =
session->GetFilterIdWithKind(ROCPROFILER_DISPATCH_TIMESTAMPS_COLLECTION);
rocmtools::Filter* filter = session->GetFilter(filter_id);
buffer_id = filter->GetBufferId();
} else if (session && session->FindFilterWithKind(ROCPROFILER_ATT_TRACE_COLLECTION)) {
rocprofiler_filter_id_t filter_id =
session->GetFilterIdWithKind(ROCPROFILER_ATT_TRACE_COLLECTION);
rocmtools::Filter* filter = session->GetFilter(filter_id);
att_parameters_data = filter->GetAttParametersData();
is_att_collection_mode = true;
buffer_id = session->GetFilter(session->GetFilterIdWithKind(ROCPROFILER_ATT_TRACE_COLLECTION))
->GetBufferId();
att_counters_names = filter->GetCounterData();
kernel_profile_names =
std::get<std::vector<std::string>>(filter->GetProperty(ROCPROFILER_FILTER_KERNEL_NAMES));
} else if (session && session->FindFilterWithKind(ROCPROFILER_PC_SAMPLING_COLLECTION)) {
is_pc_sampling_collection_mode = true;
}
}
CheckNeededProfileConfigs();
if (session_id.handle > 0 && pkt_count > 0 &&
(is_counter_collection_mode || is_timestamp_collection_mode ||
@@ -764,21 +699,6 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt
if (session_data_count > 0 && is_counter_collection_mode && profiles &&
replay_mode_count > 0) {
// hsa_signal_t begin_signal{};
// CreateSignal(0, &begin_signal);
// hsa_barrier_and_packet_t barrier{0};
// barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
// CreateSignal(0, &barrier.completion_signal);
// barrier.dep_signal[0] = hsa_signal_t{};
// Packet::packet_t* __attribute__((__may_alias__)) pkt =
// (reinterpret_cast<Packet::packet_t*>(&barrier));
// transformed_packets.emplace_back(*pkt);
// hsa_status_t status = hsa_support::GetAmdExtTable().hsa_amd_signal_async_handler_fn(
// barrier.completion_signal, HSA_SIGNAL_CONDITION_GTE, 1, BeginSignalHandler,
// &profiles->at(profile_id));
// if (status != HSA_STATUS_SUCCESS)
// fatal("hsa_amd_signal_async_handler failed for begin signal");
// Adding start packet and its barrier with a dummy signal
hsa_signal_t dummy_signal{};
dummy_signal.handle = 0;
@@ -1087,8 +1007,6 @@ Queue::Queue(const hsa_agent_t& cpu_agent, const hsa_agent_t& gpu_agent, uint32_
*queue = intercept_queue_;
}
// Queue::~Queue() { std::lock_guard<std::mutex> lk(mutex_); }
// Returns the stored intercept queue pointer (intercept_queue_).
hsa_queue_t* Queue::GetCurrentInterceptQueue() {
  hsa_queue_t* const current_queue = intercept_queue_;
  return current_queue;
}
// Returns a copy of the GPU agent handle stored in this queue (gpu_agent_).
hsa_agent_t Queue::GetGPUAgent() {
  return gpu_agent_;
}
@@ -1097,7 +1015,15 @@ hsa_agent_t Queue::GetCPUAgent() { return cpu_agent_; }
// Returns the `id` field of the stored intercept queue.
// intercept_queue_ is dereferenced without a null check.
uint64_t Queue::GetQueueID() {
  const uint64_t queue_id = intercept_queue_->id;
  return queue_id;
}
// Forwards pool initialization for `cpu_agent` to the Packet layer.
void InitializePools(hsa_agent_t cpu_agent) {
  Packet::InitializePools(cpu_agent);
}
// Forwards pool initialization to the Packet layer.
// @param cpu_agent   agent handle passed through unchanged
// @param agent_info  agent description passed through unchanged
void InitializePools(hsa_agent_t cpu_agent, Agent::AgentInfo* agent_info)
{
  Packet::InitializePools(cpu_agent, agent_info);
}
// Forwards GPU pool initialization to the Packet layer.
// @param gpu_agent   agent handle passed through unchanged
// @param agent_info  agent description passed through unchanged
void InitializeGPUPool(hsa_agent_t gpu_agent, Agent::AgentInfo* agent_info)
{
  Packet::InitializeGPUPool(gpu_agent, agent_info);
}
// Forwards the packet requirement check for `gpu_agents` to the Packet layer.
// (The spelling of the name is kept as-is because callers depend on it.)
void CheckPacketReqiurements(std::vector<hsa_agent_t>& gpu_agents)
{
  Packet::CheckPacketReqiurements(gpu_agents);
}
} // namespace queue
} // namespace rocmtools
+5 -1
Zobrazit soubor
@@ -86,7 +86,11 @@ struct queue_info_session_t {
// NOTE(review): definition is outside this view; presumably fills `record`
// with the counter results that belong to `pending` — confirm.
void AddRecordCounters(rocprofiler_record_profiler_t* record, const pending_signal_t& pending);
// Forwards to Packet::InitializePools(cpu_agent).
void InitializePools(hsa_agent_t cpu_agent);
// Forwards to Packet::InitializePools(cpu_agent, agent_info).
void InitializePools(hsa_agent_t cpu_agent, Agent::AgentInfo* agent_info);
// Forwards to Packet::InitializeGPUPool(gpu_agent, agent_info).
void InitializeGPUPool(hsa_agent_t gpu_agent, Agent::AgentInfo* agent_info);
// Forwards to Packet::CheckPacketReqiurements(gpu_agents).
void CheckPacketReqiurements(std::vector<hsa_agent_t>& gpu_agents);
// Resets the session id cached by the queue layer to handle 0.
void ResetSessionID();
} // namespace queue
} // namespace rocmtools
+14 -9
Zobrazit soubor
@@ -47,6 +47,7 @@ Session::~Session() {
while (GetCurrentActiveInterruptSignalsCount() > 0) {
}
if (profiler_started_.load(std::memory_order_release)) {
rocmtools::queue::ResetSessionID();
delete profiler_;
profiler_started_.exchange(false, std::memory_order_release);
}
@@ -77,7 +78,8 @@ void Session::DisableTools(rocprofiler_buffer_id_t buffer_id) {
}
}
if (FindFilterWithKind(ROCPROFILER_API_TRACE) &&
GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetBufferId().value == buffer_id.value) {
GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetBufferId().value ==
buffer_id.value) {
if (tracer_started_.load(std::memory_order_release)) {
tracer_->DisableRoctracer();
}
@@ -131,9 +133,10 @@ void Session::Start() {
GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetTraceData();
if (!tracer_started_.load(std::memory_order_release)) {
tracer_ = new tracer::Tracer(
session_id_, (GetFilter(
GetFilterIdWithKind(ROCPROFILER_API_TRACE))->HasCallback() ? GetFilter(
GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetCallback() : nullptr),
session_id_,
(GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->HasCallback()
? GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetCallback()
: nullptr),
GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetBufferId(), domains);
tracer_started_.exchange(true, std::memory_order_release);
}
@@ -144,7 +147,8 @@ void Session::Start() {
if (!pc_sampler_started_.load(std::memory_order_release)) {
pc_sampler_ = new pc_sampler::PCSampler(
GetFilter(GetFilterIdWithKind(ROCPROFILER_PC_SAMPLING_COLLECTION))->GetBufferId(),
GetFilter(GetFilterIdWithKind(ROCPROFILER_PC_SAMPLING_COLLECTION))->GetId(), session_id_);
GetFilter(GetFilterIdWithKind(ROCPROFILER_PC_SAMPLING_COLLECTION))->GetId(),
session_id_);
pc_sampler_started_.exchange(true, std::memory_order_release);
}
pc_sampler_->Start();
@@ -167,6 +171,7 @@ void Session::Start() {
void Session::Terminate() {
if (is_active_) {
rocmtools::queue::ResetSessionID();
std::lock_guard<std::mutex> lock(session_lock_);
if (FindFilterWithKind(ROCPROFILER_SPM_COLLECTION)) {
{
@@ -214,9 +219,9 @@ pc_sampler::PCSampler* Session::GetPCSampler() { return pc_sampler_; }
CountersSampler* Session::GetCountersSampler() { return counters_sampler_; }
rocprofiler_filter_id_t Session::CreateFilter(rocprofiler_filter_kind_t filter_kind,
rocprofiler_filter_data_t filter_data,
uint64_t data_count,
rocprofiler_filter_property_t property) {
rocprofiler_filter_data_t filter_data,
uint64_t data_count,
rocprofiler_filter_property_t property) {
rocprofiler_filter_id_t id =
rocprofiler_filter_id_t{filters_counter_.fetch_add(1, std::memory_order_release)};
{
@@ -287,7 +292,7 @@ rocprofiler_filter_id_t Session::GetFilterIdWithKind(rocprofiler_filter_kind_t k
bool Session::HasBuffer() { return buffers_.size() > 0; }
rocprofiler_buffer_id_t Session::CreateBuffer(rocprofiler_buffer_callback_t buffer_callback,
size_t buffer_size) {
size_t buffer_size) {
rocprofiler_buffer_id_t id =
rocprofiler_buffer_id_t{buffers_counter_.fetch_add(1, std::memory_order_release)};
{
+1 -1
Zobrazit soubor
@@ -489,7 +489,7 @@ uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t
}
// Allocate memory to pass kernel parameters.
// Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter.
// Memory is allocated accessible for all CPU agents and for GPU given by AgentInfo parameter.
// @param agent_info Agent from whose memory region to allocate
// @param size Size of memory in terms of bytes
// @return uint8_t* Pointer to buffer, null if allocation fails.