From 935b40b837cd2cd12191ccbf4bdf7a6f813d10cb Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Fri, 26 May 2023 05:38:49 +0000 Subject: [PATCH] Fixing Multiple Profiler Issues: 1- Fixing Output Buffer Issues 2- Metrics Instances created for all GPUs in Init 3- Multi CPU/GPU/KernArg Pools are initialized for every Agent in Init 4- Lowering OverHead in the Packet WriteInterceptor in the ProxyQueue for both Profile Counting & ATT Change-Id: Ic3d78156af8405bb134d01584574c339237d265f [ROCm/rocprofiler commit: 00ecca25c79887ac195fa771ab2f2723685d975f] --- .../core/counters/metrics/eval_metrics.cpp | 6 +- .../src/core/hardware/hsa_info.cpp | 39 +-- .../rocprofiler/src/core/hardware/hsa_info.h | 14 + .../rocprofiler/src/core/hsa/hsa_support.cpp | 100 +++++++- .../core/hsa/packets/packets_generator.cpp | 127 ++++----- .../src/core/hsa/packets/packets_generator.h | 10 +- .../rocprofiler/src/core/hsa/queues/queue.cpp | 240 ++++++------------ .../rocprofiler/src/core/hsa/queues/queue.h | 6 +- .../rocprofiler/src/core/session/session.cpp | 23 +- .../rocprofiler/src/util/hsa_rsrc_factory.cpp | 2 +- 10 files changed, 289 insertions(+), 278 deletions(-) diff --git a/projects/rocprofiler/src/core/counters/metrics/eval_metrics.cpp b/projects/rocprofiler/src/core/counters/metrics/eval_metrics.cpp index e6c91608c6..9bc894fe71 100644 --- a/projects/rocprofiler/src/core/counters/metrics/eval_metrics.cpp +++ b/projects/rocprofiler/src/core/counters/metrics/eval_metrics.cpp @@ -99,10 +99,10 @@ bool metrics::ExtractMetricEvents( const Metric* metric = metrics_dict->Get(metric_names[i]); if (metric == nullptr) { Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle); - fatal("input metric'%s' not supported on this hardware: %s ", metric_names[i].c_str(), + fatal("input metric'%s' not supported on this hardware: %s ", metric_names[i].c_str(), agentInfo.getName().data()); - } + } // adding result object for derived metric std::lock_guard lock(extract_metric_events_lock); @@ -185,7 +185,7 @@ bool metrics::ExtractMetricEvents( } -bool metrics::GetCounterData(hsa_ven_amd_aqlprofile_profile_t* profile, hsa_agent_t gpu_agent, +bool metrics::GetCounterData(hsa_ven_amd_aqlprofile_profile_t* profile, hsa_agent_t gpu_agent, std::vector& results_list) { uint32_t xcc_count = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle).getXccCount(); uint32_t single_xcc_buff_size = profile->output_buffer.size /(sizeof(uint64_t) * xcc_count); diff --git a/projects/rocprofiler/src/core/hardware/hsa_info.cpp b/projects/rocprofiler/src/core/hardware/hsa_info.cpp index 7c3ea2ca66..e2255f839f 100644 --- a/projects/rocprofiler/src/core/hardware/hsa_info.cpp +++ b/projects/rocprofiler/src/core/hardware/hsa_info.cpp @@ -27,7 +27,7 @@ if ((status) != HSA_STATUS_SUCCESS) { \ const char* emsg = 0; \ hsa_status_string(status, &emsg); \ - throw(ROCPROFILER_STATUS_ERROR_HSA_SUPPORT, \ + throw(ROCPROFILER_STATUS_ERROR_HSA_SUPPORT, \ "Error: " << msg << ": " << emsg ? emsg : ""); \ } \ } while (0) @@ -62,32 +62,27 @@ AgentInfo::AgentInfo(const hsa_agent_t agent, ::CoreApiTable* table) : handle_(a table->hsa_agent_get_info_fn( agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &se_num_); - if (table->hsa_agent_get_info_fn( - agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE, - &shader_arrays_per_se_) != HSA_STATUS_SUCCESS || - table->hsa_agent_get_info_fn( - agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, - &waves_per_cu_) != HSA_STATUS_SUCCESS) - { - rocmtools::fatal("hsa_agent_get_info for gfxip hardware configuration failed"); + if (table->hsa_agent_get_info_fn(agent, + (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE, + &shader_arrays_per_se_) != HSA_STATUS_SUCCESS || + table->hsa_agent_get_info_fn(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, + &waves_per_cu_) != HSA_STATUS_SUCCESS) { + rocmtools::fatal("hsa_agent_get_info for gfxip hardware configuration failed"); } compute_units_per_sh_ = cu_num_ / (se_num_ * shader_arrays_per_se_); wave_slots_per_simd_ = waves_per_cu_ / simds_per_cu_; - if (table->hsa_agent_get_info_fn( - agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_DOMAIN, - &pci_domain_) != HSA_STATUS_SUCCESS || - table->hsa_agent_get_info_fn( - agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, - &pci_location_id_) != HSA_STATUS_SUCCESS) - { - rocmtools::fatal("hsa_agent_get_info for PCI info failed"); + if (table->hsa_agent_get_info_fn(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_DOMAIN, + &pci_domain_) != HSA_STATUS_SUCCESS || + table->hsa_agent_get_info_fn(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, + &pci_location_id_) != HSA_STATUS_SUCCESS) { + rocmtools::fatal("hsa_agent_get_info for PCI info failed"); } - // TODO: (sauverma) use hsa_agent_get_info_fn(HSA_AMD_AGENT_INFO_NUM_XCC) + // TODO: (sauverma) use hsa_agent_get_info_fn(HSA_AMD_AGENT_INFO_NUM_XCC) // to get xcc_num once hsa headers are updated from rocr/hsa - std::string gpu_name = std::string(name_).substr(0,6); + std::string gpu_name = std::string(name_).substr(0, 6); if (gpu_name == "gfx940") xcc_num_ = 6; else @@ -117,6 +112,12 @@ void AgentInfo::setType(hsa_device_type_t type) { type_ = type; } void AgentInfo::setHandle(uint64_t handle) { handle_ = handle; } void AgentInfo::setName(const std::string& name) { strcpy(name_, name.c_str()); } +void AgentInfo::setNumaNode(uint32_t numa_node) { numa_node_ = numa_node; } +uint32_t AgentInfo::getNumaNode() { return numa_node_; } + +void AgentInfo::setNearCpuAgent(hsa_agent_t near_cpu_agent) { near_cpu_agent_ = near_cpu_agent; } +hsa_agent_t AgentInfo::getNearCpuAgent() { return near_cpu_agent_; } + // CounterHardwareInfo Class CounterHardwareInfo::CounterHardwareInfo(uint64_t event_id, const char* block_id) diff --git a/projects/rocprofiler/src/core/hardware/hsa_info.h b/projects/rocprofiler/src/core/hardware/hsa_info.h index 2586065f19..b73c6197a7 100644 --- a/projects/rocprofiler/src/core/hardware/hsa_info.h +++ b/projects/rocprofiler/src/core/hardware/hsa_info.h @@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -66,6 +67,16 @@ class AgentInfo { void setHandle(uint64_t handle); void setName(const std::string& name); + void setNumaNode(uint32_t numa_node); + uint32_t getNumaNode(); + + void setNearCpuAgent(hsa_agent_t near_cpu_agent); + hsa_agent_t getNearCpuAgent(); + + hsa_amd_memory_pool_t cpu_pool; + hsa_amd_memory_pool_t kernarg_pool; + hsa_amd_memory_pool_t gpu_pool; + private: int index_; hsa_device_type_t type_; // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2 @@ -87,6 +98,9 @@ class AgentInfo { uint32_t pci_domain_; uint32_t pci_location_id_; + + uint32_t numa_node_; + hsa_agent_t near_cpu_agent_; }; // XXX TODO: This should be moved somewhere else so this file can be deleted diff --git a/projects/rocprofiler/src/core/hsa/hsa_support.cpp b/projects/rocprofiler/src/core/hsa/hsa_support.cpp index 481f658bd1..6e164af3c5 100644 --- a/projects/rocprofiler/src/core/hsa/hsa_support.cpp +++ b/projects/rocprofiler/src/core/hsa/hsa_support.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,7 @@ #include #include +#include "core/hardware/hsa_info.h" #include "src/core/session/tracer/src/correlation_id.h" #include "src/core/session/tracer/src/exception.h" #include "src/core/session/tracer/src/roctracer.h" @@ -48,6 +50,9 @@ #include "src/core/hsa/queues/queue.h" #include "src/api/rocmtool.h" +#include +namespace fs = std::experimental::filesystem; + namespace { hsa_status_t hsa_executable_iteration_callback(hsa_executable_t executable, hsa_agent_t agent, @@ -761,11 +766,74 @@ hsa_status_t QueueDestroyInterceptor(hsa_queue_t* hsa_queue) { return HSA_STATUS_SUCCESS; } +std::unordered_map numa_node_to_cpu_agent; +std::unordered_map gpu_numa_nodes_near_cpu; +std::vector gpu_agents; + void Initialize(HsaApiTable* table) { InitKsymbols(); // Save the HSA core api and amd_ext api. + long long gpu_numa_nodes_start = 0; + SetCoreApiTable(*table->core_); SetAmdExtTable(table->amd_ext_); + + // TODO(aelwazir): FIXME, this is a workaround for the issue of allocating buffers on KernArg + // Pools that are nearest to the GPU which is not NUMA local to the CPU. This should be remove + // once ROCR provides such API. + std::string path = "/sys/class/kfd/kfd/topology/nodes"; + for (const auto& entry : fs::directory_iterator(path)) { + long long node_id = std::stoll(entry.path().filename().c_str()); + std::ifstream gpu_id_file; + std::string gpu_path = entry.path().c_str(); + gpu_path += "/gpu_id"; + gpu_id_file.open(gpu_path); + std::string gpu_id_str; + if (gpu_id_file.is_open()) { + gpu_id_file >> gpu_id_str; + long long gpu_id = std::stoll(gpu_id_str); + if (gpu_id > 0) { + gpu_numa_nodes_start = (gpu_numa_nodes_start > node_id || gpu_numa_nodes_start == 0) + ? node_id + : gpu_numa_nodes_start; + } + } + gpu_id_file.close(); + } + path = "/sys/class/kfd/kfd/topology/nodes"; + for (const auto& entry : fs::directory_iterator(path)) { + long long node_id = std::stoll(entry.path().filename().c_str()); + std::string numa_node_path = entry.path().c_str(); + long long agent_id = std::stoll(entry.path().filename().c_str()); + if (agent_id >= gpu_numa_nodes_start) { + numa_node_path += "/io_links"; + for (const auto& numa_node_entry : fs::directory_iterator(numa_node_path)) { + std::string numa_node_entry_properties_path = numa_node_entry.path().c_str(); + numa_node_entry_properties_path += "/properties"; + std::ifstream gpu_properties_file; + gpu_properties_file.open(numa_node_entry_properties_path); + std::string gpu_properties_file_line; + if (gpu_properties_file.is_open()) { + while (gpu_properties_file) { + std::getline(gpu_properties_file, gpu_properties_file_line); + std::string delimiter = " "; + std::stringstream ss(gpu_properties_file_line); + std::string word; + ss >> word; + if (word.compare("node_to") == 0) { + ss >> word; + long long near_cpu_node_id = std::stoll(word); + if (near_cpu_node_id < gpu_numa_nodes_start) { + gpu_numa_nodes_near_cpu[node_id] = near_cpu_node_id; + } + } + } + } + gpu_properties_file.close(); + } + } + } + // Enumerate the agents. if (GetCoreApiTable().hsa_iterate_agents_fn( [](hsa_agent_t agent, void* data) { @@ -777,10 +845,16 @@ void Initialize(HsaApiTable* table) { case HSA_DEVICE_TYPE_CPU: agent_info.setIndex(cpu_agent_count++); cpu_agent = agent; - rocmtools::queue::InitializePools(cpu_agent); + rocmtools::queue::InitializePools(cpu_agent, &agent_info); + uint32_t cpu_numa_node_id; + if (GetCoreApiTable().hsa_agent_get_info_fn( + agent, HSA_AGENT_INFO_NODE, &cpu_numa_node_id) != HSA_STATUS_SUCCESS) + rocmtools::fatal("hsa_agent_get_info(HSA_AGENT_INFO_NODE) failed"); + agent_info.setNumaNode(cpu_numa_node_id); + numa_node_to_cpu_agent[cpu_numa_node_id] = agent; break; case HSA_DEVICE_TYPE_GPU: - // XXX FIXME: When multiple ranks are used, each rank's first + // TODO(FIXME): When multiple ranks are used, each rank's first // logical device always has GPU ID 0, regardless of which // physical device is selected with CUDA_VISIBLE_DEVICES. // Because of this, when merging traces from multiple ranks, @@ -794,6 +868,15 @@ void Initialize(HsaApiTable* table) { // is currently doing as well as the roctracer compatibility // code earlier in this file. agent_info.setIndex(gpu_agent_count++); + uint32_t gpu_cpu_numa_node_id; + if (GetCoreApiTable().hsa_agent_get_info_fn( + agent, HSA_AGENT_INFO_NODE, &gpu_cpu_numa_node_id) != HSA_STATUS_SUCCESS) + rocmtools::fatal("hsa_agent_get_info(HSA_AGENT_INFO_NODE) failed"); + agent_info.setNumaNode(gpu_cpu_numa_node_id); + agent_info.setNearCpuAgent( + numa_node_to_cpu_agent[gpu_numa_nodes_near_cpu[gpu_cpu_numa_node_id]]); + rocmtools::queue::InitializeGPUPool(agent, &agent_info); + gpu_agents.push_back(agent); break; default: agent_info.setIndex(other_agent_count++); @@ -805,6 +888,19 @@ void Initialize(HsaApiTable* table) { nullptr) != HSA_STATUS_SUCCESS) rocmtools::fatal("hsa_iterate_agents failed"); + for (auto& agent : gpu_agents) { + GetAgentInfo(agent.handle).cpu_pool = + GetAgentInfo(GetAgentInfo(agent.handle).getNearCpuAgent().handle).cpu_pool; + GetAgentInfo(agent.handle).kernarg_pool = + GetAgentInfo(GetAgentInfo(agent.handle).getNearCpuAgent().handle).kernarg_pool; + } + + rocmtools::queue::CheckPacketReqiurements(gpu_agents); + + gpu_agents.clear(); + numa_node_to_cpu_agent.clear(); + gpu_numa_nodes_near_cpu.clear(); + SetHSALoaderApi(); roctracer::hsa_support::Initialize_roctracer(table); diff --git a/projects/rocprofiler/src/core/hsa/packets/packets_generator.cpp b/projects/rocprofiler/src/core/hsa/packets/packets_generator.cpp index 58ba619879..0cb19013b6 100644 --- a/projects/rocprofiler/src/core/hsa/packets/packets_generator.cpp +++ b/projects/rocprofiler/src/core/hsa/packets/packets_generator.cpp @@ -40,6 +40,7 @@ #include "src/core/hsa/hsa_common.h" #include "src/core/counters/metrics/metrics.h" +#include "src/core/hardware/hsa_info.h" #define ASSERTM(exp, msg) assert(((void)msg, exp)) @@ -58,11 +59,11 @@ namespace Packet { static const size_t MEM_PAGE_BYTES = 0x1000; static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; -hsa_amd_memory_pool_t command_pool; -hsa_amd_memory_pool_t output_pool; +// hsa_amd_memory_pool_t command_pool; +// hsa_amd_memory_pool_t output_pool; -hsa_amd_memory_pool_t& GetCommandPool() { return command_pool; } -hsa_amd_memory_pool_t& GetOutputPool() { return output_pool; } +// hsa_amd_memory_pool_t& GetCommandPool() { return command_pool; } +// hsa_amd_memory_pool_t& GetOutputPool() { return output_pool; } // This function checks to see if the provided // pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true, @@ -111,22 +112,38 @@ hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { return FindGlobalPool(pool, data, true); } -void InitializePools(hsa_agent_t cpu_agent) { +void InitializePools(hsa_agent_t cpu_agent, Agent::AgentInfo* agent_info) { hsa_status_t status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( - cpu_agent, FindStandardPool, &command_pool); + cpu_agent, FindStandardPool, &(agent_info->cpu_pool)); if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Command Buffer Pool is not initialized\n"); status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( - cpu_agent, FindKernArgPool, &output_pool); + cpu_agent, FindKernArgPool, &(agent_info->kernarg_pool)); if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Output Buffer Pool is not initialized\n"); } +void InitializeGPUPool(hsa_agent_t gpu_agent, Agent::AgentInfo* agent_info) { + hsa_status_t status = + hsa_amd_agent_iterate_memory_pools(gpu_agent, FindStandardPool, &(agent_info->gpu_pool)); + CHECK_HSA_STATUS("hsa_amd_agent_iterate_memory_pools(gpu_pool)", status); +} + struct block_des_t { uint32_t id; uint32_t index; }; +std::map metricsDict; + +void CheckPacketReqiurements(std::vector& gpu_agents) { + for (auto& gpu_agent : gpu_agents) { + // get the instance of MetricsDict + Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle); + metricsDict[gpu_agent.handle] = rocmtools::MetricsDict::Create(&agentInfo); + } +} + // Initialize the PM4 commands with having the CPU&GPU agents, the counters, // counters count to output three packets which are start, stop and read // packets @@ -136,8 +153,6 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, hsa_status_t status = HSA_STATUS_SUCCESS; Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle); - // get the instance of MetricsDict - rocmtools::MetricsDict* metricsDict = rocmtools::MetricsDict::Create(&agentInfo); std::map results_map; std::vector events_list; std::vector results_list; @@ -158,9 +173,9 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, counters_count++; } - rocmtools::metrics::ExtractMetricEvents(counter_names, gpu_agent, metricsDict, results_map, - events_list, results_list, event_to_max_block_count, - metrics_counters); + rocmtools::metrics::ExtractMetricEvents(counter_names, gpu_agent, metricsDict[gpu_agent.handle], + results_map, events_list, results_list, + event_to_max_block_count, metrics_counters); // TODO: validate needs to be called on each events_list[i] // Validating the events array for the specified gpu agent @@ -171,8 +186,6 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, throw("Error: Events are not valid for the current gpu agent"); } - // std::cout << "Max Block Counters: " << max_block_counters << std::endl; - std::vector>* profiles = new std::vector< std::pair>(); @@ -197,27 +210,9 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, counter_val_iteration++; block_max_events_count[std::make_pair( static_cast(event->block_name), static_cast(event->block_index))]++; - // std::cout << "Block Name: " << event->block_name << " Block Index: " << - // event->block_index - // << " Current Count: " - // << block_max_events_count[std::make_pair( - // static_cast(event->block_name), - // static_cast(event->block_index))] - // << std::endl; - // std::cout << "Counter Taken: " << event->block_index << ", " << event->counter_id << " " - // << block_max_events_count[std::make_pair( - // static_cast(event->block_name), - // static_cast(event->block_index))] - // << ":" - // << event_to_max_block_count[std::make_pair( - // static_cast(event->block_name), - // static_cast(event->block_index))] - // << std::endl; results_list.erase(result); events_list.erase(event); } else { - // std::cout << "Counter Left: " << event->block_index << ", " << event->counter_id - // << std::endl; event++; result++; } @@ -234,7 +229,7 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, counter_names.end()) { // std::cout << "Counter from Result List: " << result->name << std::endl; counters_taken.insert(result->name); - metric = const_cast(metricsDict->Get(result->name)); + metric = const_cast(metricsDict[gpu_agent.handle]->Get(result->name)); if (metric == nullptr) std::cout << result->name << " not found in metricsDict\n"; context->metrics_list.push_back(metric); } else { @@ -257,7 +252,6 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, } } if (flag) metrics_taken.insert(result.first); - // std::cout << "Metric to be checked from map: " << result.first << std::endl; } } @@ -267,8 +261,6 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, for (auto metric_counter_name : metrics_counters.at(metric_name)) { if (metrics_counters_taken.find(metric_counter_name) == metrics_counters_taken.end() && counters_taken.find(metric_counter_name) == counters_taken.end()) { - // std::cout << metric_counter_name << " for " << metric_name << " is not found!" - // << std::endl; flag = false; continue; } @@ -276,14 +268,15 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, if (flag) { // std::cout << "Counter from Result Map: " << metric_name << std::endl; counters_taken.insert(metric_name); - rocmtools::Metric* metric = const_cast(metricsDict->Get(metric_name)); + rocmtools::Metric* metric = + const_cast(metricsDict[gpu_agent.handle]->Get(metric_name)); if (metric == nullptr) std::cout << metric_name << " not found in metricsDict\n"; context->metrics_list.push_back(metric); } } context->results_map = results_map; - context->metrics_dict = metricsDict; + context->metrics_dict = metricsDict[gpu_agent.handle]; hsa_ven_amd_aqlprofile_parameter_t* params = {}; @@ -323,7 +316,7 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, size_t size = profile->command_buffer.size; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( - command_pool, size, 0, reinterpret_cast(&(profile->command_buffer.ptr))); + agentInfo.cpu_pool, size, 0, reinterpret_cast(&(profile->command_buffer.ptr))); // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { @@ -349,7 +342,7 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, size = profile->output_buffer.size; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( - output_pool, size, 0, reinterpret_cast(&profile->output_buffer.ptr)); + agentInfo.kernarg_pool, size, 0, reinterpret_cast(&profile->output_buffer.ptr)); if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) { printf("Error: Ran out of GPU memory to allocate Output Buffer\n"); @@ -435,20 +428,14 @@ hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets( // Preparing an Getting the size of the command and output buffers status = hsa_ven_amd_aqlprofile_start(profile, NULL); - // Preparing and Initializing bool of buffers for command and output buffers - status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( - cpu_agent, FindStandardPool, &command_pool); - if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Command Buffer Pool is not initialized\n"); - status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( - cpu_agent, FindKernArgPool, &output_pool); - if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Output Buffer Pool is not initialized\n"); + Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle); // Allocating Command Buffer status = HSA_STATUS_ERROR; size_t size = profile->command_buffer.size; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( - command_pool, size, 0, reinterpret_cast(&command_buffer)); + agentInfo.cpu_pool, size, 0, reinterpret_cast(&command_buffer)); // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {gpu_agent}; @@ -463,7 +450,7 @@ hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets( size = profile->output_buffer.size; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( - output_pool, size, 0, reinterpret_cast(&output_buffer)); + agentInfo.kernarg_pool, size, 0, reinterpret_cast(&output_buffer)); // Both the CPU and GPU can access the kernel arguments if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {gpu_agent}; @@ -517,14 +504,14 @@ uint8_t* AllocateLocalMemory(size_t size, hsa_amd_memory_pool_t* gpu_pool) { return ptr; } -hsa_status_t Allocate(hsa_agent_t gpu_agent, hsa_ven_amd_aqlprofile_profile_t* profile, - hsa_amd_memory_pool_t* cpu_pool, hsa_amd_memory_pool_t* gpu_pool) { +hsa_status_t Allocate(hsa_agent_t gpu_agent, hsa_ven_amd_aqlprofile_profile_t* profile) { + Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle); profile->command_buffer.ptr = - AllocateSysMemory(gpu_agent, profile->command_buffer.size, cpu_pool); + AllocateSysMemory(gpu_agent, profile->command_buffer.size, &agentInfo.cpu_pool); profile->output_buffer.size = g_output_buffer_size; profile->output_buffer.ptr = (g_output_buffer_local) - ? AllocateLocalMemory(profile->output_buffer.size, gpu_pool) - : AllocateSysMemory(gpu_agent, profile->output_buffer.size, cpu_pool); + ? AllocateLocalMemory(profile->output_buffer.size, &agentInfo.gpu_pool) + : AllocateSysMemory(gpu_agent, profile->output_buffer.size, &agentInfo.cpu_pool); return (profile->command_buffer.ptr && profile->output_buffer.ptr) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; } @@ -574,49 +561,27 @@ hsa_ven_amd_aqlprofile_profile_t* GenerateATTPackets( hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, std::vector& att_params, packet_t* start_packet, packet_t* stop_packet) { - att_memory_pools_t* att_mem_pools = NULL; - auto it = GetAttMemPoolsMap()->find(gpu_agent.handle); - if (it == GetAttMemPoolsMap()->end()) { - att_mem_pools = new att_memory_pools_t; - - // Allocate memory pools for cpu and gpu - AllocateMemoryPools(cpu_agent, gpu_agent, &att_mem_pools->cpu_mem_pool, - &att_mem_pools->gpu_mem_pool); - - GetAttMemPoolsMap()->emplace(gpu_agent.handle, att_mem_pools); - } else - att_mem_pools = it->second; - #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion-null" // Preparing the profile structure to get the packets hsa_ven_amd_aqlprofile_profile_t* profile = - new hsa_ven_amd_aqlprofile_profile_t{gpu_agent, - HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE, - nullptr, - 0, - &att_params[0], - (uint32_t)att_params.size(), - NULL, - NULL}; + new hsa_ven_amd_aqlprofile_profile_t{gpu_agent, HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE, + nullptr, 0, + &att_params[0], (uint32_t)att_params.size(), + NULL, NULL}; #pragma GCC diagnostic pop // Check the profile buffer sizes hsa_status_t status = hsa_ven_amd_aqlprofile_start(profile, NULL); if (status != HSA_STATUS_SUCCESS) printf("Error: aqlprofile_start(NULL)"); - // // Double output buffer size if concurrent - // if (is_concurrent) profile.output_buffer.size *= 2; - // TODO: create a separate class for memory allocations // Maintain pools per device // handle allocation and resource cleanup - // Allocate command and output buffers // command buffer -> from CPU memory pool // output buffer -> from GPU memory pool - status = - Allocate(gpu_agent, profile, &att_mem_pools->cpu_mem_pool, &att_mem_pools->gpu_mem_pool); + status = Allocate(gpu_agent, profile); if (status != HSA_STATUS_SUCCESS) printf("Error: Allocate()"); // Generate start/stop/read profiling packets diff --git a/projects/rocprofiler/src/core/hsa/packets/packets_generator.h b/projects/rocprofiler/src/core/hsa/packets/packets_generator.h index 29a562b281..df3b2ef0a5 100644 --- a/projects/rocprofiler/src/core/hsa/packets/packets_generator.h +++ b/projects/rocprofiler/src/core/hsa/packets/packets_generator.h @@ -37,8 +37,6 @@ namespace Packet { -static std::mutex pool_lock; - typedef hsa_ext_amd_aql_pm4_packet_t packet_t; std::vector>* @@ -47,7 +45,8 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, uint8_t* AllocateSysMemory(hsa_agent_t gpu_agent, size_t size, hsa_amd_memory_pool_t* cpu_pool); void GetCommandBufferMap(std::map); void GetOutputBufferMap(std::map); -void InitializePools(hsa_agent_t cpu_agent); +void InitializePools(hsa_agent_t cpu_agent, Agent::AgentInfo* agent_info); +void InitializeGPUPool(hsa_agent_t gpu_agent, Agent::AgentInfo* agent_info); hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets( hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, hsa_ven_amd_aqlprofile_event_t* events, uint32_t event_count, packet_t* start_packet, packet_t* stop_packet, packet_t* read_packet); @@ -63,9 +62,10 @@ hsa_ven_amd_aqlprofile_profile_t* GenerateATTPackets( uint8_t* AllocateSysMemory(hsa_agent_t gpu_agent, size_t size, hsa_amd_memory_pool_t* cpu_pool); -void get_command_buffer_map(std::map ); -void get_outbuffer_map(std::map ); +void get_command_buffer_map(std::map); +void get_outbuffer_map(std::map); void initialize_pools(hsa_agent_t cpu_agent); +void CheckPacketReqiurements(std::vector& gpu_agents); typedef struct { hsa_amd_memory_pool_t cpu_mem_pool; diff --git a/projects/rocprofiler/src/core/hsa/queues/queue.cpp b/projects/rocprofiler/src/core/hsa/queues/queue.cpp index 539b771b7d..647599aea9 100644 --- a/projects/rocprofiler/src/core/hsa/queues/queue.cpp +++ b/projects/rocprofiler/src/core/hsa/queues/queue.cpp @@ -27,6 +27,7 @@ #include #include +#include "rocprofiler.h" #include "src/api/rocmtool.h" #include "src/core/hsa/packets/packets_generator.h" #include "src/core/hsa/hsa_support.h" @@ -284,7 +285,7 @@ hsa_status_t attTraceDataCallback(hsa_ven_amd_aqlprofile_info_type_t info_type, att_trace_callback_data_t* passed_data = reinterpret_cast(data); passed_data->push_back(*info_data); // TODO: clear output buffers after copying - // either copy here or in AddattRecord + // either copy here or in ::AddAttRecord return status; } @@ -314,12 +315,10 @@ void AddRecordCounters(rocprofiler_record_profiler_t* record, const pending_sign void AddAttRecord(rocprofiler_record_att_tracer_t* record, hsa_agent_t gpu_agent, att_pending_signal_t& pending) { + Agent::AgentInfo agent_info = hsa_support::GetAgentInfo(gpu_agent.handle); att_trace_callback_data_t data; hsa_ven_amd_aqlprofile_iterate_data(pending.profile, attTraceDataCallback, &data); - // Get CPU and GPU memory pools - Packet::att_memory_pools_t* att_mem_pools = Packet::GetAttMemPools(gpu_agent); - // Allocate memory for shader_engine_data record->shader_engine_data = static_cast( calloc(data.size(), sizeof(rocprofiler_record_se_att_data_t))); @@ -336,7 +335,7 @@ void AddAttRecord(rocprofiler_record_att_tracer_t* record, hsa_agent_t gpu_agent void* buffer = NULL; if (data_size != 0) { // Allocate buffer on CPU to copy out trace data - buffer = Packet::AllocateSysMemory(gpu_agent, data_size, &att_mem_pools->cpu_mem_pool); + buffer = Packet::AllocateSysMemory(gpu_agent, data_size, &agent_info.cpu_pool); if (buffer == NULL) fatal("Trace data buffer allocation failed"); auto status = @@ -353,83 +352,6 @@ void AddAttRecord(rocprofiler_record_att_tracer_t* record, hsa_agent_t gpu_agent record->shader_engine_data_count = data.size(); } -// static const size_t MEM_PAGE_BYTES = 0x1000; -// static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; -// static std::mutex begin_signal_lock; - -// bool BeginSignalHandler(hsa_signal_value_t signal_value, void* data) { -// std::lock_guard lock(begin_signal_lock); -// auto profiling_context = -// static_cast*>( -// data); -// if (!profiling_context->first->begin_completed.load(std::memory_order_relaxed)) { -// std::cout << "BeginSignalHandler is called" << std::endl; -// hsa_status_t status = HSA_STATUS_ERROR; -// size_t size = profiling_context->second->command_buffer.size; -// size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; -// status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( -// Packet::GetCommandPool(), size, 0, -// reinterpret_cast(&(profiling_context->second->command_buffer.ptr))); - -// // Both the CPU and GPU can access the memory -// if (status == HSA_STATUS_SUCCESS) { -// hsa_agent_t ag_list[1] = {profiling_context->first->gpu_agent}; -// status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn( -// 1, ag_list, NULL, profiling_context->second->command_buffer.ptr); - -// if (status != HSA_STATUS_SUCCESS) { -// printf("Error: Can't allow access for both agents to Command Buffer\n"); -// } -// } else if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) { -// printf("Error: Ran out of GPU memory to allocate Command Buffer\n"); -// } else { -// const char* hsa_err_str = NULL; -// if (hsa_status_string(status, &hsa_err_str) != HSA_STATUS_SUCCESS) hsa_err_str = "Unknown"; -// printf("Error: Allocating command Buffer (Size=%lu) (%s)\n", size, hsa_err_str); -// } - -// status = HSA_STATUS_ERROR; -// size = profiling_context->second->output_buffer.size; -// size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; -// status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( -// Packet::GetOutputPool(), size, 0, -// reinterpret_cast(&profiling_context->second->output_buffer.ptr)); - -// if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) { -// printf("Error: Ran out of GPU memory to allocate Output Buffer\n"); -// } - -// if (status == HSA_STATUS_SUCCESS) { -// hsa_agent_t ag_list[1] = {profiling_context->first->gpu_agent}; -// status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn( -// 1, ag_list, NULL, profiling_context->second->output_buffer.ptr); - -// if (status == HSA_STATUS_SUCCESS) { -// memset(profiling_context->second->output_buffer.ptr, 0x0, -// profiling_context->second->output_buffer.size); - -// // Creating the start/stop/read packets -// status = hsa_ven_amd_aqlprofile_start(profiling_context->second, -// profiling_context->first->start_packet); -// status = hsa_ven_amd_aqlprofile_stop(profiling_context->second, -// profiling_context->first->stop_packet); -// status = hsa_ven_amd_aqlprofile_read(profiling_context->second, -// profiling_context->first->read_packet); -// } else { -// printf("Error: Can't allow access for both agents to output Buffer\n"); -// } -// } else { -// const char* hsa_err_str = NULL; -// if (hsa_status_string(status, &hsa_err_str) != HSA_STATUS_SUCCESS) hsa_err_str = "Unknown"; -// printf("Error: Allocating output Buffer (%s)\n", hsa_err_str); -// } - -// profiling_context->first->begin_completed.exchange(true, std::memory_order_relaxed); -// } -// return true; -// } - bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) { auto queue_info_session = static_cast(data); if (!queue_info_session || !GetROCMToolObj() || @@ -647,6 +569,75 @@ template constexpr Integral bit_extract(Integral x, int firs return (x >> first) & bit_mask(0, last - first); } +rocprofiler_session_id_t session_id = rocprofiler_session_id_t{0}; +// Counter Names declaration +std::vector session_data; + +rocprofiler_buffer_id_t buffer_id; + +uint64_t session_data_count = 0; + +bool is_counter_collection_mode = false; +bool is_timestamp_collection_mode = false; +bool is_att_collection_mode = false; +bool is_pc_sampling_collection_mode = false; +std::vector att_parameters_data; +uint32_t replay_mode_count = 0; +std::vector kernel_profile_names; +std::vector att_counters_names; + +rocmtools::Session* session = nullptr; + +void ResetSessionID() { session_id = rocprofiler_session_id_t{0}; } + +void CheckNeededProfileConfigs() { + rocprofiler_session_id_t internal_session_id; + if (GetROCMToolObj()) + // Getting Session ID + internal_session_id = GetROCMToolObj()->GetCurrentSessionId(); + else + internal_session_id = {0}; + + if (session_id.handle == 0 || internal_session_id.handle != session_id.handle) { + session_id = internal_session_id; + // Getting Counters count from the Session + if (session_id.handle > 0 && GetROCMToolObj()) { + session = GetROCMToolObj()->GetSession(session_id); + if (session && session->FindFilterWithKind(ROCPROFILER_COUNTERS_COLLECTION)) { + rocprofiler_filter_id_t filter_id = + session->GetFilterIdWithKind(ROCPROFILER_COUNTERS_COLLECTION); + rocmtools::Filter* filter = session->GetFilter(filter_id); + session_data = filter->GetCounterData(); + is_counter_collection_mode = true; + session_data_count = session_data.size(); + buffer_id = filter->GetBufferId(); + } else if (session && + session->FindFilterWithKind(ROCPROFILER_DISPATCH_TIMESTAMPS_COLLECTION)) { + is_timestamp_collection_mode = true; + rocprofiler_filter_id_t filter_id = + session->GetFilterIdWithKind(ROCPROFILER_DISPATCH_TIMESTAMPS_COLLECTION); + rocmtools::Filter* filter = session->GetFilter(filter_id); + buffer_id = filter->GetBufferId(); + } else if (session && session->FindFilterWithKind(ROCPROFILER_ATT_TRACE_COLLECTION)) { + rocprofiler_filter_id_t filter_id = + session->GetFilterIdWithKind(ROCPROFILER_ATT_TRACE_COLLECTION); + rocmtools::Filter* filter = session->GetFilter(filter_id); + att_parameters_data = filter->GetAttParametersData(); + is_att_collection_mode = true; + buffer_id = + session->GetFilter(session->GetFilterIdWithKind(ROCPROFILER_ATT_TRACE_COLLECTION)) + ->GetBufferId(); + + att_counters_names = filter->GetCounterData(); + kernel_profile_names = std::get>( + filter->GetProperty(ROCPROFILER_FILTER_KERNEL_NAMES)); + } else if (session && session->FindFilterWithKind(ROCPROFILER_PC_SAMPLING_COLLECTION)) { + is_pc_sampling_collection_mode = true; + } + } + } +} + static int KernelInterceptCount = 0; std::atomic WRITER_ID{0}; /** @@ -660,64 +651,8 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt hsa_amd_queue_intercept_packet_writer writer) { const Packet::packet_t* packets_arr = reinterpret_cast(packets); std::vector transformed_packets; - rocprofiler_session_id_t session_id; - if (GetROCMToolObj()) - // Getting Session ID - session_id = GetROCMToolObj()->GetCurrentSessionId(); - else - session_id = {0}; - // Counter Names declaration - std::vector session_data; - - rocprofiler_buffer_id_t buffer_id; - - uint64_t session_data_count = 0; - - bool is_counter_collection_mode = false; - bool is_timestamp_collection_mode = false; - bool is_att_collection_mode = false; - bool is_pc_sampling_collection_mode = false; - std::vector att_parameters_data; - uint32_t replay_mode_count = 0; - std::vector kernel_profile_names; - std::vector att_counters_names; - - rocmtools::Session* session = nullptr; - - // Getting Counters count from the Session - if (session_id.handle > 0 && GetROCMToolObj()) { - session = GetROCMToolObj()->GetSession(session_id); - if (session && session->FindFilterWithKind(ROCPROFILER_COUNTERS_COLLECTION)) { - rocprofiler_filter_id_t filter_id = - session->GetFilterIdWithKind(ROCPROFILER_COUNTERS_COLLECTION); - rocmtools::Filter* filter = session->GetFilter(filter_id); - session_data = filter->GetCounterData(); - is_counter_collection_mode = true; - session_data_count = session_data.size(); - buffer_id = filter->GetBufferId(); - } else if (session && session->FindFilterWithKind(ROCPROFILER_DISPATCH_TIMESTAMPS_COLLECTION)) { - is_timestamp_collection_mode = true; - rocprofiler_filter_id_t filter_id = - session->GetFilterIdWithKind(ROCPROFILER_DISPATCH_TIMESTAMPS_COLLECTION); - rocmtools::Filter* filter = session->GetFilter(filter_id); - buffer_id = filter->GetBufferId(); - } else if (session && session->FindFilterWithKind(ROCPROFILER_ATT_TRACE_COLLECTION)) { - rocprofiler_filter_id_t filter_id = - session->GetFilterIdWithKind(ROCPROFILER_ATT_TRACE_COLLECTION); - rocmtools::Filter* filter = session->GetFilter(filter_id); - att_parameters_data = filter->GetAttParametersData(); - is_att_collection_mode = true; - buffer_id = session->GetFilter(session->GetFilterIdWithKind(ROCPROFILER_ATT_TRACE_COLLECTION)) - ->GetBufferId(); - - att_counters_names = filter->GetCounterData(); - kernel_profile_names = - std::get>(filter->GetProperty(ROCPROFILER_FILTER_KERNEL_NAMES)); - } else if (session && session->FindFilterWithKind(ROCPROFILER_PC_SAMPLING_COLLECTION)) { - is_pc_sampling_collection_mode = true; - } - } + CheckNeededProfileConfigs(); if (session_id.handle > 0 && pkt_count > 0 && (is_counter_collection_mode || is_timestamp_collection_mode || @@ -764,21 +699,6 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt if (session_data_count > 0 && is_counter_collection_mode && profiles && replay_mode_count > 0) { - // hsa_signal_t begin_signal{}; - // CreateSignal(0, &begin_signal); - // hsa_barrier_and_packet_t barrier{0}; - // barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; - // CreateSignal(0, &barrier.completion_signal); - // barrier.dep_signal[0] = hsa_signal_t{}; - // Packet::packet_t* __attribute__((__may_alias__)) pkt = - // (reinterpret_cast(&barrier)); - // transformed_packets.emplace_back(*pkt); - // hsa_status_t status = hsa_support::GetAmdExtTable().hsa_amd_signal_async_handler_fn( - // barrier.completion_signal, HSA_SIGNAL_CONDITION_GTE, 1, BeginSignalHandler, - // &profiles->at(profile_id)); - // if (status != HSA_STATUS_SUCCESS) - // fatal("hsa_amd_signal_async_handler failed for begin signal"); - // Adding start packet and its barrier with a dummy signal hsa_signal_t dummy_signal{}; dummy_signal.handle = 0; @@ -1087,8 +1007,6 @@ Queue::Queue(const hsa_agent_t& cpu_agent, const hsa_agent_t& gpu_agent, uint32_ *queue = intercept_queue_; } -// Queue::~Queue() { std::lock_guard lk(mutex_); } - hsa_queue_t* Queue::GetCurrentInterceptQueue() { return intercept_queue_; } hsa_agent_t Queue::GetGPUAgent() { return gpu_agent_; } @@ -1097,7 +1015,15 @@ hsa_agent_t Queue::GetCPUAgent() { return cpu_agent_; } uint64_t Queue::GetQueueID() { return intercept_queue_->id; } -void InitializePools(hsa_agent_t cpu_agent) { Packet::InitializePools(cpu_agent); } +void InitializePools(hsa_agent_t cpu_agent, Agent::AgentInfo* agent_info) { + Packet::InitializePools(cpu_agent, agent_info); +} +void InitializeGPUPool(hsa_agent_t gpu_agent, Agent::AgentInfo* agent_info) { + Packet::InitializeGPUPool(gpu_agent, agent_info); +} +void CheckPacketReqiurements(std::vector& gpu_agents) { + Packet::CheckPacketReqiurements(gpu_agents); +} } // namespace queue } // namespace rocmtools diff --git a/projects/rocprofiler/src/core/hsa/queues/queue.h b/projects/rocprofiler/src/core/hsa/queues/queue.h index 1b1ee121a0..39aacd78f5 100644 --- a/projects/rocprofiler/src/core/hsa/queues/queue.h +++ b/projects/rocprofiler/src/core/hsa/queues/queue.h @@ -86,7 +86,11 @@ struct queue_info_session_t { void AddRecordCounters(rocprofiler_record_profiler_t* record, const pending_signal_t& pending); -void InitializePools(hsa_agent_t cpu_agent); +void InitializePools(hsa_agent_t cpu_agent, Agent::AgentInfo* agent_info); +void InitializeGPUPool(hsa_agent_t gpu_agent, Agent::AgentInfo* agent_info); +void CheckPacketReqiurements(std::vector& gpu_agents); + +void ResetSessionID(); } // namespace queue } // namespace rocmtools diff --git a/projects/rocprofiler/src/core/session/session.cpp b/projects/rocprofiler/src/core/session/session.cpp index 9b01951913..bcf30b5f77 100644 --- a/projects/rocprofiler/src/core/session/session.cpp +++ b/projects/rocprofiler/src/core/session/session.cpp @@ -47,6 +47,7 @@ Session::~Session() { while (GetCurrentActiveInterruptSignalsCount() > 0) { } if (profiler_started_.load(std::memory_order_release)) { + rocmtools::queue::ResetSessionID(); delete profiler_; profiler_started_.exchange(false, std::memory_order_release); } @@ -77,7 +78,8 @@ void Session::DisableTools(rocprofiler_buffer_id_t buffer_id) { } } if (FindFilterWithKind(ROCPROFILER_API_TRACE) && - GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetBufferId().value == buffer_id.value) { + GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetBufferId().value == + buffer_id.value) { if (tracer_started_.load(std::memory_order_release)) { tracer_->DisableRoctracer(); } @@ -131,9 +133,10 @@ void Session::Start() { GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetTraceData(); if (!tracer_started_.load(std::memory_order_release)) { tracer_ = new tracer::Tracer( - session_id_, (GetFilter( - GetFilterIdWithKind(ROCPROFILER_API_TRACE))->HasCallback() ? GetFilter( - GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetCallback() : nullptr), + session_id_, + (GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->HasCallback() + ? GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetCallback() + : nullptr), GetFilter(GetFilterIdWithKind(ROCPROFILER_API_TRACE))->GetBufferId(), domains); tracer_started_.exchange(true, std::memory_order_release); } @@ -144,7 +147,8 @@ void Session::Start() { if (!pc_sampler_started_.load(std::memory_order_release)) { pc_sampler_ = new pc_sampler::PCSampler( GetFilter(GetFilterIdWithKind(ROCPROFILER_PC_SAMPLING_COLLECTION))->GetBufferId(), - GetFilter(GetFilterIdWithKind(ROCPROFILER_PC_SAMPLING_COLLECTION))->GetId(), session_id_); + GetFilter(GetFilterIdWithKind(ROCPROFILER_PC_SAMPLING_COLLECTION))->GetId(), + session_id_); pc_sampler_started_.exchange(true, std::memory_order_release); } pc_sampler_->Start(); @@ -167,6 +171,7 @@ void Session::Start() { void Session::Terminate() { if (is_active_) { + rocmtools::queue::ResetSessionID(); std::lock_guard lock(session_lock_); if (FindFilterWithKind(ROCPROFILER_SPM_COLLECTION)) { { @@ -214,9 +219,9 @@ pc_sampler::PCSampler* Session::GetPCSampler() { return pc_sampler_; } CountersSampler* Session::GetCountersSampler() { return counters_sampler_; } rocprofiler_filter_id_t Session::CreateFilter(rocprofiler_filter_kind_t filter_kind, - rocprofiler_filter_data_t filter_data, - uint64_t data_count, - rocprofiler_filter_property_t property) { + rocprofiler_filter_data_t filter_data, + uint64_t data_count, + rocprofiler_filter_property_t property) { rocprofiler_filter_id_t id = rocprofiler_filter_id_t{filters_counter_.fetch_add(1, std::memory_order_release)}; { @@ -287,7 +292,7 @@ rocprofiler_filter_id_t Session::GetFilterIdWithKind(rocprofiler_filter_kind_t k bool Session::HasBuffer() { return buffers_.size() > 0; } rocprofiler_buffer_id_t Session::CreateBuffer(rocprofiler_buffer_callback_t buffer_callback, - size_t buffer_size) { + size_t buffer_size) { rocprofiler_buffer_id_t id = rocprofiler_buffer_id_t{buffers_counter_.fetch_add(1, std::memory_order_release)}; { diff --git a/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp b/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp index cd4948ff66..dafdfca8d4 100644 --- a/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp +++ b/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp @@ -489,7 +489,7 @@ uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t } // Allocate memory to pass kernel parameters. -// Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter. +// Memory is allocated accessible for all CPU agents and for GPU given by AgentInfo parameter. // @param agent_info Agent from whose memory region to allocate // @param size Size of memory in terms of bytes // @return uint8_t* Pointer to buffer, null if allocation fails.