SWDEV-403050: Multiple fixes for Memory Leaks in profiler

Change-Id: Ib720a81105af13898ff745ce0cbc2a48c1f4a980
Этот коммит содержится в:
Ammar ELWazir
2023-05-27 09:34:42 +00:00
коммит произвёл Ammar ELWazir
родитель 00ecca25c7
Коммит 08fc21ac31
26 изменённых файлов: 673 добавлений и 592 удалений
+3
Просмотреть файл
@@ -116,6 +116,9 @@ find_package(
PATHS
${ROCM_PATH})
# Locate libnuma; configuration stops with an error if it cannot be found (REQUIRED).
# The keyword for the library name list is NAMES. A bare "NAME" is not a
# find_library keyword and would itself be treated as a candidate library name.
find_library(NUMA NAMES numa REQUIRED)
# Directory-scoped link: every target created after this point links against libnuma.
link_libraries(${NUMA})
get_property(
HSA_RUNTIME_INCLUDE_DIRECTORIES
TARGET hsa-runtime64::hsa-runtime64
-18
Просмотреть файл
@@ -302,24 +302,6 @@ get_pmc_results_txt_path(){
done
}
if [ -n "$COUNTERS_PMC_DIRS" ]; then
COUNTERS_RESULTS_TXT=""
for PMC_DIR in $COUNTERS_PMC_DIRS; do
COUNTERS_RESULTS_TXT="$COUNTERS_RESULTS_TXT $(get_pmc_results_txt_path $PMC_DIR)"
done
CSV_RESULTS=$OUTPUT_PATH_INTERNAL/results.csv
if [ -n "$OUT_FILE_NAME" ]; then
CSV_RESULTS=$OUTPUT_PATH_INTERNAL/$OUT_FILE_NAME
fi
echo "csv results path: $CSV_RESULTS"
export ROCP_MERGE_PIDS=1 #required for tblextr.py to work correctly for counters
if [ $RUN_FROM_BUILD == 1 ]; then
eval "python3 $ROCM_DIR/bin/tblextr.py $CSV_RESULTS $COUNTERS_RESULTS_TXT"
else
eval "python3 $ROCPROFV2_DIR/../libexec/rocprofiler/tblextr.py $CSV_RESULTS $COUNTERS_RESULTS_TXT"
fi
fi
if [ -n "$ATT_PATH" ]; then
if [ -n "$ATT_ARGV" ]; then
eval "python3 $ATT_PATH $ATT_ARGV"
+16 -4
Просмотреть файл
@@ -110,7 +110,7 @@ THE SOFTWARE.
#if !defined(ROCPROFILER)
#if defined(ROCPROFILER_EXPORTS)
#define ROCPROFILER_API ROCPROFILER_EXPORT
#else /* !defined (ROCPROFILER_EXPORTS) */
#else /* !defined (ROCPROFILER_EXPORTS) */
#define ROCPROFILER_API ROCPROFILER_IMPORT
#endif /* !defined (ROCPROFILER_EXPORTS) */
#endif /* !defined (ROCPROFILER) */
@@ -949,6 +949,14 @@ typedef struct {
uint64_t signal_handle;
} rocprofiler_kernel_properties_t;
/**
* Correlation ID: a single 64-bit value wrapped in a struct.
* rocprofiler_record_profiler_t carries one in its correlation_id field.
*/
typedef struct {
uint64_t value;
} rocprofiler_correlation_id_t;
/**
* Profiling record, this will represent all the information reported by the
* profiler regarding kernel dispatches and their counters that were collected
@@ -986,7 +994,11 @@ typedef struct {
* Counters, including identifiers to get counter information and Counters
* values
*/
rocprofiler_record_counter_instance_t* counters;
const rocprofiler_record_counter_instance_t* counters;
/**
* The count of the counters that were collected by the profiler
*/
rocprofiler_record_counters_instances_count_t counters_count; /* Counters Count */
/**
* kernel properties, including the grid size, work group size,
* registers count, wave size and completion signal
@@ -1001,9 +1013,9 @@ typedef struct {
*/
rocprofiler_queue_index_t queue_idx;
/**
* The count of the counters that were collected by the profiler
* Correlation id
*/
rocprofiler_record_counters_instances_count_t counters_count; /* Counters Count */
rocprofiler_correlation_id_t correlation_id;
} rocprofiler_record_profiler_t;
typedef struct {
+1 -2
Просмотреть файл
@@ -233,7 +233,6 @@ class file_plugin_t {
tracer_record.api_data_handle, tracer_record.operation_id, &function_name_c));
}
}
//return;
output_file_t* output_file = get_output_file(output_type_t::TRACER, tracer_record.domain);
*output_file << "Record(" << tracer_record.header.id.handle << "), Domain("
<< GetDomainName(tracer_record.domain) << "),";
@@ -336,7 +335,7 @@ class file_plugin_t {
session_id, ROCPROFILER_COUNTER_NAME, profiler_record->counters[i].counter_handler,
&counter_name_length));
if (counter_name_length > 1) {
const char* name_c = static_cast<const char*>(malloc(name_length * sizeof(char)));
const char* name_c = nullptr;
CHECK_ROCPROFILER(rocprofiler_query_counter_info(
session_id, ROCPROFILER_COUNTER_NAME, profiler_record->counters[i].counter_handler,
&name_c));
+2 -1
Просмотреть файл
@@ -87,7 +87,8 @@ std::string string_printf(const char* format, ...) {
std::string errmsg("ROCProfiler: fatal error: " + message);
fputs(errmsg.c_str(), stderr);
throw(errmsg);
// throw(errmsg);
abort();
}
/* The function extracts the kernel name from
+3 -3
Просмотреть файл
@@ -245,7 +245,7 @@ set_target_properties(rocprofiler-v2 PROPERTIES
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/v2
VERSION ${PROJECT_VERSION}
SOVERSION ${PROJECT_VERSION_MAJOR})
# Add a custom command to copy the v2 library to the build directory as well
add_custom_command(TARGET rocprofiler-v2 POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/v2/librocprofiler64.so.2 ${CMAKE_BINARY_DIR}
@@ -271,10 +271,10 @@ target_include_directories(rocprofiler-v2
if(ASAN)
target_compile_options(rocprofiler-v2 PRIVATE -fsanitize=address)
target_link_options(rocprofiler-v2 PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap -Wl,--no-undefined,-fsanitize=address)
target_link_libraries(rocprofiler-v2 PRIVATE ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64 Threads::Threads atomic asan dl c stdc++ stdc++fs amd_comgr ${PCIACCESS_LIBRARIES})
target_link_libraries(rocprofiler-v2 PRIVATE ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64 Threads::Threads atomic numa asan dl c stdc++ stdc++fs amd_comgr ${PCIACCESS_LIBRARIES})
else()
target_link_options(rocprofiler-v2 PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap -Wl,--no-undefined)
target_link_libraries(rocprofiler-v2 PRIVATE ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64 Threads::Threads atomic dl c stdc++ stdc++fs amd_comgr ${PCIACCESS_LIBRARIES})
target_link_libraries(rocprofiler-v2 PRIVATE ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64 Threads::Threads atomic numa dl c stdc++ stdc++fs amd_comgr ${PCIACCESS_LIBRARIES})
endif()
## Install libraries: Non versioned lib file in dev package
# install(TARGETS rocprofiler-v2 LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT dev)
+2 -1
Просмотреть файл
@@ -207,7 +207,8 @@ class Context {
} catch(...) {
delete obj;
obj = NULL;
throw;
std::cerr << "Error: Context Create failed" << std::endl;
abort();
}
return obj;
}
+24 -21
Просмотреть файл
@@ -46,8 +46,10 @@ hsa_status_t pmcCallback(hsa_ven_amd_aqlprofile_info_type_t info_type,
if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA) {
if (IsEventMatch(info_data->pmc_data.event, (*data_it)->event)) {
uint32_t xcc_index = floor(passed_data->index / passed_data->single_xcc_buff_size);
(*data_it)->xcc_vals[xcc_index] += info_data->pmc_data.result; // stores event result from each xcc separately
(*data_it)->val_double += info_data->pmc_data.result; // stores accumulated event result from all xccs
(*data_it)->xcc_vals[xcc_index] +=
info_data->pmc_data.result; // stores event result from each xcc separately
(*data_it)->val_double +=
info_data->pmc_data.result; // stores accumulated event result from all xccs
}
}
}
@@ -98,19 +100,16 @@ bool metrics::ExtractMetricEvents(
// const Metric* metric = metrics_dict->GetMetricByName(metric_names[i]);
const Metric* metric = metrics_dict->Get(metric_names[i]);
if (metric == nullptr) {
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
fatal("input metric'%s' not supported on this hardware: %s ", metric_names[i].c_str(),
agentInfo.getName().data());
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
fatal("input metric'%s' not supported on this hardware: %s ", metric_names[i].c_str(),
agentInfo.getName().data());
}
// adding result object for derived metric
std::lock_guard<std::mutex> lock(extract_metric_events_lock);
if (results_map.find(metric_names[i]) == results_map.end()) {
results_map[metric_names[i]] = new results_t(metric_names[i], {}, xcc_count);
} // else {
// continue;
// }
}
counters_vec = metric->GetCounters();
if (counters_vec.empty())
@@ -129,7 +128,8 @@ bool metrics::ExtractMetricEvents(
} else {
// result object for base metric
// std::cout << "Metric : " << metric->GetName() << " : " << counter->name << std::endl;
result = new results_t(counter->name, {}, xcc_count); // TODO: set correct initial value
result =
new results_t(counter->name, {}, xcc_count); // TODO: set correct initial value
results_map[counter->name] = result;
}
} else {
@@ -188,7 +188,7 @@ bool metrics::ExtractMetricEvents(
bool metrics::GetCounterData(hsa_ven_amd_aqlprofile_profile_t* profile, hsa_agent_t gpu_agent,
std::vector<results_t*>& results_list) {
uint32_t xcc_count = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle).getXccCount();
uint32_t single_xcc_buff_size = profile->output_buffer.size /(sizeof(uint64_t) * xcc_count);
uint32_t single_xcc_buff_size = profile->output_buffer.size / (sizeof(uint64_t) * xcc_count);
callback_data_t callback_data{&results_list, 0, single_xcc_buff_size};
hsa_status_t status = hsa_ven_amd_aqlprofile_iterate_data(profile, pmcCallback, &callback_data);
return (status == HSA_STATUS_SUCCESS);
@@ -210,16 +210,19 @@ bool metrics::GetMetricsData(std::map<std::string, results_t*>& results_map,
return true;
}
void metrics::GetCountersAndMetricResultsByXcc(uint32_t xcc_index, std::vector<results_t*>& results_list,
std::map<std::string, results_t*>& results_map,
std::vector<const Metric*>& metrics_list){
for(auto it = results_list.begin(); it != results_list.end(); it++){
(*it)->val_double = (*it)->xcc_vals[xcc_index]; // set val_double to hold value for specific xcc
}
void metrics::GetCountersAndMetricResultsByXcc(uint32_t xcc_index,
std::vector<results_t*>& results_list,
std::map<std::string, results_t*>& results_map,
std::vector<const Metric*>& metrics_list) {
for (auto it = results_list.begin(); it != results_list.end(); it++) {
(*it)->val_double =
(*it)->xcc_vals[xcc_index]; // set val_double to hold value for specific xcc
}
for(auto it = results_map.begin(); it != results_map.end(); it++){
it->second->val_double = it->second->xcc_vals[xcc_index]; // set val_double to hold value for specific xcc
}
for (auto it = results_map.begin(); it != results_map.end(); it++) {
it->second->val_double =
it->second->xcc_vals[xcc_index]; // set val_double to hold value for specific xcc
}
GetMetricsData(results_map, metrics_list);
GetMetricsData(results_map, metrics_list);
}
+5 -3
Просмотреть файл
@@ -282,10 +282,12 @@ class MetricsDict {
try {
expr_obj = new xml::Expr(expr_str, new ExprCache(&cache_));
} catch (const xml::exception_t& exc) {
if (do_lookup)
if (do_lookup) {
metrics_list.push_back(node);
else
throw(exc);
} else {
std::cerr << "Error: " << exc.what() << std::endl;
abort();
}
}
if (expr_obj) {
#if 0
+2 -2
Просмотреть файл
@@ -89,7 +89,7 @@ AgentInfo::AgentInfo(const hsa_agent_t agent, ::CoreApiTable* table) : handle_(a
xcc_num_ = 1;
}
int AgentInfo::getIndex() const { return index_; }
uint64_t AgentInfo::getIndex() const { return index_; }
hsa_device_type_t AgentInfo::getType() const { return type_; }
uint64_t AgentInfo::getHandle() const { return handle_; }
const std::string_view AgentInfo::getName() const { return name_; }
@@ -107,7 +107,7 @@ uint32_t AgentInfo::getPCIDomain() const { return pci_domain_; }
uint32_t AgentInfo::getPCILocationID() const { return pci_location_id_; }
uint32_t AgentInfo::getXccCount() const { return xcc_num_; }
void AgentInfo::setIndex(int index) { index_ = index; }
void AgentInfo::setIndex(uint64_t index) { index_ = index; }
void AgentInfo::setType(hsa_device_type_t type) { type_ = type; }
void AgentInfo::setHandle(uint64_t handle) { handle_ = handle; }
// Stores the agent name in the fixed-size name_ buffer.
// The copy is bounded by the buffer size and always NUL-terminated, so a name
// longer than the buffer is truncated instead of writing past the end of name_.
void AgentInfo::setName(const std::string& name) {
const auto copied = name.copy(name_, sizeof(name_) - 1);
name_[copied] = '\0';
}
+3 -3
Просмотреть файл
@@ -43,7 +43,7 @@ class AgentInfo {
AgentInfo();
AgentInfo(const hsa_agent_t agent, ::CoreApiTable* table);
int getIndex() const;
uint64_t getIndex() const;
hsa_device_type_t getType() const;
uint64_t getHandle() const;
const std::string_view getName() const;
@@ -62,7 +62,7 @@ class AgentInfo {
uint32_t getPCILocationID() const;
uint32_t getXccCount() const;
void setIndex(int index);
void setIndex(uint64_t index);
void setType(hsa_device_type_t type);
void setHandle(uint64_t handle);
void setName(const std::string& name);
@@ -78,7 +78,7 @@ class AgentInfo {
hsa_amd_memory_pool_t gpu_pool;
private:
int index_;
uint64_t index_;
hsa_device_type_t type_; // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2
uint64_t handle_;
char name_[64];
+20 -7
Просмотреть файл
@@ -30,26 +30,39 @@ std::mutex agents_map_lock;
std::map<decltype(hsa_agent_t::handle), Agent::AgentInfo> agent_info_map;
Agent::AgentInfo& GetAgentInfo(decltype(hsa_agent_t::handle) handle) {
std::lock_guard<std::mutex> lock(agents_map_lock);
if (agent_info_map.find(handle) != agent_info_map.end())
if (agent_info_map.find(handle) != agent_info_map.end()) {
return agent_info_map.at(handle);
else
throw(std::string("Error: Can't find Agent with handle(") + std::to_string(handle) +
") in this system");
} else {
std::cerr << std::string("Error: Can't find Agent with handle(") << std::to_string(handle) <<
") in this system" << std::endl;
abort();
}
}
std::vector<hsa_agent_t> cpu_agents_list;
// Registers agent_info under the given handle in agent_info_map, holding
// agents_map_lock for the whole call.
// NOTE(review): the handle is appended to cpu_agents_list when the agent type is
// HSA_DEVICE_TYPE_GPU; the list name and GetCPUAgentList() suggest CPU agents were
// intended (HSA_DEVICE_TYPE_CPU) — confirm which of the two is wrong.
void SetAgentInfo(decltype(hsa_agent_t::handle) handle, const Agent::AgentInfo& agent_info) {
std::lock_guard<std::mutex> lock(agents_map_lock);
// map::emplace does not overwrite: if handle is already present the existing entry is kept.
agent_info_map.emplace(handle, agent_info);
if (agent_info.getType() == HSA_DEVICE_TYPE_GPU) {
// Appended on every call, including repeated calls for the same handle.
cpu_agents_list.emplace_back(hsa_agent_t{handle});
}
}
hsa_agent_t GetAgentByIndex(int agent_index) {
// Returns a mutable reference to the global cpu_agents_list.
// NOTE(review): SetAgentInfo appends to this list while holding agents_map_lock, but
// no lock is taken here and the caller keeps the reference afterwards — confirm that
// callers only use it once agent registration has finished.
std::vector<hsa_agent_t>& GetCPUAgentList() {
return cpu_agents_list;
}
hsa_agent_t GetAgentByIndex(uint64_t agent_index) {
std::lock_guard<std::mutex> lock(agents_map_lock);
for (auto& agent_info : agent_info_map) {
if (agent_info.second.getIndex() == agent_index) {
return hsa_agent_t{agent_info.second.getHandle()};
}
}
throw(std::string("Error: Can't find Agent with Index(") + std::to_string(agent_index) +
") in this system");
std::cerr << std::string("Error: Can't find Agent with Index(") << std::to_string(agent_index) <<
") in this system" << std::endl;
abort();
}
CoreApiTable saved_core_api{};
+4 -1
Просмотреть файл
@@ -38,9 +38,12 @@
namespace rocmtools {
namespace hsa_support {
std::vector<hsa_agent_t>& GetCPUAgentList();
Agent::AgentInfo& GetAgentInfo(decltype(hsa_agent_t::handle) handle);
void SetAgentInfo(decltype(hsa_agent_t::handle) handle, const Agent::AgentInfo& agent_info);
hsa_agent_t GetAgentByIndex(int agent_index);
hsa_agent_t GetAgentByIndex(uint64_t agent_index);
CoreApiTable& GetCoreApiTable();
void SetCoreApiTable(const CoreApiTable& table);
+263 -280
Просмотреть файл
@@ -25,8 +25,11 @@
#include <hsa/hsa_ven_amd_aqlprofile.h>
#include <stddef.h>
#include <stdint.h>
#include <numa.h>
#include <algorithm>
#include <atomic>
#include <exception>
#include <iostream>
#include <map>
#include <string>
@@ -47,23 +50,22 @@
#define CHECK_HSA_STATUS(msg, status) \
do { \
if ((status) != HSA_STATUS_INFO_BREAK) { \
const char* emsg = 0; \
hsa_status_string(status, &emsg); \
printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>"); \
if ((status) != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) { \
try { \
const char* emsg = nullptr; \
hsa_status_string(status, &emsg); \
if (!emsg) emsg = "<Unknown HSA Error>"; \
std::cerr << msg << std::endl; \
std::cerr << emsg << std::endl; \
} catch (std::exception & e) { \
} \
abort(); \
} \
} while (0)
namespace Packet {
static const size_t MEM_PAGE_BYTES = 0x1000;
static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1;
// hsa_amd_memory_pool_t command_pool;
// hsa_amd_memory_pool_t output_pool;
// hsa_amd_memory_pool_t& GetCommandPool() { return command_pool; }
// hsa_amd_memory_pool_t& GetOutputPool() { return output_pool; }
// This function checks to see if the provided
// pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true,
@@ -116,11 +118,11 @@ void InitializePools(hsa_agent_t cpu_agent, Agent::AgentInfo* agent_info) {
hsa_status_t status =
rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(
cpu_agent, FindStandardPool, &(agent_info->cpu_pool));
if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Command Buffer Pool is not initialized\n");
CHECK_HSA_STATUS("Error: Command Buffer Pool is not initialized", status);
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(
cpu_agent, FindKernArgPool, &(agent_info->kernarg_pool));
if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Output Buffer Pool is not initialized\n");
CHECK_HSA_STATUS("Error: Output Buffer Pool is not initialized", status);
}
void InitializeGPUPool(hsa_agent_t gpu_agent, Agent::AgentInfo* agent_info) {
@@ -135,6 +137,7 @@ struct block_des_t {
};
std::map<uint32_t, rocmtools::MetricsDict*> metricsDict;
static std::atomic<bool> counters_added{false};
void CheckPacketReqiurements(std::vector<hsa_agent_t>& gpu_agents) {
for (auto& gpu_agent : gpu_agents) {
@@ -147,253 +150,240 @@ void CheckPacketReqiurements(std::vector<hsa_agent_t>& gpu_agents) {
// Initialize the PM4 commands with having the CPU&GPU agents, the counters,
// counters count to output three packets which are start, stop and read
// packets
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>*
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>
InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
std::vector<std::string>& counter_names, bool is_spm) {
hsa_status_t status = HSA_STATUS_SUCCESS;
if (!counters_added.load(std::memory_order_acquire)) {
for (auto& name : counter_names) {
if (rocmtools::GetROCMToolObj()->HasActiveSession()) {
rocmtools::GetROCMToolObj()
->GetSession(rocmtools::GetROCMToolObj()->GetCurrentSessionId())
->GetProfiler()
->AddCounterName(name);
}
}
counters_added.exchange(true, std::memory_order_release);
}
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
std::map<std::string, rocmtools::results_t*> results_map;
std::vector<rocmtools::event_t> events_list;
std::vector<rocmtools::results_t*> results_list;
std::map<std::pair<uint32_t, uint32_t>, uint64_t> event_to_max_block_count;
std::map<std::string, std::set<std::string>> metrics_counters;
uint32_t counters_count = 0;
for (auto& name : counter_names) {
// std::cout << "Counter from Counter Names: " << name << std::endl;
if (rocmtools::GetROCMToolObj()->HasActiveSession()) {
rocmtools::GetROCMToolObj()
->GetSession(rocmtools::GetROCMToolObj()->GetCurrentSessionId())
->GetProfiler()
->AddCounterName(name);
}
counters_count++;
if (!rocmtools::metrics::ExtractMetricEvents(
counter_names, gpu_agent, metricsDict[gpu_agent.handle], results_map, events_list,
results_list, event_to_max_block_count, metrics_counters)) {
std::cerr << "Error: Failed to extract metric events" << std::endl;
abort();
}
rocmtools::metrics::ExtractMetricEvents(counter_names, gpu_agent, metricsDict[gpu_agent.handle],
results_map, events_list, results_list,
event_to_max_block_count, metrics_counters);
// TODO: validate needs to be called on each events_list[i]
// Validating the events array for the specified gpu agent
bool result;
hsa_ven_amd_aqlprofile_validate_event(gpu_agent, &events_list[0], &result);
if (!result) {
printf("Error: Events are not valid for the current gpu agent\n");
throw("Error: Events are not valid for the current gpu agent");
bool validate_event_result;
status =
hsa_ven_amd_aqlprofile_validate_event(gpu_agent, &events_list[0], &validate_event_result);
CHECK_HSA_STATUS("Error: Validating Counters", status);
if (!validate_event_result) {
std::cerr << "Error: Events are not valid for the current gpu agent" << std::endl;
abort();
}
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>*
profiles = new std::vector<
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>
profiles = std::vector<
std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>();
do {
rocmtools::profiling_context_t* context = new rocmtools::profiling_context_t();
context->gpu_agent = gpu_agent;
uint64_t i = 0;
uint32_t counter_val_iteration = 0;
auto result = results_list.begin();
std::map<std::pair<uint32_t, uint32_t>, uint32_t> block_max_events_count;
std::set<hsa_ven_amd_aqlprofile_block_name_t> block_names_taken;
for (auto event = events_list.begin(); event != events_list.end();) {
if (block_max_events_count[std::make_pair<uint32_t, uint32_t>(
static_cast<uint32_t>(event->block_name),
static_cast<uint32_t>(event->block_index))] <
event_to_max_block_count[std::make_pair<uint32_t, uint32_t>(
static_cast<uint32_t>(event->block_name),
static_cast<uint32_t>(event->block_index))]) {
context->events_list.push_back(*event);
context->results_list.emplace_back(*result);
counter_val_iteration++;
block_max_events_count[std::make_pair<uint32_t, uint32_t>(
static_cast<uint32_t>(event->block_name), static_cast<uint32_t>(event->block_index))]++;
results_list.erase(result);
events_list.erase(event);
} else {
event++;
result++;
}
i++;
}
std::set<std::string> counters_taken;
std::set<std::string> metrics_counters_taken;
for (auto result : context->results_list) {
rocmtools::Metric* metric;
if (std::find(counter_names.begin(), counter_names.end(), result->name) !=
counter_names.end()) {
// std::cout << "Counter from Result List: " << result->name << std::endl;
counters_taken.insert(result->name);
metric = const_cast<rocmtools::Metric*>(metricsDict[gpu_agent.handle]->Get(result->name));
if (metric == nullptr) std::cout << result->name << " not found in metricsDict\n";
context->metrics_list.push_back(metric);
} else {
metrics_counters_taken.insert(result->name);
// std::cout << "Counter Added: " << result->name << std::endl;
}
}
std::set<std::string> metrics_taken;
for (auto result : results_map) {
if (counters_taken.find(result.first) == counters_taken.end() &&
std::find(counter_names.begin(), counter_names.end(), result.first) !=
counter_names.end()) {
bool flag = true;
for (auto result_basic : results_list) {
if (result_basic->name.compare(result.first)) {
flag = false;
break;
}
}
if (flag) metrics_taken.insert(result.first);
}
}
for (auto metric_name : metrics_taken) {
bool flag = true;
if (metrics_counters.find(metric_name) == metrics_counters.end()) continue;
for (auto metric_counter_name : metrics_counters.at(metric_name)) {
if (metrics_counters_taken.find(metric_counter_name) == metrics_counters_taken.end() &&
counters_taken.find(metric_counter_name) == counters_taken.end()) {
flag = false;
continue;
}
}
if (flag) {
// std::cout << "Counter from Result Map: " << metric_name << std::endl;
counters_taken.insert(metric_name);
rocmtools::Metric* metric =
const_cast<rocmtools::Metric*>(metricsDict[gpu_agent.handle]->Get(metric_name));
if (metric == nullptr) std::cout << metric_name << " not found in metricsDict\n";
context->metrics_list.push_back(metric);
}
}
context->results_map = results_map;
context->metrics_dict = metricsDict[gpu_agent.handle];
hsa_ven_amd_aqlprofile_parameter_t* params = {};
packet_t* start_packet = new packet_t();
packet_t* stop_packet = new packet_t();
packet_t* read_packet = new packet_t();
if (context->events_list.size() <= 0) {
continue;
}
// Preparing the profile structure to get the packets
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion-null"
hsa_ven_amd_aqlprofile_event_type_t profile_type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC;
if (is_spm) profile_type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE;
hsa_ven_amd_aqlprofile_profile_t* profile =
new hsa_ven_amd_aqlprofile_profile_t{gpu_agent,
profile_type,
&(context->events_list[0]),
static_cast<uint32_t>(context->events_list.size()),
params,
0,
NULL,
NULL};
#pragma GCC diagnostic pop
// Preparing and getting the size of the command and output buffers
status = hsa_ven_amd_aqlprofile_start(profile, NULL);
if (status != HSA_STATUS_SUCCESS) {
const char* hsa_err_str = nullptr;
if (hsa_status_string(status, &hsa_err_str) != HSA_STATUS_SUCCESS) hsa_err_str = "Unknown";
printf("Error: %s\n", hsa_err_str);
continue;
// do {
rocmtools::profiling_context_t* context = new rocmtools::profiling_context_t();
context->gpu_agent = gpu_agent;
auto result = results_list.begin();
std::map<std::pair<uint32_t, uint32_t>, uint32_t> block_max_events_count;
std::set<hsa_ven_amd_aqlprofile_block_name_t> block_names_taken;
for (auto event = events_list.begin(); event != events_list.end();) {
if (block_max_events_count[std::make_pair<uint32_t, uint32_t>(
static_cast<uint32_t>(event->block_name), static_cast<uint32_t>(event->block_index))] <
event_to_max_block_count[std::make_pair<uint32_t, uint32_t>(
static_cast<uint32_t>(event->block_name), static_cast<uint32_t>(event->block_index))]) {
context->events_list.push_back(*event);
context->results_list.emplace_back(*result);
block_max_events_count[std::make_pair<uint32_t, uint32_t>(
static_cast<uint32_t>(event->block_name), static_cast<uint32_t>(event->block_index))]++;
results_list.erase(result);
events_list.erase(event);
} else {
status = HSA_STATUS_ERROR;
size_t size = profile->command_buffer.size;
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
agentInfo.cpu_pool, size, 0, reinterpret_cast<void**>(&(profile->command_buffer.ptr)));
event++;
result++;
}
}
// Both the CPU and GPU can access the memory
if (status == HSA_STATUS_SUCCESS) {
hsa_agent_t ag_list[1] = {gpu_agent};
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
1, ag_list, NULL, profile->command_buffer.ptr);
std::set<std::string> counters_taken;
if (status != HSA_STATUS_SUCCESS) {
printf("Error: Can't allow access for both agents to Command Buffer\n");
continue;
} else if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
printf("Error: Ran out of GPU memory to allocate Command Buffer\n");
continue;
std::set<std::string> metrics_counters_taken;
for (auto result : context->results_list) {
rocmtools::Metric* metric;
if (std::find(counter_names.begin(), counter_names.end(), result->name) !=
counter_names.end()) {
// std::cout << "Counter from Result List: " << result->name << std::endl;
counters_taken.insert(result->name);
metric = const_cast<rocmtools::Metric*>(metricsDict[gpu_agent.handle]->Get(result->name));
if (metric == nullptr) std::cout << result->name << " not found in metricsDict\n";
context->metrics_list.push_back(metric);
} else {
metrics_counters_taken.insert(result->name);
// std::cout << "Counter Added: " << result->name << std::endl;
}
}
std::set<std::string> metrics_taken;
for (auto result : results_map) {
if (counters_taken.find(result.first) == counters_taken.end() &&
std::find(counter_names.begin(), counter_names.end(), result.first) !=
counter_names.end()) {
bool flag = true;
for (auto result_basic : results_list) {
if (result_basic->name.compare(result.first)) {
flag = false;
break;
}
} else {
const char* hsa_err_str = NULL;
if (hsa_status_string(status, &hsa_err_str) != HSA_STATUS_SUCCESS) hsa_err_str = "Unknown";
printf("Error: Allocating command Buffer (Size=%lu) (%s)\n", size, hsa_err_str);
}
if (flag) metrics_taken.insert(result.first);
}
}
if (!is_spm) {
status = HSA_STATUS_ERROR;
size = profile->output_buffer.size;
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
agentInfo.kernarg_pool, size, 0, reinterpret_cast<void**>(&profile->output_buffer.ptr));
if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
printf("Error: Ran out of GPU memory to allocate Output Buffer\n");
continue;
}
if (status == HSA_STATUS_SUCCESS) {
hsa_agent_t ag_list[1] = {gpu_agent};
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
1, ag_list, NULL, profile->output_buffer.ptr);
if (status == HSA_STATUS_SUCCESS) {
memset(profile->output_buffer.ptr, 0x0, profile->output_buffer.size);
// Creating the start/stop/read packets
status = hsa_ven_amd_aqlprofile_start(profile, start_packet);
status = hsa_ven_amd_aqlprofile_stop(profile, stop_packet);
status = hsa_ven_amd_aqlprofile_read(profile, read_packet);
context->start_packet = start_packet;
context->stop_packet = stop_packet;
context->read_packet = read_packet;
// add profiles
profiles->emplace_back(std::make_pair(context, profile));
} else {
printf("Error: Can't allow access for both agents to output Buffer\n");
continue;
}
} else {
const char* hsa_err_str = NULL;
if (hsa_status_string(status, &hsa_err_str) != HSA_STATUS_SUCCESS)
hsa_err_str = "Unknown";
printf("Error: Allocating output Buffer (%s)\n", hsa_err_str);
continue;
}
} else {
profile->output_buffer.size = 0;
status = hsa_ven_amd_aqlprofile_start(profile, start_packet);
status = hsa_ven_amd_aqlprofile_stop(profile, stop_packet);
status = hsa_ven_amd_aqlprofile_read(profile, read_packet);
context->start_packet = start_packet;
context->stop_packet = stop_packet;
context->read_packet = read_packet;
// add profiles
profiles->emplace_back(std::make_pair(context, profile));
for (auto metric_name : metrics_taken) {
bool flag = true;
if (metrics_counters.find(metric_name) == metrics_counters.end()) continue;
for (auto metric_counter_name : metrics_counters.at(metric_name)) {
if (metrics_counters_taken.find(metric_counter_name) == metrics_counters_taken.end() &&
counters_taken.find(metric_counter_name) == counters_taken.end()) {
flag = false;
continue;
}
}
} while (events_list.size() > 0);
if (flag) {
// std::cout << "Counter from Result Map: " << metric_name << std::endl;
counters_taken.insert(metric_name);
rocmtools::Metric* metric =
const_cast<rocmtools::Metric*>(metricsDict[gpu_agent.handle]->Get(metric_name));
if (metric == nullptr) std::cout << metric_name << " not found in metricsDict\n";
context->metrics_list.push_back(metric);
}
}
context->results_map = results_map;
context->metrics_dict = metricsDict[gpu_agent.handle];
hsa_ven_amd_aqlprofile_parameter_t* params = {};
packet_t* start_packet = new packet_t();
packet_t* stop_packet = new packet_t();
packet_t* read_packet = new packet_t();
if (context->events_list.size() <= 0) {
std::cerr << "Error: No events to profile" << std::endl;
abort();
}
// Preparing the profile structure to get the packets
hsa_ven_amd_aqlprofile_event_type_t profile_type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC;
if (is_spm) profile_type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE;
hsa_ven_amd_aqlprofile_profile_t* profile =
new hsa_ven_amd_aqlprofile_profile_t{gpu_agent,
profile_type,
&(context->events_list[0]),
static_cast<uint32_t>(context->events_list.size()),
params,
0,
0,
0};
size_t ag_list_count = 1; // rocmtools::hsa_support::GetCPUAgentList().size();
hsa_agent_t ag_list[ag_list_count];
ag_list[0] = gpu_agent;
// Preparing and getting the size of the command and output buffers
status = hsa_ven_amd_aqlprofile_start(profile, NULL);
// CHECK_HSA_STATUS("Error: Getting Buffers Size", status);
if (profile->command_buffer.size > 0 && profile->output_buffer.size > 0) {
status = HSA_STATUS_ERROR;
size_t size = profile->command_buffer.size;
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
if (size <= 0) {
std::cerr << __FILE__ << ":" << __LINE__ << " "
<< "Error: Command buffer given size is " << size << std::endl;
abort();
}
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
agentInfo.cpu_pool, size, 0, reinterpret_cast<void**>(&(profile->command_buffer.ptr)));
if (status != HSA_STATUS_SUCCESS) {
profile->command_buffer.ptr = malloc(size);
/*numa_alloc_onnode(
size,
rocmtools::hsa_support::GetAgentInfo(agentInfo.getNearCpuAgent().handle).getNumaNode());*/
if (profile->command_buffer.ptr == NULL) {
std::cerr << __FILE__ << ":" << __LINE__ << " "
<< "Error: allocating memory for command buffer using NUMA" << std::endl;
abort();
}
} else {
// Both the CPU and GPU can access the memory
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
ag_list_count, ag_list, NULL, profile->command_buffer.ptr);
CHECK_HSA_STATUS("Error: Allowing access to Command Buffer", status);
}
if (!is_spm) {
status = HSA_STATUS_ERROR;
size_t size = profile->output_buffer.size;
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
if (size <= 0) {
std::cerr << __FILE__ << ":" << __LINE__ << " "
<< "Error: Output buffer given size is " << size << std::endl;
abort();
}
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
agentInfo.kernarg_pool, size, 0, reinterpret_cast<void**>(&profile->output_buffer.ptr));
if (status != HSA_STATUS_SUCCESS) {
profile->output_buffer.ptr = malloc(size);
/*numa_alloc_onnode(
size,
rocmtools::hsa_support::GetAgentInfo(agentInfo.getNearCpuAgent().handle)
.getNumaNode());*/
if (profile->output_buffer.ptr == NULL) {
std::cerr << __FILE__ << ":" << __LINE__ << " "
<< "Error: allocating memory for output buffer using NUMA" << std::endl;
abort();
}
} else {
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
ag_list_count, ag_list, NULL, profile->output_buffer.ptr);
CHECK_HSA_STATUS("Error: GPU Agent can't have output buffer access", status);
memset(profile->output_buffer.ptr, 0x0, profile->output_buffer.size);
}
} else {
profile->output_buffer.size = 0;
}
status = hsa_ven_amd_aqlprofile_start(profile, start_packet);
// CHECK_HSA_STATUS("Error: Creating Start Packet\n", status);
status = hsa_ven_amd_aqlprofile_stop(profile, stop_packet);
// CHECK_HSA_STATUS("Error: Creating Stop Packet\n", status);
status = hsa_ven_amd_aqlprofile_read(profile, read_packet);
// CHECK_HSA_STATUS("Error: Creating Read Packet\n", status);
context->start_packet = start_packet;
context->stop_packet = stop_packet;
context->read_packet = read_packet;
// add profiles
profiles.emplace_back(std::make_pair(context, profile));
}
// } while (events_list.size() > 0);
return profiles;
}
@@ -407,67 +397,71 @@ hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets(
// Validating the events array for the specified gpu agent
bool result;
hsa_ven_amd_aqlprofile_validate_event(gpu_agent, events, &result);
if (!result) {
printf("Error: Events are not valid for the current gpu agent\n");
throw("Error: Events are not valid for the current gpu agent");
}
status = hsa_ven_amd_aqlprofile_validate_event(gpu_agent, events, &result);
CHECK_HSA_STATUS("Error: Events are not valid for the current gpu agent\n", status);
hsa_ven_amd_aqlprofile_parameter_t* params = {};
uint8_t* command_buffer = nullptr;
uint8_t* output_buffer = nullptr;
// Preparing the profile structure to get the packets
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion-null"
// Preparing the profile structure to get the packets
hsa_ven_amd_aqlprofile_profile_t* profile = new hsa_ven_amd_aqlprofile_profile_t{
gpu_agent, HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC, events, event_count, params, 0, NULL, NULL};
#pragma GCC diagnostic pop
gpu_agent, HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC, events, event_count, params, 0, 0, 0};
// Preparing and getting the size of the command and output buffers
status = hsa_ven_amd_aqlprofile_start(profile, NULL);
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
size_t ag_list_count = 1;
hsa_agent_t ag_list[ag_list_count];
ag_list[0] = gpu_agent;
// Allocating Command Buffer
status = HSA_STATUS_ERROR;
size_t size = profile->command_buffer.size;
profile->command_buffer.ptr = nullptr;
if (size <= 0) return nullptr;
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
agentInfo.cpu_pool, size, 0, reinterpret_cast<void**>(&command_buffer));
agentInfo.cpu_pool, size, 0, reinterpret_cast<void**>(&(profile->command_buffer.ptr)));
// Both the CPU and GPU can access the memory
if (status == HSA_STATUS_SUCCESS) {
hsa_agent_t ag_list[1] = {gpu_agent};
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
1, ag_list, NULL, command_buffer);
ag_list_count, ag_list, NULL, profile->command_buffer.ptr);
CHECK_HSA_STATUS("Error: GPU Agent can't have command buffer access", status);
} else {
profile->command_buffer.ptr = numa_alloc_onnode(
profile->command_buffer.size,
rocmtools::hsa_support::GetAgentInfo(agentInfo.getNearCpuAgent().handle).getNumaNode());
if (profile->command_buffer.ptr != nullptr) {
status = HSA_STATUS_SUCCESS;
} else {
CHECK_HSA_STATUS("Error: Allocating Command Buffer", status);
}
}
profile->command_buffer.ptr = (status == HSA_STATUS_SUCCESS) ? command_buffer : nullptr;
if (status != HSA_STATUS_SUCCESS) printf("Error: Allocating Command Buffer\n");
// Allocating Output Buffer
status = HSA_STATUS_ERROR;
size = profile->output_buffer.size;
profile->output_buffer.ptr = nullptr;
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
agentInfo.kernarg_pool, size, 0, reinterpret_cast<void**>(&output_buffer));
agentInfo.gpu_pool, size, 0, reinterpret_cast<void**>(&(profile->output_buffer.ptr)));
CHECK_HSA_STATUS("Error: Can't Allocate Output Buffer", status);
// Both the CPU and GPU can access the kernel arguments
if (status == HSA_STATUS_SUCCESS) {
hsa_agent_t ag_list[1] = {gpu_agent};
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
1, ag_list, NULL, output_buffer);
}
if (status == HSA_STATUS_SUCCESS) {
profile->output_buffer.ptr = output_buffer;
ag_list_count, ag_list, NULL, profile->output_buffer.ptr);
CHECK_HSA_STATUS("Error: Can't allow access on the Output Buffer for the GPU", status);
memset(profile->output_buffer.ptr, 0x0, profile->output_buffer.size);
} else {
profile->output_buffer.ptr = nullptr;
}
// Creating the start/stop/read packets
status = hsa_ven_amd_aqlprofile_start(profile, start_packet);
CHECK_HSA_STATUS("Error: Creating Start Packet\n", status);
status = hsa_ven_amd_aqlprofile_stop(profile, stop_packet);
CHECK_HSA_STATUS("Error: Creating Stop Packet\n", status);
status = hsa_ven_amd_aqlprofile_read(profile, read_packet);
CHECK_HSA_STATUS("Error: Creating Read Packet\n", status);
if (status == HSA_STATUS_ERROR) return nullptr;
return profile;
@@ -479,17 +473,19 @@ bool g_output_buffer_local = true;
// Allocate system memory accessible by both CPU and GPU.
//
// The requested size is rounded up to a whole number of pages, allocated from
// *cpu_pool through the HSA AMD extension table, and then made accessible to
// gpu_agent.
//
// Returns the buffer on success, or nullptr when cpu_pool is null, the
// allocation fails, or access cannot be granted to the GPU agent.
uint8_t* AllocateSysMemory(hsa_agent_t gpu_agent, size_t size, hsa_amd_memory_pool_t* cpu_pool) {
  if (cpu_pool == nullptr) return nullptr;

  size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;

  // Allocate exactly once: a second allocation into the same pointer would
  // overwrite (and leak) the first buffer.
  uint8_t* buffer = nullptr;
  hsa_status_t status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
      *cpu_pool, size, 0, reinterpret_cast<void**>(&buffer));
  if (status != HSA_STATUS_SUCCESS || buffer == nullptr) return nullptr;

  // Both the CPU and GPU can access the memory
  hsa_agent_t ag_list[1] = {gpu_agent};
  status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
      1, ag_list, nullptr, buffer);
  if (status != HSA_STATUS_SUCCESS) {
    // The caller only ever sees nullptr on failure, so the buffer must be
    // released here or it is leaked.
    rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_free_fn(buffer);
    return nullptr;
  }
  return buffer;
}
@@ -542,18 +538,6 @@ att_mem_pools_map_t* GetAttMemPoolsMap() {
return agent_att_mem_pools_map;
}
// Returns the ATT memory pools registered under gpu_agent.handle in the map
// provided by GetAttMemPoolsMap(). When no entry exists, an error is printed
// and nullptr is returned.
att_memory_pools_t* GetAttMemPools(hsa_agent_t gpu_agent) {
  att_mem_pools_map_t* pools_by_agent = GetAttMemPoolsMap();
  auto entry = pools_by_agent->find(gpu_agent.handle);
  if (entry == pools_by_agent->end()) {
    printf("Error: att_memory_pools_t instance not found for given gpu agent handle: %lu\n",
           gpu_agent.handle);
    return nullptr;
  }
  return entry->second;
}
// Generate start and stop packets for collecting ATT traces
// Also generate and return the profile object which has the PM4
// command buffer and the output buffer for retrieving the traces
@@ -573,7 +557,7 @@ hsa_ven_amd_aqlprofile_profile_t* GenerateATTPackets(
// Check the profile buffer sizes
hsa_status_t status = hsa_ven_amd_aqlprofile_start(profile, NULL);
if (status != HSA_STATUS_SUCCESS) printf("Error: aqlprofile_start(NULL)");
CHECK_HSA_STATUS("Error: Getting PM4 Start Packet", status);
// TODO: create a separate class for memory allocations
// Maintain pools per device
// handle allocation and resource cleanup
@@ -582,14 +566,13 @@ hsa_ven_amd_aqlprofile_profile_t* GenerateATTPackets(
// command buffer -> from CPU memory pool
// output buffer -> from GPU memory pool
status = Allocate(gpu_agent, profile);
if (status != HSA_STATUS_SUCCESS) printf("Error: Allocate()");
CHECK_HSA_STATUS("Error: Att Buffers Allocation", status);
// Generate start/stop/read profiling packets
status = hsa_ven_amd_aqlprofile_start(profile, start_packet);
if (status != HSA_STATUS_SUCCESS) printf("Error: aqlprofile_start");
CHECK_HSA_STATUS("Error: Creating Start PM4 Packet", status);
status = hsa_ven_amd_aqlprofile_stop(profile, stop_packet);
if (status != HSA_STATUS_SUCCESS) printf("Error: aqlprofile_stop");
if (status == HSA_STATUS_ERROR) return nullptr;
CHECK_HSA_STATUS("Error: Creating Stop PM4 Packet", status);
return profile;
}
+1 -1
Просмотреть файл
@@ -39,7 +39,7 @@ namespace Packet {
typedef hsa_ext_amd_aql_pm4_packet_t packet_t;
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>*
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>
InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
std::vector<std::string>& counter_names, bool is_spm = false);
uint8_t* AllocateSysMemory(hsa_agent_t gpu_agent, size_t size, hsa_amd_memory_pool_t* cpu_pool);
+196 -173
Просмотреть файл
@@ -26,6 +26,7 @@
#include <vector>
#include <utility>
#include <algorithm>
#include <numa.h>
#include "rocprofiler.h"
#include "src/api/rocmtool.h"
@@ -33,6 +34,21 @@
#include "src/core/hsa/hsa_support.h"
#include "src/utils/helper.h"
// Aborts the process when `status` is neither HSA_STATUS_SUCCESS nor
// HSA_STATUS_INFO_BREAK, after printing `msg` and the HSA runtime's textual
// description of the status to std::cerr.
//
// `status` is evaluated exactly once, so it is safe to pass an expression with
// side effects (for example a function call) as the argument.
#define CHECK_HSA_STATUS(msg, status)                                                  \
  do {                                                                                 \
    /* Capture once: the value is used in two comparisons and one call. */             \
    const hsa_status_t check_hsa_status_value_ = (status);                             \
    if (check_hsa_status_value_ != HSA_STATUS_SUCCESS &&                               \
        check_hsa_status_value_ != HSA_STATUS_INFO_BREAK) {                            \
      try {                                                                            \
        const char* check_hsa_status_text_ = nullptr;                                  \
        hsa_status_string(check_hsa_status_value_, &check_hsa_status_text_);           \
        if (!check_hsa_status_text_) check_hsa_status_text_ = "<Unknown HSA Error>";   \
        std::cerr << (msg) << std::endl;                                               \
        std::cerr << check_hsa_status_text_ << std::endl;                              \
      } catch (const std::exception&) {                                                \
        /* Reporting is best effort; the process is aborted either way. */             \
      }                                                                                \
      abort();                                                                         \
    }                                                                                  \
  } while (0)
#define __NR_gettid 186
#define MAX_ATT_PROFILES 16
@@ -290,27 +306,41 @@ hsa_status_t attTraceDataCallback(hsa_ven_amd_aqlprofile_info_type_t info_type,
return status;
}
// Fills record->counters and record->counters_count with one entry per metric
// in the pending signal's context->metrics_list and, in the pointer-taking
// revision, submits the record to the session buffer named by
// pending->buffer_id when the session has such a buffer.
//
// NOTE(review): this span interleaves two revisions of the function -- a
// reference-taking one that stages the counters in a std::vector and copies
// them into a malloc'd array, and a pointer-taking one that fills a malloc'd
// array directly. It looks like residue of a rendered diff; confirm which
// revision is intended before compiling.
void AddRecordCounters(rocprofiler_record_profiler_t* record, const pending_signal_t& pending) {
std::vector<rocprofiler_record_counter_instance_t> counters_vec;
for (size_t i = 0; i < pending.context->metrics_list.size(); i++) {
const rocmtools::Metric* metric = pending.context->metrics_list[i];
void AddRecordCounters(rocprofiler_record_profiler_t* record, const pending_signal_t* pending) {
record->counters_count =
rocprofiler_record_counters_instances_count_t{pending->context->metrics_list.size()};
size_t counters_list_size =
record->counters_count.value * sizeof(rocprofiler_record_counter_instance_t);
// NOTE(review): the malloc result is used below without a null check.
rocprofiler_record_counter_instance_t* counters =
static_cast<rocprofiler_record_counter_instance_t*>(malloc(counters_list_size));
for (size_t i = 0; i < pending->context->metrics_list.size(); i++) {
const rocmtools::Metric* metric = pending->context->metrics_list[i];
// A metric with no entry in results_map is reported with value 0.
double value = 0;
std::string metric_name = metric->GetName();
auto it = pending.context->results_map.find(metric_name);
if (it != pending.context->results_map.end()) {
auto it = pending->context->results_map.find(metric_name);
if (it != pending->context->results_map.end()) {
value = it->second->val_double;
}
counters_vec.emplace_back(rocprofiler_record_counter_instance_t{
counters[i] = (rocprofiler_record_counter_instance_t{
// TODO(aelwazir): Moving to span once C++20 is adopted, strdup can be
// removed after that
rocprofiler_counter_id_t{rocmtools::profiler::GetCounterID(metric_name)},
rocprofiler_record_counter_value_t{value}});
}
record->counters = static_cast<rocprofiler_record_counter_instance_t*>(
malloc(counters_vec.size() * sizeof(rocprofiler_record_counter_instance_t)));
::memcpy(record->counters, &(counters_vec)[0],
counters_vec.size() * sizeof(rocprofiler_record_counter_instance_t));
record->counters_count = rocprofiler_record_counters_instances_count_t{counters_vec.size()};
record->counters = counters;
rocmtools::Session* session = GetROCMToolObj()->GetSession(pending->session_id);
// Remember the malloc'd array so the callback below can recognize it.
void* initial_handle = const_cast<rocprofiler_record_counter_instance_t*>(record->counters);
// NOTE(review): when FindBuffer() returns false nothing in this function frees
// the malloc'd array -- presumably the caller owns it in that case; verify.
if (session->FindBuffer(pending->buffer_id)) {
Memory::GenericBuffer* buffer = session->GetBuffer(pending->buffer_id);
buffer->AddRecord(*record, record->counters, counters_list_size,
[initial_handle](auto& record, const void* data) {
// Free the malloc'd array only while the record still points at it and
// the pointer passed in is a different one; then repoint the record at
// the passed-in data.
if (record.counters == initial_handle && data != initial_handle) {
free(initial_handle);
}
record.counters =
static_cast<const rocprofiler_record_counter_instance_t*>(data);
});
}
}
void AddAttRecord(rocprofiler_record_att_tracer_t* record, hsa_agent_t gpu_agent,
@@ -330,7 +360,6 @@ void AddAttRecord(rocprofiler_record_att_tracer_t* record, hsa_agent_t gpu_agent
for (trace_data_it = data.begin(); trace_data_it != data.end(); trace_data_it++) {
const void* data_ptr = trace_data_it->trace_data.ptr;
const uint32_t data_size = trace_data_it->trace_data.size;
// fprintf(arg->file, " SE(%u) size(%u)\n", data.sample_id, data_size);
void* buffer = NULL;
if (data_size != 0) {
@@ -359,23 +388,22 @@ bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) {
!GetROCMToolObj()->GetSession(queue_info_session->session_id)->GetProfiler())
return true;
rocmtools::Session* session = GetROCMToolObj()->GetSession(queue_info_session->session_id);
std::lock_guard<std::mutex> lock(session->GetSessionLock());
rocmtools::profiler::Profiler* profiler = session->GetProfiler();
std::vector<pending_signal_t>& pending_signals = const_cast<std::vector<pending_signal_t>&>(
std::vector<pending_signal_t*> pending_signals = const_cast<std::vector<pending_signal_t*>&>(
profiler->GetPendingSignals(queue_info_session->writer_id));
if (!pending_signals.empty()) {
for (auto it = pending_signals.begin(); it != pending_signals.end();
it = pending_signals.erase(it)) {
auto& pending = *it;
std::lock_guard<std::mutex> lock(session->GetSessionLock());
if (hsa_support::GetCoreApiTable().hsa_signal_load_relaxed_fn(pending.signal)) return true;
if (hsa_support::GetCoreApiTable().hsa_signal_load_relaxed_fn(pending->signal)) return true;
hsa_amd_profiling_dispatch_time_t time;
hsa_support::GetAmdExtTable().hsa_amd_profiling_get_dispatch_time_fn(
queue_info_session->agent, pending.signal, &time);
queue_info_session->agent, pending->signal, &time);
uint32_t record_count = 1;
bool is_individual_xcc_mode = false;
uint32_t xcc_count =
hsa_support::GetAgentInfo(queue_info_session->agent.handle).getXccCount();
uint32_t xcc_count = queue_info_session->xcc_count;
if (xcc_count > 1) { // for MI300
const char* str = getenv("ROCPROFILER_INDIVIDUAL_XCC_MODE");
if (str != NULL) is_individual_xcc_mode = (atol(str) > 0);
@@ -387,70 +415,62 @@ bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) {
rocprofiler_record_profiler_t record{};
// TODO: (sauverma) gpu-id will need to support xcc like so- 1.1, 1.2, 1.3 ... 1.5 for
// different xcc
record.gpu_id = rocprofiler_agent_id_t{
(uint64_t)hsa_support::GetAgentInfo(queue_info_session->agent.handle).getIndex()};
record.kernel_properties = pending.kernel_properties;
record.thread_id = rocprofiler_thread_id_t{pending.thread_id};
record.queue_idx = rocprofiler_queue_index_t{pending.queue_index};
record.gpu_id = rocprofiler_agent_id_t{(uint64_t)queue_info_session->gpu_index};
record.kernel_properties = pending->kernel_properties;
record.thread_id = rocprofiler_thread_id_t{pending->thread_id};
record.queue_idx = rocprofiler_queue_index_t{pending->queue_index};
record.timestamps = rocprofiler_record_header_timestamp_t{time.start, time.end};
record.queue_id = rocprofiler_queue_id_t{queue_info_session->queue_id};
if (pending.counters_count > 0 && pending.context->metrics_list.size() > 0 &&
pending.profile) {
if (xcc_id == 0) // call to GetCounterData() is required only once for a dispatch
rocmtools::metrics::GetCounterData(pending.profile, queue_info_session->agent,
pending.context->results_list);
if (is_individual_xcc_mode)
rocmtools::metrics::GetCountersAndMetricResultsByXcc(
xcc_id, pending.context->results_list, pending.context->results_map,
pending.context->metrics_list);
else
rocmtools::metrics::GetMetricsData(pending.context->results_map,
pending.context->metrics_list);
AddRecordCounters(&record, pending);
}
// Kernel Descriptor is the right record id generated in the WriteInterceptor function and
// will be used to handle the kernel name of that dispatch
record.header = {ROCPROFILER_PROFILER_RECORD,
rocprofiler_record_id_t{pending.kernel_descriptor}};
record.kernel_id = rocprofiler_kernel_id_t{pending.kernel_descriptor};
record.header = rocprofiler_record_header_t{
ROCPROFILER_PROFILER_RECORD, rocprofiler_record_id_t{pending->kernel_descriptor}};
record.kernel_id = rocprofiler_kernel_id_t{pending->kernel_descriptor};
record.correlation_id = rocprofiler_correlation_id_t{pending->correlation_id};
if (pending.session_id.handle == 0) {
pending.session_id = GetROCMToolObj()->GetCurrentSessionId();
if (pending->session_id.handle == 0) {
pending->session_id = GetROCMToolObj()->GetCurrentSessionId();
}
if (session->FindBuffer(pending.buffer_id)) {
Memory::GenericBuffer* buffer = session->GetBuffer(pending.buffer_id);
if (pending.profile && pending.counters_count > 0) {
rocprofiler_record_counter_instance_t* record_counters = record.counters;
buffer->AddRecord(
record, record.counters,
(record.counters_count.value * (sizeof(rocprofiler_record_counter_instance_t) + 1)),
[](auto& record, const void* data) {
record.counters = const_cast<rocprofiler_record_counter_instance_t*>(
static_cast<const rocprofiler_record_counter_instance_t*>(data));
});
free(record_counters);
} else {
if (pending->counters_count > 0 && pending->context->metrics_list.size() > 0 &&
pending->profile) {
if (xcc_id == 0) // call to GetCounterData() is required only once for a dispatch
rocmtools::metrics::GetCounterData(pending->profile, queue_info_session->agent,
pending->context->results_list);
if (is_individual_xcc_mode)
rocmtools::metrics::GetCountersAndMetricResultsByXcc(
xcc_id, pending->context->results_list, pending->context->results_map,
pending->context->metrics_list);
else
rocmtools::metrics::GetMetricsData(pending->context->results_map,
pending->context->metrics_list);
AddRecordCounters(&record, pending);
} else {
if (session->FindBuffer(pending->buffer_id)) {
Memory::GenericBuffer* buffer = session->GetBuffer(pending->buffer_id);
buffer->AddRecord(record);
}
}
}
if (pending.counters_count > 0 && pending.profile && pending.profile->events) {
if (pending->counters_count > 0 && pending->profile && pending->profile->events) {
// TODO(aelwazir): we need a better way of distributing events and free them
// free(const_cast<hsa_ven_amd_aqlprofile_event_t*>(pending.profile->events));
// if (pending->profile->output_buffer.ptr)
// numa_free(pending->profile->output_buffer.ptr, pending->profile->output_buffer.size);
hsa_status_t status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_free_fn(
(pending.profile->output_buffer.ptr));
if (status != HSA_STATUS_SUCCESS) {
printf("Error: Couldn't free output buffer memory\n");
}
(pending->profile->output_buffer.ptr));
CHECK_HSA_STATUS("Error: Couldn't free output buffer memory", status);
// if (pending->profile->command_buffer.ptr)
// numa_free(pending->profile->command_buffer.ptr, pending->profile->command_buffer.size);
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_free_fn(
(pending.profile->command_buffer.ptr));
if (status != HSA_STATUS_SUCCESS) {
printf("Error: Couldn't free command buffer memory\n");
(pending->profile->command_buffer.ptr));
CHECK_HSA_STATUS("Error: Couldn't free command buffer memory", status);
delete pending->profile;
for (auto& it : pending->context->results_map) {
delete it.second;
}
delete pending.profile;
delete pending->context;
}
if (pending.signal.handle)
hsa_support::GetCoreApiTable().hsa_signal_destroy_fn(pending.signal);
if (pending->signal.handle)
hsa_support::GetCoreApiTable().hsa_signal_destroy_fn(pending->signal);
if (queue_info_session->interrupt_signal.handle)
hsa_support::GetCoreApiTable().hsa_signal_destroy_fn(queue_info_session->interrupt_signal);
}
@@ -483,8 +503,7 @@ bool AsyncSignalHandlerATT(hsa_signal_value_t /* signal */, void* data) {
if (hsa_support::GetCoreApiTable().hsa_signal_load_relaxed_fn(pending.signal)) return true;
rocprofiler_record_att_tracer_t record{};
record.kernel_id = rocprofiler_kernel_id_t{pending.kernel_descriptor};
record.gpu_id = rocprofiler_agent_id_t{
(uint64_t)hsa_support::GetAgentInfo(queue_info_session->agent.handle).getIndex()};
record.gpu_id = rocprofiler_agent_id_t{(uint64_t)queue_info_session->gpu_index};
record.kernel_properties = pending.kernel_properties;
record.thread_id = rocprofiler_thread_id_t{pending.thread_id};
record.queue_idx = rocprofiler_queue_index_t{pending.queue_index};
@@ -506,14 +525,10 @@ bool AsyncSignalHandlerATT(hsa_signal_value_t /* signal */, void* data) {
}
hsa_status_t status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_free_fn(
(pending.profile->output_buffer.ptr));
if (status != HSA_STATUS_SUCCESS) {
printf("Error: Couldn't free output buffer memory\n");
}
CHECK_HSA_STATUS("Error: Couldn't free output buffer memory", status);
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_free_fn(
(pending.profile->command_buffer.ptr));
if (status != HSA_STATUS_SUCCESS) {
printf("Error: Couldn't free command buffer memory\n");
}
CHECK_HSA_STATUS("Error: Couldn't free command buffer memory", status);
delete pending.profile;
}
}
@@ -540,19 +555,19 @@ void AddVendorSpecificPacket(const Packet::packet_t* packet,
// Registers AsyncSignalHandler to be invoked with `data` once `signal`
// satisfies the condition "value == 0". A registration failure is reported
// through CHECK_HSA_STATUS, which prints the HSA error text and aborts.
void SignalAsyncHandler(const hsa_signal_t& signal, void* data) {
  hsa_status_t status = hsa_support::GetAmdExtTable().hsa_amd_signal_async_handler_fn(
      signal, HSA_SIGNAL_CONDITION_EQ, 0, AsyncSignalHandler, data);
  // Single check: a preceding fatal() on the same status would pre-empt the
  // HSA error-string report.
  CHECK_HSA_STATUS("Error: hsa_amd_signal_async_handler failed", status);
}
// Registers AsyncSignalHandlerATT to be invoked with `data` once `signal`
// satisfies the condition "value == 0". A registration failure is reported
// through CHECK_HSA_STATUS, which prints the HSA error text and aborts.
void signalAsyncHandlerATT(const hsa_signal_t& signal, void* data) {
  hsa_status_t status = hsa_support::GetAmdExtTable().hsa_amd_signal_async_handler_fn(
      signal, HSA_SIGNAL_CONDITION_EQ, 0, AsyncSignalHandlerATT, data);
  // Single check: a preceding fatal() on the same status would pre-empt the
  // HSA error-string report.
  CHECK_HSA_STATUS("Error: hsa_amd_signal_async_handler for ATT failed", status);
}
// Creates an HSA signal with initial value 1, no explicit consumer list, and
// the given attribute flags, storing the handle in *signal. A creation failure
// is reported through CHECK_HSA_STATUS, which prints the HSA error text and
// aborts.
void CreateSignal(uint32_t attribute, hsa_signal_t* signal) {
  hsa_status_t status =
      hsa_support::GetAmdExtTable().hsa_amd_signal_create_fn(1, 0, nullptr, attribute, signal);
  // Single check: a preceding fatal() on the same status would pre-empt the
  // HSA error-string report.
  CHECK_HSA_STATUS("Error: hsa_amd_signal_create failed", status);
}
template <typename Integral = uint64_t> constexpr Integral bit_mask(int first, int last) {
@@ -659,13 +674,13 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt
is_pc_sampling_collection_mode) &&
session) {
// Getting Queue Data and Information
auto& queue_info = *static_cast<Queue*>(data);
Queue& queue_info = *reinterpret_cast<Queue*>(data);
std::lock_guard<std::mutex> lk(queue_info.qw_mutex);
// hsa_ven_amd_aqlprofile_profile_t* profile;
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>*
profiles = nullptr;
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>
profiles;
// Searching across all the packets given during this write
@@ -686,106 +701,109 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt
// Get the PM4 Packets using packets_generator
profiles = Packet::InitializeAqlPackets(queue_info.GetCPUAgent(), queue_info.GetGPUAgent(),
session_data);
replay_mode_count = profiles->size();
replay_mode_count = profiles.size();
}
uint32_t profile_id = 0;
hsa_signal_t interrupt_signal;
do {
std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*> profile;
if (profiles && replay_mode_count > 0) profile = profiles->at(profile_id);
// do {
std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*> profile;
if (profiles.size() > 0 && replay_mode_count > 0) profile = profiles.at(profile_id);
uint32_t writer_id = WRITER_ID.fetch_add(1, std::memory_order_release);
uint32_t writer_id = WRITER_ID.fetch_add(1, std::memory_order_release);
if (session_data_count > 0 && is_counter_collection_mode && profiles &&
replay_mode_count > 0) {
// Adding start packet and its barrier with a dummy signal
hsa_signal_t dummy_signal{};
dummy_signal.handle = 0;
profile.first->start_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
<< HSA_PACKET_HEADER_TYPE;
AddVendorSpecificPacket(profile.first->start_packet, &transformed_packets, dummy_signal);
if (session_data_count > 0 && is_counter_collection_mode && profiles.size() > 0 &&
replay_mode_count > 0) {
// Adding start packet and its barrier with a dummy signal
hsa_signal_t dummy_signal{};
dummy_signal.handle = 0;
profile.first->start_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
<< HSA_PACKET_HEADER_TYPE;
AddVendorSpecificPacket(profile.first->start_packet, &transformed_packets, dummy_signal);
CreateBarrierPacket(profile.first->start_packet->completion_signal, &transformed_packets);
}
CreateBarrierPacket(profile.first->start_packet->completion_signal, &transformed_packets);
}
auto& packet = transformed_packets.emplace_back(packets_arr[i]);
auto& dispatch_packet = reinterpret_cast<hsa_kernel_dispatch_packet_t&>(packet);
auto& packet = transformed_packets.emplace_back(packets_arr[i]);
auto& dispatch_packet = reinterpret_cast<hsa_kernel_dispatch_packet_t&>(packet);
/*
* Only PC sampling relies on this right now, so it would be better to
* only generate an ID if PC sampling is active to conserve IDs, but it's
* unlikely 64 bits' worth of identifiers will be exhausted during the
* lifetime of the ROCMToolObj.
*/
dispatch_packet.reserved2 = GetROCMToolObj()->GetUniqueKernelDispatchId();
/*
* Only PC sampling relies on this right now, so it would be better to
* only generate an ID if PC sampling is active to conserve IDs, but it's
* unlikely 64 bits' worth of identifiers will be exhausted during the
* lifetime of the ROCMToolObj.
*/
uint64_t correlation_id = dispatch_packet.reserved2;
// dispatch_packet.reserved2 = GetROCMToolObj()->GetUniqueKernelDispatchId();
CreateSignal(HSA_AMD_SIGNAL_AMD_GPU_ONLY, &packet.completion_signal);
// Adding the dispatch packet newly created signal to the pending signals
// list to be processed by the signal interrupt
rocprofiler_kernel_properties_t kernel_properties =
set_kernel_properties(dispatch_packet, queue_info.GetGPUAgent());
if (session) {
uint64_t record_id = GetROCMToolObj()->GetUniqueRecordId();
AddKernelNameWithDispatchID(GetKernelNameFromKsymbols(dispatch_packet.kernel_object),
record_id);
if (profiles && replay_mode_count > 0) {
session->GetProfiler()->AddPendingSignals(
writer_id, record_id, dispatch_packet.completion_signal, session_id, buffer_id,
profile.first, profile.first->metrics_list.size(), profile.second,
kernel_properties, (uint32_t)syscall(__NR_gettid), user_pkt_index);
} else {
session->GetProfiler()->AddPendingSignals(
writer_id, record_id, dispatch_packet.completion_signal, session_id, buffer_id,
nullptr, 0, nullptr, kernel_properties, (uint32_t)syscall(__NR_gettid),
user_pkt_index);
}
}
// Make a copy of the original packet, adding its signal to a barrier
// packet and create a new signal for it to get timestamps
if (original_packet.completion_signal.handle) {
hsa_barrier_and_packet_t barrier{0};
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
Packet::packet_t* __attribute__((__may_alias__)) pkt =
(reinterpret_cast<Packet::packet_t*>(&barrier));
transformed_packets.emplace_back(*pkt).completion_signal =
original_packet.completion_signal;
}
// Adding a barrier packet with the original packet's completion signal.
CreateSignal(0, &interrupt_signal);
// Adding Stop and Read PM4 Packets
if (session_data_count > 0 && is_counter_collection_mode && profiles &&
replay_mode_count > 0) {
hsa_signal_t dummy_signal{};
profile.first->stop_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
<< HSA_PACKET_HEADER_TYPE;
AddVendorSpecificPacket(profile.first->stop_packet, &transformed_packets, dummy_signal);
profile.first->read_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
<< HSA_PACKET_HEADER_TYPE;
AddVendorSpecificPacket(profile.first->read_packet, &transformed_packets,
interrupt_signal);
// Added Interrupt Signal with barrier and provided handler for it
CreateBarrierPacket(interrupt_signal, &transformed_packets);
CreateSignal(HSA_AMD_SIGNAL_AMD_GPU_ONLY, &packet.completion_signal);
// Adding the dispatch packet newly created signal to the pending signals
// list to be processed by the signal interrupt
rocprofiler_kernel_properties_t kernel_properties =
set_kernel_properties(dispatch_packet, queue_info.GetGPUAgent());
if (session) {
uint64_t record_id = GetROCMToolObj()->GetUniqueRecordId();
AddKernelNameWithDispatchID(GetKernelNameFromKsymbols(dispatch_packet.kernel_object),
record_id);
if (profiles.size() > 0 && replay_mode_count > 0) {
session->GetProfiler()->AddPendingSignals(
writer_id, record_id, dispatch_packet.completion_signal, session_id, buffer_id,
profile.first, profile.first->metrics_list.size(), profile.second, kernel_properties,
(uint32_t)syscall(__NR_gettid), user_pkt_index, correlation_id);
} else {
hsa_barrier_and_packet_t barrier{0};
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
barrier.completion_signal = interrupt_signal;
Packet::packet_t* __attribute__((__may_alias__)) pkt =
(reinterpret_cast<Packet::packet_t*>(&barrier));
transformed_packets.emplace_back(*pkt);
session->GetProfiler()->AddPendingSignals(
writer_id, record_id, dispatch_packet.completion_signal, session_id, buffer_id,
nullptr, 0, nullptr, kernel_properties, (uint32_t)syscall(__NR_gettid),
user_pkt_index, correlation_id);
}
// Creating Async Handler to be called every time the interrupt signal is
// marked complete
SignalAsyncHandler(interrupt_signal,
new queue_info_session_t{queue_info.GetGPUAgent(), session_id,
queue_info.GetQueueID(), writer_id});
ACTIVE_INTERRUPT_SIGNAL_COUNT.fetch_add(1, std::memory_order_relaxed);
profile_id++;
} while (replay_mode_count > 0 && profile_id < replay_mode_count); // Profiles loop end
}
// Make a copy of the original packet, adding its signal to a barrier
// packet and create a new signal for it to get timestamps
if (original_packet.completion_signal.handle) {
hsa_barrier_and_packet_t barrier{0};
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
Packet::packet_t* __attribute__((__may_alias__)) pkt =
(reinterpret_cast<Packet::packet_t*>(&barrier));
transformed_packets.emplace_back(*pkt).completion_signal =
original_packet.completion_signal;
}
hsa_signal_t interrupt_signal{};
// Adding a barrier packet with the original packet's completion signal.
CreateSignal(0, &interrupt_signal);
// Adding Stop and Read PM4 Packets
if (session_data_count > 0 && is_counter_collection_mode) {
hsa_signal_t dummy_signal{};
profile.first->stop_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
<< HSA_PACKET_HEADER_TYPE;
AddVendorSpecificPacket(profile.first->stop_packet, &transformed_packets, dummy_signal);
profile.first->read_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
<< HSA_PACKET_HEADER_TYPE;
AddVendorSpecificPacket(profile.first->read_packet, &transformed_packets, interrupt_signal);
// Added Interrupt Signal with barrier and provided handler for it
CreateBarrierPacket(interrupt_signal, &transformed_packets);
} else {
hsa_barrier_and_packet_t barrier{0};
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
barrier.completion_signal = interrupt_signal;
Packet::packet_t* __attribute__((__may_alias__)) pkt =
(reinterpret_cast<Packet::packet_t*>(&barrier));
transformed_packets.emplace_back(*pkt);
}
Agent::AgentInfo& agentInfo =
rocmtools::hsa_support::GetAgentInfo(queue_info.GetGPUAgent().handle);
// Creating Async Handler to be called every time the interrupt signal is
// marked complete
SignalAsyncHandler(
interrupt_signal,
new queue_info_session_t{queue_info.GetGPUAgent(), session_id, queue_info.GetQueueID(),
writer_id, interrupt_signal, agentInfo.getIndex(),
agentInfo.getXccCount()});
ACTIVE_INTERRUPT_SIGNAL_COUNT.fetch_add(1, std::memory_order_relaxed);
// profile_id++;
// } while (replay_mode_count > 0 && profile_id < replay_mode_count); // Profiles loop end
}
/* Write the transformed packets to the hardware queue. */
writer(&transformed_packets[0], transformed_packets.size());
@@ -795,7 +813,7 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt
// Getting Queue Data and Information
auto& queue_info = *static_cast<Queue*>(data);
std::lock_guard<std::mutex> lk(queue_info.qw_mutex);
Agent::AgentInfo* agentInfo = &(hsa_support::GetAgentInfo(queue_info.GetGPUAgent().handle));
Agent::AgentInfo agentInfo = hsa_support::GetAgentInfo(queue_info.GetGPUAgent().handle);
bool can_profile_anypacket = false;
std::vector<bool> can_profile_packet;
@@ -858,7 +876,7 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt
}
if (att_counters_names.size() > 0) {
MetricsDict* metrics_dict_ = MetricsDict::Create(agentInfo);
MetricsDict* metrics_dict_ = MetricsDict::Create(&agentInfo);
for (const std::string& counter_name : att_counters_names) {
const Metric* metric = metrics_dict_->Get(counter_name);
@@ -1007,6 +1025,11 @@ Queue::Queue(const hsa_agent_t& cpu_agent, const hsa_agent_t& gpu_agent, uint32_
*queue = intercept_queue_;
}
// Destructor: does not return until ACTIVE_INTERRUPT_SIGNAL_COUNT is no
// longer positive. The counter is bumped with fetch_add(1) each time an
// async handler is registered for an interrupt signal in this file.
// NOTE(review): presumably the handler decrements it once it has run --
// confirm. This is a pure spin (no sleep/yield), so it burns a core while
// waiting.
Queue::~Queue() {
  for (;;) {
    if (!(ACTIVE_INTERRUPT_SIGNAL_COUNT.load(std::memory_order_acquire) > 0)) break;
  }
}
// Accessor: hands back the intercept_queue_ pointer stored in this Queue.
hsa_queue_t* Queue::GetCurrentInterceptQueue() {
  hsa_queue_t* const current_queue = intercept_queue_;
  return current_queue;
}
// Accessor: returns (by value) the gpu_agent_ handle stored in this Queue.
hsa_agent_t Queue::GetGPUAgent() {
  return gpu_agent_;
}
+3 -1
Просмотреть файл
@@ -56,7 +56,7 @@ class Queue {
hsa_queue_type32_t type,
void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data,
uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t** queue);
~Queue() {}
~Queue();
hsa_queue_t* GetCurrentInterceptQueue();
hsa_agent_t GetGPUAgent();
@@ -82,6 +82,8 @@ struct queue_info_session_t {
uint64_t queue_id;
uint32_t writer_id;
hsa_signal_t interrupt_signal;
uint64_t gpu_index;
uint32_t xcc_count;
};
void AddRecordCounters(rocprofiler_record_profiler_t* record, const pending_signal_t& pending);
+3 -2
Просмотреть файл
@@ -23,6 +23,7 @@
#include <algorithm>
#include <atomic>
#include "rocprofiler.h"
#include "src/api/rocmtool.h"
namespace Memory {
@@ -68,8 +69,8 @@ GenericBuffer::GenericBuffer(rocprofiler_session_id_t session_id, rocprofiler_bu
GenericBuffer::~GenericBuffer() {
if (is_valid_.load(std::memory_order_release)) {
std::lock_guard lock(buffer_lock_);
if (rocmtools::GetROCMToolObj()->GetSession(session_id_))
rocmtools::GetROCMToolObj()->GetSession(session_id_)->DisableTools(id_);
// if (rocmtools::GetROCMToolObj()->GetSession(session_id_))
// rocmtools::GetROCMToolObj()->GetSession(session_id_)->DisableTools(id_);
Flush();
+3 -3
Просмотреть файл
@@ -75,8 +75,7 @@ class GenericBuffer {
}
// Store data in the record. Copy the data first if it fits in the buffer
// (reserve_data_size != 0).
if (reserve_data_size) {
if (reserve_data_size != 0) {
data_ptr_ -= data_size;
::memcpy(data_ptr_, data, data_size);
store_data(record, data_ptr_);
@@ -160,7 +159,8 @@ class GenericBuffer {
std::mutex buffer_lock_;
};
bool GetNextRecord(const rocprofiler_record_header_t* record, const rocprofiler_record_header_t** next);
bool GetNextRecord(const rocprofiler_record_header_t* record,
const rocprofiler_record_header_t** next);
} // namespace Memory
#endif // SRC_CORE_MEMORY_GENERIC_BUFFER_H_
+5 -3
Просмотреть файл
@@ -281,10 +281,12 @@ class MetricsDict {
try {
expr_obj = new xml::Expr(expr_str, new ExprCache(&cache_));
} catch (const xml::exception_t& exc) {
if (do_lookup)
if (do_lookup) {
metrics_list.push_back(node);
else
throw(exc);
} else {
std::cerr << "Error: " << exc.what() << std::endl;
abort();
}
}
if (expr_obj) {
#if 0
+29 -20
Просмотреть файл
@@ -50,8 +50,17 @@ uint64_t GetCounterID(std::string& counter_name) {
Profiler::Profiler(rocprofiler_buffer_id_t buffer_id, rocprofiler_filter_id_t filter_id,
rocprofiler_session_id_t session_id)
: buffer_id_(buffer_id), filter_id_(filter_id), session_id_(session_id) {}
Profiler::~Profiler() {}
: buffer_id_(buffer_id), filter_id_(filter_id), session_id_(session_id) {
sessions_pending_signals_ = new std::map<uint32_t, std::vector<pending_signal_t*>>();
}
// Destructor: releases every pending_signal_t held in
// sessions_pending_signals_, then the map object itself (both are
// heap-allocated: the map in the constructor, the entries in
// AddPendingSignals). The map is keyed by writer id; only the mapped
// vectors matter here, so the key is not bound to a name.
// NOTE(review): sessions_pending_signals_lock_ is not taken here -- confirm
// no other thread can still reach this Profiler during destruction.
Profiler::~Profiler() {
  for (auto& writer_entry : *sessions_pending_signals_) {
    auto& signals_for_writer = writer_entry.second;
    for (auto& signal_ptr : signals_for_writer) delete signal_ptr;
  }
  delete sessions_pending_signals_;
}
void Profiler::AddCounterName(rocprofiler_counter_id_t counter_id, std::string counter_name) {
std::lock_guard<std::mutex> lock(counter_names_lock_);
@@ -106,39 +115,39 @@ const char* Profiler::GetCounterInfo(rocprofiler_counter_info_kind_t kind,
return nullptr;
}
void Profiler::StartReplayPass(rocprofiler_session_id_t session_id) { warning("Not yet supported!"); }
// Placeholder: replay passes are not implemented. The session_id argument is
// ignored and the call only emits a warning.
void Profiler::StartReplayPass(rocprofiler_session_id_t session_id) {
  // Stub body -- nothing is started.
  warning("Not yet supported!");
  return;
}
// Placeholder: replay passes are not implemented; the call only emits a
// warning.
void Profiler::EndReplayPass() {
  warning("Not yet supported!");
}
// Placeholder: replay passes are not implemented. Emits a warning and then
// unconditionally reports true.
bool Profiler::HasActivePass() {
  constexpr bool kStubResult = true;  // fixed answer until the feature exists
  warning("Not yet supported!");
  return kStubResult;
}
void Profiler::AddPendingSignals(uint32_t writer_id, uint64_t kernel_object,
const hsa_signal_t& completion_signal,
rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id,
rocmtools::profiling_context_t* context,
uint64_t session_data_count,
hsa_ven_amd_aqlprofile_profile_t* profile,
rocprofiler_kernel_properties_t kernel_properties,
uint32_t thread_id, uint64_t queue_index) {
void Profiler::AddPendingSignals(
uint32_t writer_id, uint64_t kernel_object, const hsa_signal_t& completion_signal,
rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id,
rocmtools::profiling_context_t* context, uint64_t session_data_count,
hsa_ven_amd_aqlprofile_profile_t* profile, rocprofiler_kernel_properties_t kernel_properties,
uint32_t thread_id, uint64_t queue_index, uint64_t correlation_id) {
std::lock_guard<std::mutex> lock(sessions_pending_signals_lock_);
if (sessions_pending_signals_.find(writer_id) == sessions_pending_signals_.end())
sessions_pending_signals_.emplace(writer_id, std::vector<pending_signal_t>());
sessions_pending_signals_.at(writer_id).emplace_back(
pending_signal_t{kernel_object, completion_signal, session_id_, buffer_id, context,
session_data_count, profile, kernel_properties, thread_id, queue_index});
if (sessions_pending_signals_->find(writer_id) == sessions_pending_signals_->end())
sessions_pending_signals_->emplace(writer_id, std::vector<pending_signal_t*>());
sessions_pending_signals_->at(writer_id).emplace_back(new pending_signal_t{
kernel_object, completion_signal, session_id_, buffer_id, context, session_data_count,
profile, kernel_properties, thread_id, queue_index, correlation_id});
}
const std::vector<pending_signal_t>& Profiler::GetPendingSignals(uint32_t writer_id) {
const std::vector<pending_signal_t*>& Profiler::GetPendingSignals(uint32_t writer_id) {
std::lock_guard<std::mutex> lock(sessions_pending_signals_lock_);
assert(sessions_pending_signals_.find(writer_id) != sessions_pending_signals_.end() &&
assert(sessions_pending_signals_->find(writer_id) != sessions_pending_signals_->end() &&
"writer_id is not found in the pending_signals");
return sessions_pending_signals_.at(writer_id);
return sessions_pending_signals_->at(writer_id);
}
bool Profiler::CheckPendingSignalsIsEmpty() {
std::lock_guard<std::mutex> lock(sessions_pending_signals_lock_);
return sessions_pending_signals_.empty();
return sessions_pending_signals_->empty();
}
} // namespace profiler
+4 -3
Просмотреть файл
@@ -53,6 +53,7 @@ typedef struct {
rocprofiler_kernel_properties_t kernel_properties;
uint32_t thread_id;
uint64_t queue_index;
uint64_t correlation_id;
} pending_signal_t;
namespace profiler {
@@ -71,9 +72,9 @@ class Profiler {
rocmtools::profiling_context_t* context, uint64_t session_data_count,
hsa_ven_amd_aqlprofile_profile_t* profile,
rocprofiler_kernel_properties_t kernel_properties, uint32_t thread_id,
uint64_t queue_index);
uint64_t queue_index, uint64_t correlation_id);
const std::vector<pending_signal_t>& GetPendingSignals(uint32_t writer_id);
const std::vector<pending_signal_t*>& GetPendingSignals(uint32_t writer_id);
bool CheckPendingSignalsIsEmpty();
void AddCounterName(rocprofiler_counter_id_t handler, std::string counter_name);
@@ -97,7 +98,7 @@ class Profiler {
rocprofiler_session_id_t session_id_;
std::mutex sessions_pending_signals_lock_;
std::map<uint32_t, std::vector<pending_signal_t>> sessions_pending_signals_;
std::map<uint32_t, std::vector<pending_signal_t*>>* sessions_pending_signals_;
};
} // namespace profiler
+18 -8
Просмотреть файл
@@ -41,7 +41,9 @@
namespace rocmtools {
Session::Session(rocprofiler_replay_mode_t replay_mode, rocprofiler_session_id_t session_id)
: session_id_(session_id), is_active_(false), replay_mode_(replay_mode) {}
: session_id_(session_id), is_active_(false), replay_mode_(replay_mode) {
buffers_ = new std::map<uint64_t, Memory::GenericBuffer*>();
}
Session::~Session() {
while (GetCurrentActiveInterruptSignalsCount() > 0) {
@@ -63,6 +65,7 @@ Session::~Session() {
// std::lock_guard<std::mutex> lock(filters_lock_);
// buffers_.clear();
// }
delete buffers_;
}
void Session::DisableTools(rocprofiler_buffer_id_t buffer_id) {
@@ -171,6 +174,8 @@ void Session::Start() {
void Session::Terminate() {
if (is_active_) {
while (GetCurrentActiveInterruptSignalsCount() > 0) {
}
rocmtools::queue::ResetSessionID();
std::lock_guard<std::mutex> lock(session_lock_);
if (FindFilterWithKind(ROCPROFILER_SPM_COLLECTION)) {
@@ -204,6 +209,11 @@ void Session::Terminate() {
}
}
for (auto& buffer : *buffers_) {
buffer.second->Flush();
delete buffer.second;
}
is_active_ = false;
}
}
@@ -289,7 +299,7 @@ rocprofiler_filter_id_t Session::GetFilterIdWithKind(rocprofiler_filter_kind_t k
return rocprofiler_filter_id_t{0};
}
bool Session::HasBuffer() { return buffers_.size() > 0; }
bool Session::HasBuffer() { return buffers_->size() > 0; }
rocprofiler_buffer_id_t Session::CreateBuffer(rocprofiler_buffer_callback_t buffer_callback,
size_t buffer_size) {
@@ -297,8 +307,8 @@ rocprofiler_buffer_id_t Session::CreateBuffer(rocprofiler_buffer_callback_t buff
rocprofiler_buffer_id_t{buffers_counter_.fetch_add(1, std::memory_order_release)};
{
std::lock_guard<std::mutex> lock(buffers_lock_);
buffers_.emplace(id.value,
new Memory::GenericBuffer(session_id_, id, buffer_size, buffer_callback));
buffers_->emplace(id.value,
new Memory::GenericBuffer(session_id_, id, buffer_size, buffer_callback));
}
return id;
}
@@ -306,7 +316,7 @@ rocprofiler_buffer_id_t Session::CreateBuffer(rocprofiler_buffer_callback_t buff
bool Session::FindBuffer(rocprofiler_buffer_id_t buffer_id) {
{
std::lock_guard<std::mutex> lock(buffers_lock_);
return buffers_.find(buffer_id.value) != buffers_.end();
return buffers_->find(buffer_id.value) != buffers_->end();
}
}
@@ -316,8 +326,8 @@ void Session::DestroyTracer() { /* tracer_.reset(); */
void Session::DestroyBuffer(rocprofiler_buffer_id_t buffer_id) {
{
std::lock_guard<std::mutex> lock(filters_lock_);
delete buffers_.at(buffer_id.value);
buffers_.erase(buffer_id.value);
delete buffers_->at(buffer_id.value);
buffers_->erase(buffer_id.value);
// if (buffers_.find(buffer_id.value) != buffers_.end() &&
// buffers_.at(buffer_id.value)->IsValid())
// buffers_.at(buffer_id.value).reset();
@@ -347,7 +357,7 @@ rocprofiler_status_t Session::stopSpm() {
Memory::GenericBuffer* Session::GetBuffer(rocprofiler_buffer_id_t buffer_id) {
{
std::lock_guard<std::mutex> lock(buffers_lock_);
return buffers_.at(buffer_id.value);
return buffers_->at(buffer_id.value);
}
}
+1 -1
Просмотреть файл
@@ -124,7 +124,7 @@ class Session {
std::atomic<uint64_t> buffers_counter_{1};
std::mutex buffers_lock_;
std::map<uint64_t, Memory::GenericBuffer*> buffers_;
std::map<uint64_t, Memory::GenericBuffer*>* buffers_;
std::atomic<uint64_t> records_counter_{1};
+60 -30
Просмотреть файл
@@ -300,13 +300,13 @@ att_parsed_input_t GetATTParams() {
} else if (param_name == "PERFCOUNTER") {
counters_names.push_back(line.substr(pos + 1));
continue;
} else { // param_value is a number
} else { // param_value is a number
try {
auto hexa_pos = line.find("0x", pos); // Is it hex?
auto hexa_pos = line.find("0x", pos); // Is it hex?
if (hexa_pos != std::string::npos)
param_value = stoi(line.substr(hexa_pos + 2), 0, 16); // hexadecimal
else
param_value = stoi(line.substr(pos + 1), 0, 10); // decimal
param_value = stoi(line.substr(pos + 1), 0, 10); // decimal
} catch (...) {
printf("Error: Invalid parameter value %s - (%s)\n",
line.substr(pos + 1, line.size()).c_str(), line.c_str());
@@ -364,6 +364,9 @@ att_parsed_input_t GetATTParams() {
}
void finish() {
for ([[maybe_unused]] rocprofiler_buffer_id_t buffer_id : buffer_ids) {
CHECK_ROCPROFILER(rocprofiler_flush_data(session_id, buffer_id));
}
if (amd_sys_handler.load(std::memory_order_release)) {
amd_sys_handler.exchange(false, std::memory_order_release);
wait_for_start_shm.join();
@@ -371,11 +374,8 @@ void finish() {
}
if (session_created.load(std::memory_order_relaxed)) {
session_created.exchange(false, std::memory_order_release);
CHECK_ROCPROFILER(rocprofiler_terminate_session(session_id));
rocprofiler::TraceBufferBase::FlushAll();
for ([[maybe_unused]] rocprofiler_buffer_id_t buffer_id : buffer_ids) {
CHECK_ROCPROFILER(rocprofiler_flush_data(session_id, buffer_id));
}
CHECK_ROCPROFILER(rocprofiler_terminate_session(session_id));
}
}
@@ -646,49 +646,59 @@ ROCPROFILER_EXPORT bool OnLoad(void* table, uint64_t runtime_version, uint64_t f
if (apis_requested.size() > 0) filters_requested.emplace_back(ROCPROFILER_API_TRACE);
if (parameters.size() > 0) filters_requested.emplace_back(ROCPROFILER_ATT_TRACE_COLLECTION);
rocprofiler_buffer_id_t buffer_id;
CHECK_ROCPROFILER(rocprofiler_create_buffer(
session_id,
[](const rocprofiler_record_header_t* record, const rocprofiler_record_header_t* end_record,
rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id) {
if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id);
},
1 << 20, &buffer_id));
buffer_ids.emplace_back(buffer_id);
rocprofiler_buffer_id_t buffer_id_1;
CHECK_ROCPROFILER(rocprofiler_create_buffer(
session_id,
[](const rocprofiler_record_header_t* record, const rocprofiler_record_header_t* end_record,
rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id_1) {
if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id_1);
},
1 << 20, &buffer_id_1));
buffer_ids.emplace_back(buffer_id_1);
for (rocprofiler_filter_kind_t filter_kind : filters_requested) {
switch (filter_kind) {
case ROCPROFILER_COUNTERS_COLLECTION: {
rocprofiler_buffer_id_t buffer_id;
CHECK_ROCPROFILER(rocprofiler_create_buffer(
session_id,
[](const rocprofiler_record_header_t* record,
const rocprofiler_record_header_t* end_record, rocprofiler_session_id_t session_id,
rocprofiler_buffer_id_t buffer_id) {
if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id);
},
1 << 20, &buffer_id));
buffer_ids.emplace_back(buffer_id);
printf("Enabling Counter Collection\n");
rocprofiler_filter_id_t filter_id;
[[maybe_unused]] rocprofiler_filter_property_t property = {};
CHECK_ROCPROFILER(rocprofiler_create_filter(
session_id, filter_kind, rocprofiler_filter_data_t{.counters_names = &counters_[0]},
counters_.size(), &filter_id, property));
CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id_1));
CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id));
filter_ids.emplace_back(filter_id);
break;
}
case ROCPROFILER_DISPATCH_TIMESTAMPS_COLLECTION: {
rocprofiler_buffer_id_t buffer_id;
CHECK_ROCPROFILER(rocprofiler_create_buffer(
session_id,
[](const rocprofiler_record_header_t* record,
const rocprofiler_record_header_t* end_record, rocprofiler_session_id_t session_id,
rocprofiler_buffer_id_t buffer_id) {
if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id);
},
1 << 20, &buffer_id));
buffer_ids.emplace_back(buffer_id);
rocprofiler_filter_id_t filter_id;
[[maybe_unused]] rocprofiler_filter_property_t property = {};
CHECK_ROCPROFILER(rocprofiler_create_filter(
session_id, filter_kind, rocprofiler_filter_data_t{}, 0, &filter_id, property));
CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id_1));
CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id));
filter_ids.emplace_back(filter_id);
break;
}
case ROCPROFILER_API_TRACE: {
rocprofiler_buffer_id_t buffer_id;
CHECK_ROCPROFILER(rocprofiler_create_buffer(
session_id,
[](const rocprofiler_record_header_t* record,
const rocprofiler_record_header_t* end_record, rocprofiler_session_id_t session_id,
rocprofiler_buffer_id_t buffer_id) {
if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id);
},
1 << 20, &buffer_id));
buffer_ids.emplace_back(buffer_id);
printf("Enabling API Tracing\n");
rocprofiler_filter_id_t filter_id;
[[maybe_unused]] rocprofiler_filter_property_t property = {};
@@ -702,6 +712,16 @@ ROCPROFILER_EXPORT bool OnLoad(void* table, uint64_t runtime_version, uint64_t f
break;
}
case ROCPROFILER_ATT_TRACE_COLLECTION: {
rocprofiler_buffer_id_t buffer_id;
CHECK_ROCPROFILER(rocprofiler_create_buffer(
session_id,
[](const rocprofiler_record_header_t* record,
const rocprofiler_record_header_t* end_record, rocprofiler_session_id_t session_id,
rocprofiler_buffer_id_t buffer_id) {
if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id);
},
1 << 20, &buffer_id));
buffer_ids.emplace_back(buffer_id);
printf("Enabling ATT Tracing\n");
rocprofiler_filter_id_t filter_id;
@@ -717,11 +737,21 @@ ROCPROFILER_EXPORT bool OnLoad(void* table, uint64_t runtime_version, uint64_t f
rocprofiler_create_filter(session_id, ROCPROFILER_ATT_TRACE_COLLECTION,
rocprofiler_filter_data_t{.att_parameters = &parameters[0]},
parameters.size(), &filter_id, property));
CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id_1));
CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id));
filter_ids.emplace_back(filter_id);
break;
}
case ROCPROFILER_PC_SAMPLING_COLLECTION: {
rocprofiler_buffer_id_t buffer_id;
CHECK_ROCPROFILER(rocprofiler_create_buffer(
session_id,
[](const rocprofiler_record_header_t* record,
const rocprofiler_record_header_t* end_record, rocprofiler_session_id_t session_id,
rocprofiler_buffer_id_t buffer_id) {
if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id);
},
1 << 20, &buffer_id));
buffer_ids.emplace_back(buffer_id);
puts("Enabling PC sampling");
rocprofiler_filter_id_t filter_id;
[[maybe_unused]] rocprofiler_filter_property_t property = {};
+2 -1
Просмотреть файл
@@ -145,7 +145,8 @@ std::string string_printf(const char* format, ...) {
std::string errmsg("ROCMTools: fatal error: " + message);
fputs(errmsg.c_str(), stderr);
throw(errmsg);
std::cerr << errmsg << std::endl;
abort();
}
/* The function extracts the kernel name from