SWDEV-403050: Multiple fixes for Memory Leaks in profiler
Change-Id: Ib720a81105af13898ff745ce0cbc2a48c1f4a980
[ROCm/rocprofiler commit: 08fc21ac31]
Этот коммит содержится в:
коммит произвёл
Ammar ELWazir
родитель
935b40b837
Коммит
5a7dec797e
@@ -116,6 +116,9 @@ find_package(
|
||||
PATHS
|
||||
${ROCM_PATH})
|
||||
|
||||
find_library(NUMA NAME numa REQUIRED)
|
||||
link_libraries(${NUMA})
|
||||
|
||||
get_property(
|
||||
HSA_RUNTIME_INCLUDE_DIRECTORIES
|
||||
TARGET hsa-runtime64::hsa-runtime64
|
||||
|
||||
@@ -302,24 +302,6 @@ get_pmc_results_txt_path(){
|
||||
done
|
||||
}
|
||||
|
||||
if [ -n "$COUNTERS_PMC_DIRS" ]; then
|
||||
COUNTERS_RESULTS_TXT=""
|
||||
for PMC_DIR in $COUNTERS_PMC_DIRS; do
|
||||
COUNTERS_RESULTS_TXT="$COUNTERS_RESULTS_TXT $(get_pmc_results_txt_path $PMC_DIR)"
|
||||
done
|
||||
CSV_RESULTS=$OUTPUT_PATH_INTERNAL/results.csv
|
||||
if [ -n "$OUT_FILE_NAME" ]; then
|
||||
CSV_RESULTS=$OUTPUT_PATH_INTERNAL/$OUT_FILE_NAME
|
||||
fi
|
||||
echo "csv results path: $CSV_RESULTS"
|
||||
export ROCP_MERGE_PIDS=1 #required for tblextr.py to work correctly for counters
|
||||
if [ $RUN_FROM_BUILD == 1 ]; then
|
||||
eval "python3 $ROCM_DIR/bin/tblextr.py $CSV_RESULTS $COUNTERS_RESULTS_TXT"
|
||||
else
|
||||
eval "python3 $ROCPROFV2_DIR/../libexec/rocprofiler/tblextr.py $CSV_RESULTS $COUNTERS_RESULTS_TXT"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ -n "$ATT_PATH" ]; then
|
||||
if [ -n "$ATT_ARGV" ]; then
|
||||
eval "python3 $ATT_PATH $ATT_ARGV"
|
||||
|
||||
@@ -110,7 +110,7 @@ THE SOFTWARE.
|
||||
#if !defined(ROCPROFILER)
|
||||
#if defined(ROCPROFILER_EXPORTS)
|
||||
#define ROCPROFILER_API ROCPROFILER_EXPORT
|
||||
#else /* !defined (ROCPROFILER_EXPORTS) */
|
||||
#else /* !defined (ROCPROFILER_EXPORTS) */
|
||||
#define ROCPROFILER_API ROCPROFILER_IMPORT
|
||||
#endif /* !defined (ROCPROFILER_EXPORTS) */
|
||||
#endif /* !defined (ROCPROFILER) */
|
||||
@@ -949,6 +949,14 @@ typedef struct {
|
||||
uint64_t signal_handle;
|
||||
|
||||
} rocprofiler_kernel_properties_t;
|
||||
|
||||
/**
 * Correlation ID
 *
 * Wraps a single unsigned 64-bit value in its own struct type. A member of
 * this type, named correlation_id, is carried by
 * rocprofiler_record_profiler_t.
 *
 * NOTE(review): presumably used to match a profiler record with other records
 * that belong to the same operation - confirm against the code that fills in
 * the value.
 */
|
||||
typedef struct {
|
||||
uint64_t value;
|
||||
} rocprofiler_correlation_id_t;
|
||||
|
||||
/**
|
||||
* Profiling record, this will represent all the information reported by the
|
||||
* profiler regarding kernel dispatches and their counters that were collected
|
||||
@@ -986,7 +994,11 @@ typedef struct {
|
||||
* Counters, including identifiers to get counter information and Counters
|
||||
* values
|
||||
*/
|
||||
rocprofiler_record_counter_instance_t* counters;
|
||||
const rocprofiler_record_counter_instance_t* counters;
|
||||
/**
|
||||
* The count of the counters that were collected by the profiler
|
||||
*/
|
||||
rocprofiler_record_counters_instances_count_t counters_count; /* Counters Count */
|
||||
/**
|
||||
* kernel properties, including the grid size, work group size,
|
||||
* registers count, wave size and completion signal
|
||||
@@ -1001,9 +1013,9 @@ typedef struct {
|
||||
*/
|
||||
rocprofiler_queue_index_t queue_idx;
|
||||
/**
|
||||
* The count of the counters that were collected by the profiler
|
||||
* Correlation id
|
||||
*/
|
||||
rocprofiler_record_counters_instances_count_t counters_count; /* Counters Count */
|
||||
rocprofiler_correlation_id_t correlation_id;
|
||||
} rocprofiler_record_profiler_t;
|
||||
|
||||
typedef struct {
|
||||
|
||||
@@ -233,7 +233,6 @@ class file_plugin_t {
|
||||
tracer_record.api_data_handle, tracer_record.operation_id, &function_name_c));
|
||||
}
|
||||
}
|
||||
//return;
|
||||
output_file_t* output_file = get_output_file(output_type_t::TRACER, tracer_record.domain);
|
||||
*output_file << "Record(" << tracer_record.header.id.handle << "), Domain("
|
||||
<< GetDomainName(tracer_record.domain) << "),";
|
||||
@@ -336,7 +335,7 @@ class file_plugin_t {
|
||||
session_id, ROCPROFILER_COUNTER_NAME, profiler_record->counters[i].counter_handler,
|
||||
&counter_name_length));
|
||||
if (counter_name_length > 1) {
|
||||
const char* name_c = static_cast<const char*>(malloc(name_length * sizeof(char)));
|
||||
const char* name_c = nullptr;
|
||||
CHECK_ROCPROFILER(rocprofiler_query_counter_info(
|
||||
session_id, ROCPROFILER_COUNTER_NAME, profiler_record->counters[i].counter_handler,
|
||||
&name_c));
|
||||
|
||||
@@ -87,7 +87,8 @@ std::string string_printf(const char* format, ...) {
|
||||
std::string errmsg("ROCProfiler: fatal error: " + message);
|
||||
fputs(errmsg.c_str(), stderr);
|
||||
|
||||
throw(errmsg);
|
||||
// throw(errmsg);
|
||||
abort();
|
||||
}
|
||||
|
||||
/* The function extracts the kernel name from
|
||||
|
||||
@@ -245,7 +245,7 @@ set_target_properties(rocprofiler-v2 PROPERTIES
|
||||
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/v2
|
||||
VERSION ${PROJECT_VERSION}
|
||||
SOVERSION ${PROJECT_VERSION_MAJOR})
|
||||
|
||||
|
||||
# Add custom command to copy the v2 library to the build dir as well
|
||||
add_custom_command(TARGET rocprofiler-v2 POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/v2/librocprofiler64.so.2 ${CMAKE_BINARY_DIR}
|
||||
@@ -271,10 +271,10 @@ target_include_directories(rocprofiler-v2
|
||||
if(ASAN)
|
||||
target_compile_options(rocprofiler-v2 PRIVATE -fsanitize=address)
|
||||
target_link_options(rocprofiler-v2 PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap -Wl,--no-undefined,-fsanitize=address)
|
||||
target_link_libraries(rocprofiler-v2 PRIVATE ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64 Threads::Threads atomic asan dl c stdc++ stdc++fs amd_comgr ${PCIACCESS_LIBRARIES})
|
||||
target_link_libraries(rocprofiler-v2 PRIVATE ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64 Threads::Threads atomic numa asan dl c stdc++ stdc++fs amd_comgr ${PCIACCESS_LIBRARIES})
|
||||
else()
|
||||
target_link_options(rocprofiler-v2 PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap -Wl,--no-undefined)
|
||||
target_link_libraries(rocprofiler-v2 PRIVATE ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64 Threads::Threads atomic dl c stdc++ stdc++fs amd_comgr ${PCIACCESS_LIBRARIES})
|
||||
target_link_libraries(rocprofiler-v2 PRIVATE ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64 Threads::Threads atomic numa dl c stdc++ stdc++fs amd_comgr ${PCIACCESS_LIBRARIES})
|
||||
endif()
|
||||
## Install libraries: Non versioned lib file in dev package
|
||||
# install(TARGETS rocprofiler-v2 LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT dev)
|
||||
|
||||
@@ -207,7 +207,8 @@ class Context {
|
||||
} catch(...) {
|
||||
delete obj;
|
||||
obj = NULL;
|
||||
throw;
|
||||
std::cerr << "Error: Context Create failed" << std::endl;
|
||||
abort();
|
||||
}
|
||||
return obj;
|
||||
}
|
||||
|
||||
@@ -46,8 +46,10 @@ hsa_status_t pmcCallback(hsa_ven_amd_aqlprofile_info_type_t info_type,
|
||||
if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA) {
|
||||
if (IsEventMatch(info_data->pmc_data.event, (*data_it)->event)) {
|
||||
uint32_t xcc_index = floor(passed_data->index / passed_data->single_xcc_buff_size);
|
||||
(*data_it)->xcc_vals[xcc_index] += info_data->pmc_data.result; // stores event result from each xcc separately
|
||||
(*data_it)->val_double += info_data->pmc_data.result; // stores accumulated event result from all xccs
|
||||
(*data_it)->xcc_vals[xcc_index] +=
|
||||
info_data->pmc_data.result; // stores event result from each xcc separately
|
||||
(*data_it)->val_double +=
|
||||
info_data->pmc_data.result; // stores accumulated event result from all xccs
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -98,19 +100,16 @@ bool metrics::ExtractMetricEvents(
|
||||
// const Metric* metric = metrics_dict->GetMetricByName(metric_names[i]);
|
||||
const Metric* metric = metrics_dict->Get(metric_names[i]);
|
||||
if (metric == nullptr) {
|
||||
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
|
||||
fatal("input metric'%s' not supported on this hardware: %s ", metric_names[i].c_str(),
|
||||
agentInfo.getName().data());
|
||||
|
||||
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
|
||||
fatal("input metric'%s' not supported on this hardware: %s ", metric_names[i].c_str(),
|
||||
agentInfo.getName().data());
|
||||
}
|
||||
|
||||
// adding result object for derived metric
|
||||
std::lock_guard<std::mutex> lock(extract_metric_events_lock);
|
||||
if (results_map.find(metric_names[i]) == results_map.end()) {
|
||||
results_map[metric_names[i]] = new results_t(metric_names[i], {}, xcc_count);
|
||||
} // else {
|
||||
// continue;
|
||||
// }
|
||||
}
|
||||
|
||||
counters_vec = metric->GetCounters();
|
||||
if (counters_vec.empty())
|
||||
@@ -129,7 +128,8 @@ bool metrics::ExtractMetricEvents(
|
||||
} else {
|
||||
// result object for base metric
|
||||
// std::cout << "Metric : " << metric->GetName() << " : " << counter->name << std::endl;
|
||||
result = new results_t(counter->name, {}, xcc_count); // TODO: set correct initial value
|
||||
result =
|
||||
new results_t(counter->name, {}, xcc_count); // TODO: set correct initial value
|
||||
results_map[counter->name] = result;
|
||||
}
|
||||
} else {
|
||||
@@ -188,7 +188,7 @@ bool metrics::ExtractMetricEvents(
|
||||
bool metrics::GetCounterData(hsa_ven_amd_aqlprofile_profile_t* profile, hsa_agent_t gpu_agent,
|
||||
std::vector<results_t*>& results_list) {
|
||||
uint32_t xcc_count = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle).getXccCount();
|
||||
uint32_t single_xcc_buff_size = profile->output_buffer.size /(sizeof(uint64_t) * xcc_count);
|
||||
uint32_t single_xcc_buff_size = profile->output_buffer.size / (sizeof(uint64_t) * xcc_count);
|
||||
callback_data_t callback_data{&results_list, 0, single_xcc_buff_size};
|
||||
hsa_status_t status = hsa_ven_amd_aqlprofile_iterate_data(profile, pmcCallback, &callback_data);
|
||||
return (status == HSA_STATUS_SUCCESS);
|
||||
@@ -210,16 +210,19 @@ bool metrics::GetMetricsData(std::map<std::string, results_t*>& results_map,
|
||||
return true;
|
||||
}
|
||||
|
||||
void metrics::GetCountersAndMetricResultsByXcc(uint32_t xcc_index, std::vector<results_t*>& results_list,
|
||||
std::map<std::string, results_t*>& results_map,
|
||||
std::vector<const Metric*>& metrics_list){
|
||||
for(auto it = results_list.begin(); it != results_list.end(); it++){
|
||||
(*it)->val_double = (*it)->xcc_vals[xcc_index]; // set val_double to hold value for specific xcc
|
||||
}
|
||||
void metrics::GetCountersAndMetricResultsByXcc(uint32_t xcc_index,
|
||||
std::vector<results_t*>& results_list,
|
||||
std::map<std::string, results_t*>& results_map,
|
||||
std::vector<const Metric*>& metrics_list) {
|
||||
for (auto it = results_list.begin(); it != results_list.end(); it++) {
|
||||
(*it)->val_double =
|
||||
(*it)->xcc_vals[xcc_index]; // set val_double to hold value for specific xcc
|
||||
}
|
||||
|
||||
for(auto it = results_map.begin(); it != results_map.end(); it++){
|
||||
it->second->val_double = it->second->xcc_vals[xcc_index]; // set val_double to hold value for specific xcc
|
||||
}
|
||||
for (auto it = results_map.begin(); it != results_map.end(); it++) {
|
||||
it->second->val_double =
|
||||
it->second->xcc_vals[xcc_index]; // set val_double to hold value for specific xcc
|
||||
}
|
||||
|
||||
GetMetricsData(results_map, metrics_list);
|
||||
GetMetricsData(results_map, metrics_list);
|
||||
}
|
||||
|
||||
@@ -282,10 +282,12 @@ class MetricsDict {
|
||||
try {
|
||||
expr_obj = new xml::Expr(expr_str, new ExprCache(&cache_));
|
||||
} catch (const xml::exception_t& exc) {
|
||||
if (do_lookup)
|
||||
if (do_lookup) {
|
||||
metrics_list.push_back(node);
|
||||
else
|
||||
throw(exc);
|
||||
} else {
|
||||
std::cerr << "Error: " << exc.what() << std::endl;
|
||||
abort();
|
||||
}
|
||||
}
|
||||
if (expr_obj) {
|
||||
#if 0
|
||||
|
||||
@@ -89,7 +89,7 @@ AgentInfo::AgentInfo(const hsa_agent_t agent, ::CoreApiTable* table) : handle_(a
|
||||
xcc_num_ = 1;
|
||||
}
|
||||
|
||||
int AgentInfo::getIndex() const { return index_; }
|
||||
uint64_t AgentInfo::getIndex() const { return index_; }
|
||||
hsa_device_type_t AgentInfo::getType() const { return type_; }
|
||||
uint64_t AgentInfo::getHandle() const { return handle_; }
|
||||
const std::string_view AgentInfo::getName() const { return name_; }
|
||||
@@ -107,7 +107,7 @@ uint32_t AgentInfo::getPCIDomain() const { return pci_domain_; }
|
||||
uint32_t AgentInfo::getPCILocationID() const { return pci_location_id_; }
|
||||
uint32_t AgentInfo::getXccCount() const { return xcc_num_; }
|
||||
|
||||
void AgentInfo::setIndex(int index) { index_ = index; }
|
||||
void AgentInfo::setIndex(uint64_t index) { index_ = index; }
|
||||
void AgentInfo::setType(hsa_device_type_t type) { type_ = type; }
|
||||
void AgentInfo::setHandle(uint64_t handle) { handle_ = handle; }
|
||||
void AgentInfo::setName(const std::string& name) { strcpy(name_, name.c_str()); }
|
||||
|
||||
@@ -43,7 +43,7 @@ class AgentInfo {
|
||||
AgentInfo();
|
||||
AgentInfo(const hsa_agent_t agent, ::CoreApiTable* table);
|
||||
|
||||
int getIndex() const;
|
||||
uint64_t getIndex() const;
|
||||
hsa_device_type_t getType() const;
|
||||
uint64_t getHandle() const;
|
||||
const std::string_view getName() const;
|
||||
@@ -62,7 +62,7 @@ class AgentInfo {
|
||||
uint32_t getPCILocationID() const;
|
||||
uint32_t getXccCount() const;
|
||||
|
||||
void setIndex(int index);
|
||||
void setIndex(uint64_t index);
|
||||
void setType(hsa_device_type_t type);
|
||||
void setHandle(uint64_t handle);
|
||||
void setName(const std::string& name);
|
||||
@@ -78,7 +78,7 @@ class AgentInfo {
|
||||
hsa_amd_memory_pool_t gpu_pool;
|
||||
|
||||
private:
|
||||
int index_;
|
||||
uint64_t index_;
|
||||
hsa_device_type_t type_; // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2
|
||||
uint64_t handle_;
|
||||
char name_[64];
|
||||
|
||||
@@ -30,26 +30,39 @@ std::mutex agents_map_lock;
|
||||
std::map<decltype(hsa_agent_t::handle), Agent::AgentInfo> agent_info_map;
|
||||
Agent::AgentInfo& GetAgentInfo(decltype(hsa_agent_t::handle) handle) {
|
||||
std::lock_guard<std::mutex> lock(agents_map_lock);
|
||||
if (agent_info_map.find(handle) != agent_info_map.end())
|
||||
if (agent_info_map.find(handle) != agent_info_map.end()) {
|
||||
return agent_info_map.at(handle);
|
||||
else
|
||||
throw(std::string("Error: Can't find Agent with handle(") + std::to_string(handle) +
|
||||
") in this system");
|
||||
} else {
|
||||
std::cerr << std::string("Error: Can't find Agent with handle(") << std::to_string(handle) <<
|
||||
") in this system" << std::endl;
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<hsa_agent_t> cpu_agents_list;
|
||||
|
||||
// Registers an agent in the global agent map and, when the agent is a CPU,
// also records its handle in the list returned by GetCPUAgentList().
//
// handle     - value of hsa_agent_t::handle identifying the agent; used as
//              the map key.
// agent_info - description of the agent; a copy is stored in agent_info_map.
//              std::map::emplace keeps an already existing entry unchanged.
//
// Both containers are updated while holding agents_map_lock.
void SetAgentInfo(decltype(hsa_agent_t::handle) handle, const Agent::AgentInfo& agent_info) {
  std::lock_guard<std::mutex> lock(agents_map_lock);
  agent_info_map.emplace(handle, agent_info);
  // The list holds CPU agents, so filter on HSA_DEVICE_TYPE_CPU. The previous
  // check used HSA_DEVICE_TYPE_GPU, which filled the CPU list with GPU agents
  // and never added a CPU agent to it.
  if (agent_info.getType() == HSA_DEVICE_TYPE_CPU) {
    cpu_agents_list.emplace_back(hsa_agent_t{handle});
  }
}
|
||||
|
||||
hsa_agent_t GetAgentByIndex(int agent_index) {
|
||||
// Returns a mutable reference to the file-level cpu_agents_list vector, i.e.
// the list that SetAgentInfo appends agent handles to.
// The caller receives the live container, not a copy, and - unlike
// GetAgentInfo, SetAgentInfo and GetAgentByIndex - this accessor does not take
// agents_map_lock.
// NOTE(review): reading the list while SetAgentInfo runs on another thread
// looks unsynchronized - confirm that all agents are registered before the
// first call.
std::vector<hsa_agent_t>& GetCPUAgentList() {
|
||||
return cpu_agents_list;
|
||||
}
|
||||
|
||||
hsa_agent_t GetAgentByIndex(uint64_t agent_index) {
|
||||
std::lock_guard<std::mutex> lock(agents_map_lock);
|
||||
for (auto& agent_info : agent_info_map) {
|
||||
if (agent_info.second.getIndex() == agent_index) {
|
||||
return hsa_agent_t{agent_info.second.getHandle()};
|
||||
}
|
||||
}
|
||||
throw(std::string("Error: Can't find Agent with Index(") + std::to_string(agent_index) +
|
||||
") in this system");
|
||||
std::cerr << std::string("Error: Can't find Agent with Index(") << std::to_string(agent_index) <<
|
||||
") in this system" << std::endl;
|
||||
abort();
|
||||
}
|
||||
|
||||
CoreApiTable saved_core_api{};
|
||||
|
||||
@@ -38,9 +38,12 @@
|
||||
namespace rocmtools {
|
||||
namespace hsa_support {
|
||||
|
||||
|
||||
std::vector<hsa_agent_t>& GetCPUAgentList();
|
||||
|
||||
Agent::AgentInfo& GetAgentInfo(decltype(hsa_agent_t::handle) handle);
|
||||
void SetAgentInfo(decltype(hsa_agent_t::handle) handle, const Agent::AgentInfo& agent_info);
|
||||
hsa_agent_t GetAgentByIndex(int agent_index);
|
||||
hsa_agent_t GetAgentByIndex(uint64_t agent_index);
|
||||
|
||||
CoreApiTable& GetCoreApiTable();
|
||||
void SetCoreApiTable(const CoreApiTable& table);
|
||||
|
||||
@@ -25,8 +25,11 @@
|
||||
#include <hsa/hsa_ven_amd_aqlprofile.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <numa.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <exception>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
@@ -47,23 +50,22 @@
|
||||
|
||||
#define CHECK_HSA_STATUS(msg, status) \
|
||||
do { \
|
||||
if ((status) != HSA_STATUS_INFO_BREAK) { \
|
||||
const char* emsg = 0; \
|
||||
hsa_status_string(status, &emsg); \
|
||||
printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>"); \
|
||||
if ((status) != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) { \
|
||||
try { \
|
||||
const char* emsg = nullptr; \
|
||||
hsa_status_string(status, &emsg); \
|
||||
if (!emsg) emsg = "<Unknown HSA Error>"; \
|
||||
std::cerr << msg << std::endl; \
|
||||
std::cerr << emsg << std::endl; \
|
||||
} catch (std::exception & e) { \
|
||||
} \
|
||||
abort(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
namespace Packet {
|
||||
|
||||
static const size_t MEM_PAGE_BYTES = 0x1000;
|
||||
static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1;
|
||||
// hsa_amd_memory_pool_t command_pool;
|
||||
// hsa_amd_memory_pool_t output_pool;
|
||||
|
||||
// hsa_amd_memory_pool_t& GetCommandPool() { return command_pool; }
|
||||
// hsa_amd_memory_pool_t& GetOutputPool() { return output_pool; }
|
||||
|
||||
// This function checks to see if the provided
|
||||
// pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true,
|
||||
@@ -116,11 +118,11 @@ void InitializePools(hsa_agent_t cpu_agent, Agent::AgentInfo* agent_info) {
|
||||
hsa_status_t status =
|
||||
rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(
|
||||
cpu_agent, FindStandardPool, &(agent_info->cpu_pool));
|
||||
if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Command Buffer Pool is not initialized\n");
|
||||
CHECK_HSA_STATUS("Error: Command Buffer Pool is not initialized", status);
|
||||
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(
|
||||
cpu_agent, FindKernArgPool, &(agent_info->kernarg_pool));
|
||||
if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Output Buffer Pool is not initialized\n");
|
||||
CHECK_HSA_STATUS("Error: Output Buffer Pool is not initialized", status);
|
||||
}
|
||||
|
||||
void InitializeGPUPool(hsa_agent_t gpu_agent, Agent::AgentInfo* agent_info) {
|
||||
@@ -135,6 +137,7 @@ struct block_des_t {
|
||||
};
|
||||
|
||||
std::map<uint32_t, rocmtools::MetricsDict*> metricsDict;
|
||||
static std::atomic<bool> counters_added{false};
|
||||
|
||||
void CheckPacketReqiurements(std::vector<hsa_agent_t>& gpu_agents) {
|
||||
for (auto& gpu_agent : gpu_agents) {
|
||||
@@ -147,253 +150,240 @@ void CheckPacketReqiurements(std::vector<hsa_agent_t>& gpu_agents) {
|
||||
// Initialize the PM4 commands with having the CPU&GPU agents, the counters,
|
||||
// counters count to output three packets which are start, stop and read
|
||||
// packets
|
||||
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>*
|
||||
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>
|
||||
InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
|
||||
std::vector<std::string>& counter_names, bool is_spm) {
|
||||
hsa_status_t status = HSA_STATUS_SUCCESS;
|
||||
|
||||
if (!counters_added.load(std::memory_order_acquire)) {
|
||||
for (auto& name : counter_names) {
|
||||
if (rocmtools::GetROCMToolObj()->HasActiveSession()) {
|
||||
rocmtools::GetROCMToolObj()
|
||||
->GetSession(rocmtools::GetROCMToolObj()->GetCurrentSessionId())
|
||||
->GetProfiler()
|
||||
->AddCounterName(name);
|
||||
}
|
||||
}
|
||||
counters_added.exchange(true, std::memory_order_release);
|
||||
}
|
||||
|
||||
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
|
||||
std::map<std::string, rocmtools::results_t*> results_map;
|
||||
std::vector<rocmtools::event_t> events_list;
|
||||
std::vector<rocmtools::results_t*> results_list;
|
||||
std::map<std::pair<uint32_t, uint32_t>, uint64_t> event_to_max_block_count;
|
||||
std::map<std::string, std::set<std::string>> metrics_counters;
|
||||
uint32_t counters_count = 0;
|
||||
|
||||
for (auto& name : counter_names) {
|
||||
// std::cout << "Counter from Counter Names: " << name << std::endl;
|
||||
|
||||
if (rocmtools::GetROCMToolObj()->HasActiveSession()) {
|
||||
rocmtools::GetROCMToolObj()
|
||||
->GetSession(rocmtools::GetROCMToolObj()->GetCurrentSessionId())
|
||||
->GetProfiler()
|
||||
->AddCounterName(name);
|
||||
}
|
||||
|
||||
counters_count++;
|
||||
if (!rocmtools::metrics::ExtractMetricEvents(
|
||||
counter_names, gpu_agent, metricsDict[gpu_agent.handle], results_map, events_list,
|
||||
results_list, event_to_max_block_count, metrics_counters)) {
|
||||
std::cerr << "Error: Failed to extract metric events" << std::endl;
|
||||
abort();
|
||||
}
|
||||
|
||||
rocmtools::metrics::ExtractMetricEvents(counter_names, gpu_agent, metricsDict[gpu_agent.handle],
|
||||
results_map, events_list, results_list,
|
||||
event_to_max_block_count, metrics_counters);
|
||||
|
||||
// TODO: validate needs to be called on each events_list[i]
|
||||
// Validating the events array for the specified gpu agent
|
||||
bool result;
|
||||
hsa_ven_amd_aqlprofile_validate_event(gpu_agent, &events_list[0], &result);
|
||||
if (!result) {
|
||||
printf("Error: Events are not valid for the current gpu agent\n");
|
||||
throw("Error: Events are not valid for the current gpu agent");
|
||||
bool validate_event_result;
|
||||
status =
|
||||
hsa_ven_amd_aqlprofile_validate_event(gpu_agent, &events_list[0], &validate_event_result);
|
||||
CHECK_HSA_STATUS("Error: Validating Counters", status);
|
||||
if (!validate_event_result) {
|
||||
std::cerr << "Error: Events are not valid for the current gpu agent" << std::endl;
|
||||
abort();
|
||||
}
|
||||
|
||||
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>*
|
||||
profiles = new std::vector<
|
||||
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>
|
||||
profiles = std::vector<
|
||||
std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>();
|
||||
|
||||
do {
|
||||
rocmtools::profiling_context_t* context = new rocmtools::profiling_context_t();
|
||||
context->gpu_agent = gpu_agent;
|
||||
uint64_t i = 0;
|
||||
uint32_t counter_val_iteration = 0;
|
||||
auto result = results_list.begin();
|
||||
std::map<std::pair<uint32_t, uint32_t>, uint32_t> block_max_events_count;
|
||||
std::set<hsa_ven_amd_aqlprofile_block_name_t> block_names_taken;
|
||||
for (auto event = events_list.begin(); event != events_list.end();) {
|
||||
if (block_max_events_count[std::make_pair<uint32_t, uint32_t>(
|
||||
static_cast<uint32_t>(event->block_name),
|
||||
static_cast<uint32_t>(event->block_index))] <
|
||||
event_to_max_block_count[std::make_pair<uint32_t, uint32_t>(
|
||||
static_cast<uint32_t>(event->block_name),
|
||||
static_cast<uint32_t>(event->block_index))]) {
|
||||
context->events_list.push_back(*event);
|
||||
context->results_list.emplace_back(*result);
|
||||
counter_val_iteration++;
|
||||
block_max_events_count[std::make_pair<uint32_t, uint32_t>(
|
||||
static_cast<uint32_t>(event->block_name), static_cast<uint32_t>(event->block_index))]++;
|
||||
results_list.erase(result);
|
||||
events_list.erase(event);
|
||||
} else {
|
||||
event++;
|
||||
result++;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
std::set<std::string> counters_taken;
|
||||
|
||||
std::set<std::string> metrics_counters_taken;
|
||||
|
||||
for (auto result : context->results_list) {
|
||||
rocmtools::Metric* metric;
|
||||
if (std::find(counter_names.begin(), counter_names.end(), result->name) !=
|
||||
counter_names.end()) {
|
||||
// std::cout << "Counter from Result List: " << result->name << std::endl;
|
||||
counters_taken.insert(result->name);
|
||||
metric = const_cast<rocmtools::Metric*>(metricsDict[gpu_agent.handle]->Get(result->name));
|
||||
if (metric == nullptr) std::cout << result->name << " not found in metricsDict\n";
|
||||
context->metrics_list.push_back(metric);
|
||||
} else {
|
||||
metrics_counters_taken.insert(result->name);
|
||||
// std::cout << "Counter Added: " << result->name << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::set<std::string> metrics_taken;
|
||||
|
||||
for (auto result : results_map) {
|
||||
if (counters_taken.find(result.first) == counters_taken.end() &&
|
||||
std::find(counter_names.begin(), counter_names.end(), result.first) !=
|
||||
counter_names.end()) {
|
||||
bool flag = true;
|
||||
for (auto result_basic : results_list) {
|
||||
if (result_basic->name.compare(result.first)) {
|
||||
flag = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (flag) metrics_taken.insert(result.first);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto metric_name : metrics_taken) {
|
||||
bool flag = true;
|
||||
if (metrics_counters.find(metric_name) == metrics_counters.end()) continue;
|
||||
for (auto metric_counter_name : metrics_counters.at(metric_name)) {
|
||||
if (metrics_counters_taken.find(metric_counter_name) == metrics_counters_taken.end() &&
|
||||
counters_taken.find(metric_counter_name) == counters_taken.end()) {
|
||||
flag = false;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (flag) {
|
||||
// std::cout << "Counter from Result Map: " << metric_name << std::endl;
|
||||
counters_taken.insert(metric_name);
|
||||
rocmtools::Metric* metric =
|
||||
const_cast<rocmtools::Metric*>(metricsDict[gpu_agent.handle]->Get(metric_name));
|
||||
if (metric == nullptr) std::cout << metric_name << " not found in metricsDict\n";
|
||||
context->metrics_list.push_back(metric);
|
||||
}
|
||||
}
|
||||
|
||||
context->results_map = results_map;
|
||||
context->metrics_dict = metricsDict[gpu_agent.handle];
|
||||
|
||||
hsa_ven_amd_aqlprofile_parameter_t* params = {};
|
||||
|
||||
packet_t* start_packet = new packet_t();
|
||||
packet_t* stop_packet = new packet_t();
|
||||
packet_t* read_packet = new packet_t();
|
||||
|
||||
if (context->events_list.size() <= 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Preparing the profile structure to get the packets
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wconversion-null"
|
||||
hsa_ven_amd_aqlprofile_event_type_t profile_type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC;
|
||||
if (is_spm) profile_type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE;
|
||||
hsa_ven_amd_aqlprofile_profile_t* profile =
|
||||
new hsa_ven_amd_aqlprofile_profile_t{gpu_agent,
|
||||
profile_type,
|
||||
&(context->events_list[0]),
|
||||
static_cast<uint32_t>(context->events_list.size()),
|
||||
params,
|
||||
0,
|
||||
NULL,
|
||||
NULL};
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
// Preparing and getting the size of the command and output buffers
|
||||
status = hsa_ven_amd_aqlprofile_start(profile, NULL);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
const char* hsa_err_str = nullptr;
|
||||
if (hsa_status_string(status, &hsa_err_str) != HSA_STATUS_SUCCESS) hsa_err_str = "Unknown";
|
||||
printf("Error: %s\n", hsa_err_str);
|
||||
continue;
|
||||
// do {
|
||||
rocmtools::profiling_context_t* context = new rocmtools::profiling_context_t();
|
||||
context->gpu_agent = gpu_agent;
|
||||
auto result = results_list.begin();
|
||||
std::map<std::pair<uint32_t, uint32_t>, uint32_t> block_max_events_count;
|
||||
std::set<hsa_ven_amd_aqlprofile_block_name_t> block_names_taken;
|
||||
for (auto event = events_list.begin(); event != events_list.end();) {
|
||||
if (block_max_events_count[std::make_pair<uint32_t, uint32_t>(
|
||||
static_cast<uint32_t>(event->block_name), static_cast<uint32_t>(event->block_index))] <
|
||||
event_to_max_block_count[std::make_pair<uint32_t, uint32_t>(
|
||||
static_cast<uint32_t>(event->block_name), static_cast<uint32_t>(event->block_index))]) {
|
||||
context->events_list.push_back(*event);
|
||||
context->results_list.emplace_back(*result);
|
||||
block_max_events_count[std::make_pair<uint32_t, uint32_t>(
|
||||
static_cast<uint32_t>(event->block_name), static_cast<uint32_t>(event->block_index))]++;
|
||||
results_list.erase(result);
|
||||
events_list.erase(event);
|
||||
} else {
|
||||
status = HSA_STATUS_ERROR;
|
||||
size_t size = profile->command_buffer.size;
|
||||
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
|
||||
agentInfo.cpu_pool, size, 0, reinterpret_cast<void**>(&(profile->command_buffer.ptr)));
|
||||
event++;
|
||||
result++;
|
||||
}
|
||||
}
|
||||
|
||||
// Both the CPU and GPU can access the memory
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
hsa_agent_t ag_list[1] = {gpu_agent};
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
|
||||
1, ag_list, NULL, profile->command_buffer.ptr);
|
||||
std::set<std::string> counters_taken;
|
||||
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
printf("Error: Can't allow access for both agents to Command Buffer\n");
|
||||
continue;
|
||||
} else if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
|
||||
printf("Error: Ran out of GPU memory to allocate Command Buffer\n");
|
||||
continue;
|
||||
std::set<std::string> metrics_counters_taken;
|
||||
|
||||
for (auto result : context->results_list) {
|
||||
rocmtools::Metric* metric;
|
||||
if (std::find(counter_names.begin(), counter_names.end(), result->name) !=
|
||||
counter_names.end()) {
|
||||
// std::cout << "Counter from Result List: " << result->name << std::endl;
|
||||
counters_taken.insert(result->name);
|
||||
metric = const_cast<rocmtools::Metric*>(metricsDict[gpu_agent.handle]->Get(result->name));
|
||||
if (metric == nullptr) std::cout << result->name << " not found in metricsDict\n";
|
||||
context->metrics_list.push_back(metric);
|
||||
} else {
|
||||
metrics_counters_taken.insert(result->name);
|
||||
// std::cout << "Counter Added: " << result->name << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::set<std::string> metrics_taken;
|
||||
|
||||
for (auto result : results_map) {
|
||||
if (counters_taken.find(result.first) == counters_taken.end() &&
|
||||
std::find(counter_names.begin(), counter_names.end(), result.first) !=
|
||||
counter_names.end()) {
|
||||
bool flag = true;
|
||||
for (auto result_basic : results_list) {
|
||||
if (result_basic->name.compare(result.first)) {
|
||||
flag = false;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
const char* hsa_err_str = NULL;
|
||||
if (hsa_status_string(status, &hsa_err_str) != HSA_STATUS_SUCCESS) hsa_err_str = "Unknown";
|
||||
printf("Error: Allocating command Buffer (Size=%lu) (%s)\n", size, hsa_err_str);
|
||||
}
|
||||
if (flag) metrics_taken.insert(result.first);
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_spm) {
|
||||
status = HSA_STATUS_ERROR;
|
||||
size = profile->output_buffer.size;
|
||||
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
|
||||
agentInfo.kernarg_pool, size, 0, reinterpret_cast<void**>(&profile->output_buffer.ptr));
|
||||
|
||||
if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
|
||||
printf("Error: Ran out of GPU memory to allocate Output Buffer\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
hsa_agent_t ag_list[1] = {gpu_agent};
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
|
||||
1, ag_list, NULL, profile->output_buffer.ptr);
|
||||
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
memset(profile->output_buffer.ptr, 0x0, profile->output_buffer.size);
|
||||
|
||||
// Creating the start/stop/read packets
|
||||
status = hsa_ven_amd_aqlprofile_start(profile, start_packet);
|
||||
status = hsa_ven_amd_aqlprofile_stop(profile, stop_packet);
|
||||
status = hsa_ven_amd_aqlprofile_read(profile, read_packet);
|
||||
|
||||
context->start_packet = start_packet;
|
||||
context->stop_packet = stop_packet;
|
||||
context->read_packet = read_packet;
|
||||
|
||||
// add profiles
|
||||
profiles->emplace_back(std::make_pair(context, profile));
|
||||
} else {
|
||||
printf("Error: Can't allow access for both agents to output Buffer\n");
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
const char* hsa_err_str = NULL;
|
||||
if (hsa_status_string(status, &hsa_err_str) != HSA_STATUS_SUCCESS)
|
||||
hsa_err_str = "Unknown";
|
||||
printf("Error: Allocating output Buffer (%s)\n", hsa_err_str);
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
profile->output_buffer.size = 0;
|
||||
status = hsa_ven_amd_aqlprofile_start(profile, start_packet);
|
||||
status = hsa_ven_amd_aqlprofile_stop(profile, stop_packet);
|
||||
status = hsa_ven_amd_aqlprofile_read(profile, read_packet);
|
||||
|
||||
context->start_packet = start_packet;
|
||||
context->stop_packet = stop_packet;
|
||||
context->read_packet = read_packet;
|
||||
|
||||
// add profiles
|
||||
profiles->emplace_back(std::make_pair(context, profile));
|
||||
for (auto metric_name : metrics_taken) {
|
||||
bool flag = true;
|
||||
if (metrics_counters.find(metric_name) == metrics_counters.end()) continue;
|
||||
for (auto metric_counter_name : metrics_counters.at(metric_name)) {
|
||||
if (metrics_counters_taken.find(metric_counter_name) == metrics_counters_taken.end() &&
|
||||
counters_taken.find(metric_counter_name) == counters_taken.end()) {
|
||||
flag = false;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} while (events_list.size() > 0);
|
||||
if (flag) {
|
||||
// std::cout << "Counter from Result Map: " << metric_name << std::endl;
|
||||
counters_taken.insert(metric_name);
|
||||
rocmtools::Metric* metric =
|
||||
const_cast<rocmtools::Metric*>(metricsDict[gpu_agent.handle]->Get(metric_name));
|
||||
if (metric == nullptr) std::cout << metric_name << " not found in metricsDict\n";
|
||||
context->metrics_list.push_back(metric);
|
||||
}
|
||||
}
|
||||
|
||||
context->results_map = results_map;
|
||||
context->metrics_dict = metricsDict[gpu_agent.handle];
|
||||
|
||||
hsa_ven_amd_aqlprofile_parameter_t* params = {};
|
||||
|
||||
packet_t* start_packet = new packet_t();
|
||||
packet_t* stop_packet = new packet_t();
|
||||
packet_t* read_packet = new packet_t();
|
||||
|
||||
if (context->events_list.size() <= 0) {
|
||||
std::cerr << "Error: No events to profile" << std::endl;
|
||||
abort();
|
||||
}
|
||||
|
||||
// Preparing the profile structure to get the packets
|
||||
hsa_ven_amd_aqlprofile_event_type_t profile_type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC;
|
||||
if (is_spm) profile_type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE;
|
||||
hsa_ven_amd_aqlprofile_profile_t* profile =
|
||||
new hsa_ven_amd_aqlprofile_profile_t{gpu_agent,
|
||||
profile_type,
|
||||
&(context->events_list[0]),
|
||||
static_cast<uint32_t>(context->events_list.size()),
|
||||
params,
|
||||
0,
|
||||
0,
|
||||
0};
|
||||
|
||||
size_t ag_list_count = 1; // rocmtools::hsa_support::GetCPUAgentList().size();
|
||||
hsa_agent_t ag_list[ag_list_count];
|
||||
ag_list[0] = gpu_agent;
|
||||
|
||||
// Preparing and getting the sizes of the command and output buffers
|
||||
status = hsa_ven_amd_aqlprofile_start(profile, NULL);
|
||||
// CHECK_HSA_STATUS("Error: Getting Buffers Size", status);
|
||||
|
||||
if (profile->command_buffer.size > 0 && profile->output_buffer.size > 0) {
|
||||
status = HSA_STATUS_ERROR;
|
||||
size_t size = profile->command_buffer.size;
|
||||
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
|
||||
if (size <= 0) {
|
||||
std::cerr << __FILE__ << ":" << __LINE__ << " "
|
||||
<< "Error: Command buffer given size is " << size << std::endl;
|
||||
abort();
|
||||
}
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
|
||||
agentInfo.cpu_pool, size, 0, reinterpret_cast<void**>(&(profile->command_buffer.ptr)));
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
profile->command_buffer.ptr = malloc(size);
|
||||
/*numa_alloc_onnode(
|
||||
size,
|
||||
rocmtools::hsa_support::GetAgentInfo(agentInfo.getNearCpuAgent().handle).getNumaNode());*/
|
||||
if (profile->command_buffer.ptr == NULL) {
|
||||
std::cerr << __FILE__ << ":" << __LINE__ << " "
|
||||
<< "Error: allocating memory for command buffer using NUMA" << std::endl;
|
||||
abort();
|
||||
}
|
||||
} else {
|
||||
// Both the CPU and GPU can access the memory
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
|
||||
ag_list_count, ag_list, NULL, profile->command_buffer.ptr);
|
||||
CHECK_HSA_STATUS("Error: Allowing access to Command Buffer", status);
|
||||
}
|
||||
|
||||
if (!is_spm) {
|
||||
status = HSA_STATUS_ERROR;
|
||||
size_t size = profile->output_buffer.size;
|
||||
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
|
||||
if (size <= 0) {
|
||||
std::cerr << __FILE__ << ":" << __LINE__ << " "
|
||||
<< "Error: Output buffer given size is " << size << std::endl;
|
||||
abort();
|
||||
}
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
|
||||
agentInfo.kernarg_pool, size, 0, reinterpret_cast<void**>(&profile->output_buffer.ptr));
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
profile->output_buffer.ptr = malloc(size);
|
||||
/*numa_alloc_onnode(
|
||||
size,
|
||||
rocmtools::hsa_support::GetAgentInfo(agentInfo.getNearCpuAgent().handle)
|
||||
.getNumaNode());*/
|
||||
if (profile->output_buffer.ptr == NULL) {
|
||||
std::cerr << __FILE__ << ":" << __LINE__ << " "
|
||||
<< "Error: allocating memory for output buffer using NUMA" << std::endl;
|
||||
abort();
|
||||
}
|
||||
} else {
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
|
||||
ag_list_count, ag_list, NULL, profile->output_buffer.ptr);
|
||||
CHECK_HSA_STATUS("Error: GPU Agent can't have output buffer access", status);
|
||||
memset(profile->output_buffer.ptr, 0x0, profile->output_buffer.size);
|
||||
}
|
||||
} else {
|
||||
profile->output_buffer.size = 0;
|
||||
}
|
||||
status = hsa_ven_amd_aqlprofile_start(profile, start_packet);
|
||||
// CHECK_HSA_STATUS("Error: Creating Start Packet\n", status);
|
||||
status = hsa_ven_amd_aqlprofile_stop(profile, stop_packet);
|
||||
// CHECK_HSA_STATUS("Error: Creating Stop Packet\n", status);
|
||||
status = hsa_ven_amd_aqlprofile_read(profile, read_packet);
|
||||
// CHECK_HSA_STATUS("Error: Creating Read Packet\n", status);
|
||||
|
||||
context->start_packet = start_packet;
|
||||
context->stop_packet = stop_packet;
|
||||
context->read_packet = read_packet;
|
||||
|
||||
// add profiles
|
||||
profiles.emplace_back(std::make_pair(context, profile));
|
||||
}
|
||||
// } while (events_list.size() > 0);
|
||||
return profiles;
|
||||
}
|
||||
|
||||
@@ -407,67 +397,71 @@ hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets(
|
||||
|
||||
// Validating the events array for the specified gpu agent
|
||||
bool result;
|
||||
hsa_ven_amd_aqlprofile_validate_event(gpu_agent, events, &result);
|
||||
if (!result) {
|
||||
printf("Error: Events are not valid for the current gpu agent\n");
|
||||
throw("Error: Events are not valid for the current gpu agent");
|
||||
}
|
||||
status = hsa_ven_amd_aqlprofile_validate_event(gpu_agent, events, &result);
|
||||
CHECK_HSA_STATUS("Error: Events are not valid for the current gpu agent\n", status);
|
||||
|
||||
hsa_ven_amd_aqlprofile_parameter_t* params = {};
|
||||
uint8_t* command_buffer = nullptr;
|
||||
uint8_t* output_buffer = nullptr;
|
||||
|
||||
|
||||
// Preparing the profile structure to get the packets
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wconversion-null"
|
||||
// Preparing the profile structure to get the packets
|
||||
hsa_ven_amd_aqlprofile_profile_t* profile = new hsa_ven_amd_aqlprofile_profile_t{
|
||||
gpu_agent, HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC, events, event_count, params, 0, NULL, NULL};
|
||||
#pragma GCC diagnostic pop
|
||||
gpu_agent, HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC, events, event_count, params, 0, 0, 0};
|
||||
|
||||
// Preparing and getting the sizes of the command and output buffers
|
||||
status = hsa_ven_amd_aqlprofile_start(profile, NULL);
|
||||
|
||||
Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle);
|
||||
size_t ag_list_count = 1;
|
||||
hsa_agent_t ag_list[ag_list_count];
|
||||
ag_list[0] = gpu_agent;
|
||||
|
||||
// Allocating Command Buffer
|
||||
status = HSA_STATUS_ERROR;
|
||||
size_t size = profile->command_buffer.size;
|
||||
profile->command_buffer.ptr = nullptr;
|
||||
if (size <= 0) return nullptr;
|
||||
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
|
||||
agentInfo.cpu_pool, size, 0, reinterpret_cast<void**>(&command_buffer));
|
||||
agentInfo.cpu_pool, size, 0, reinterpret_cast<void**>(&(profile->command_buffer.ptr)));
|
||||
// Both the CPU and GPU can access the memory
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
hsa_agent_t ag_list[1] = {gpu_agent};
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
|
||||
1, ag_list, NULL, command_buffer);
|
||||
ag_list_count, ag_list, NULL, profile->command_buffer.ptr);
|
||||
CHECK_HSA_STATUS("Error: GPU Agent can't have command buffer access", status);
|
||||
} else {
|
||||
profile->command_buffer.ptr = numa_alloc_onnode(
|
||||
profile->command_buffer.size,
|
||||
rocmtools::hsa_support::GetAgentInfo(agentInfo.getNearCpuAgent().handle).getNumaNode());
|
||||
if (profile->command_buffer.ptr != nullptr) {
|
||||
status = HSA_STATUS_SUCCESS;
|
||||
} else {
|
||||
CHECK_HSA_STATUS("Error: Allocating Command Buffer", status);
|
||||
}
|
||||
}
|
||||
profile->command_buffer.ptr = (status == HSA_STATUS_SUCCESS) ? command_buffer : nullptr;
|
||||
if (status != HSA_STATUS_SUCCESS) printf("Error: Allocating Command Buffer\n");
|
||||
|
||||
// Allocating Output Buffer
|
||||
status = HSA_STATUS_ERROR;
|
||||
size = profile->output_buffer.size;
|
||||
profile->output_buffer.ptr = nullptr;
|
||||
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
|
||||
agentInfo.kernarg_pool, size, 0, reinterpret_cast<void**>(&output_buffer));
|
||||
agentInfo.gpu_pool, size, 0, reinterpret_cast<void**>(&(profile->output_buffer.ptr)));
|
||||
CHECK_HSA_STATUS("Error: Can't Allocate Output Buffer", status);
|
||||
// Both the CPU and GPU can access the kernel arguments
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
hsa_agent_t ag_list[1] = {gpu_agent};
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
|
||||
1, ag_list, NULL, output_buffer);
|
||||
}
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
profile->output_buffer.ptr = output_buffer;
|
||||
ag_list_count, ag_list, NULL, profile->output_buffer.ptr);
|
||||
CHECK_HSA_STATUS("Error: Can't allow access on the Output Buffer for the GPU", status);
|
||||
memset(profile->output_buffer.ptr, 0x0, profile->output_buffer.size);
|
||||
} else {
|
||||
profile->output_buffer.ptr = nullptr;
|
||||
}
|
||||
|
||||
|
||||
// Creating the start/stop/read packets
|
||||
status = hsa_ven_amd_aqlprofile_start(profile, start_packet);
|
||||
CHECK_HSA_STATUS("Error: Creating Start Packet\n", status);
|
||||
status = hsa_ven_amd_aqlprofile_stop(profile, stop_packet);
|
||||
CHECK_HSA_STATUS("Error: Creating Stop Packet\n", status);
|
||||
status = hsa_ven_amd_aqlprofile_read(profile, read_packet);
|
||||
CHECK_HSA_STATUS("Error: Creating Read Packet\n", status);
|
||||
|
||||
if (status == HSA_STATUS_ERROR) return nullptr;
|
||||
return profile;
|
||||
@@ -479,17 +473,19 @@ bool g_output_buffer_local = true;
|
||||
|
||||
// Allocate system memory accessible by both CPU and GPU.
//
// gpu_agent: agent that is granted access to the new allocation.
// size:      requested byte count; rounded up below using MEM_PAGE_MASK
//            (presumably page size - 1 -- TODO confirm against its definition).
// cpu_pool:  memory pool the buffer is allocated from (dereferenced, must not be null).
// Returns the allocated buffer, or NULL when allocation or the access grant
// did not return HSA_STATUS_SUCCESS.
//
// NOTE(review): this span contains both the direct hsa_amd_* calls and the
// GetAmdExtTable() equivalents, plus two declarations of ag_list. They look
// like the before/after lines of one change shown together -- confirm that
// only one pair is kept, since running both allocations would overwrite
// `buffer` and drop the first allocation.
|
||||
uint8_t* AllocateSysMemory(hsa_agent_t gpu_agent, size_t size, hsa_amd_memory_pool_t* cpu_pool) {
|
||||
size_t ag_list_count = 1; // rocmtools::hsa_support::GetCPUAgentList().size();
|
||||
hsa_agent_t ag_list[ag_list_count];
|
||||
ag_list[0] = gpu_agent;
|
||||
hsa_status_t status = HSA_STATUS_ERROR;
|
||||
uint8_t* buffer = NULL;
|
||||
// Round the request up to the next multiple of (MEM_PAGE_MASK + 1).
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
|
||||
// if (!cpu_agents_.empty()) {
|
||||
status = hsa_amd_memory_pool_allocate(*cpu_pool, size, 0, reinterpret_cast<void**>(&buffer));
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
|
||||
*cpu_pool, size, 0, reinterpret_cast<void**>(&buffer));
|
||||
// Both the CPU and GPU can access the memory
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
hsa_agent_t ag_list[1] = {gpu_agent};
|
||||
status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer);
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn(
|
||||
ag_list_count, ag_list, NULL, buffer);
|
||||
}
|
||||
// }
|
||||
// Only hand the buffer back when every step above reported success.
uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
|
||||
return ptr;
|
||||
}
|
||||
@@ -542,18 +538,6 @@ att_mem_pools_map_t* GetAttMemPoolsMap() {
|
||||
return agent_att_mem_pools_map;
|
||||
}
|
||||
|
||||
|
||||
// Looks up the att_memory_pools_t registered for `gpu_agent` in the map
// returned by GetAttMemPoolsMap(), keyed by the agent handle.
// Returns the stored pointer, or nullptr (after printing an error) when the
// handle has no entry.
att_memory_pools_t* GetAttMemPools(hsa_agent_t gpu_agent) {
  const auto entry = GetAttMemPoolsMap()->find(gpu_agent.handle);
  if (entry == GetAttMemPoolsMap()->end()) {
    // Unknown agent: report it and let the caller deal with the null result.
    printf("Error: att_memory_pools_t instance not found for given gpu agent handle: %lu\n",
           gpu_agent.handle);
    return nullptr;
  }
  return entry->second;
}
|
||||
|
||||
// Generate start and stop packets for collecting ATT traces
|
||||
// Also generate and return the profile object which has the PM4
|
||||
// command buffer and the output buffer for retrieving the traces
|
||||
@@ -573,7 +557,7 @@ hsa_ven_amd_aqlprofile_profile_t* GenerateATTPackets(
|
||||
|
||||
// Check the profile buffer sizes
|
||||
hsa_status_t status = hsa_ven_amd_aqlprofile_start(profile, NULL);
|
||||
if (status != HSA_STATUS_SUCCESS) printf("Error: aqlprofile_start(NULL)");
|
||||
CHECK_HSA_STATUS("Error: Getting PM4 Start Packet", status);
|
||||
// TODO: create a separate class for memory allocations
|
||||
// Maintain pools per device
|
||||
// handle allocation and resource cleanup
|
||||
@@ -582,14 +566,13 @@ hsa_ven_amd_aqlprofile_profile_t* GenerateATTPackets(
|
||||
// command buffer -> from CPU memory pool
|
||||
// output buffer -> from GPU memory pool
|
||||
status = Allocate(gpu_agent, profile);
|
||||
if (status != HSA_STATUS_SUCCESS) printf("Error: Allocate()");
|
||||
CHECK_HSA_STATUS("Error: Att Buffers Allocation", status);
|
||||
|
||||
// Generate start/stop/read profiling packets
|
||||
status = hsa_ven_amd_aqlprofile_start(profile, start_packet);
|
||||
if (status != HSA_STATUS_SUCCESS) printf("Error: aqlprofile_start");
|
||||
CHECK_HSA_STATUS("Error: Creating Start PM4 Packet", status);
|
||||
status = hsa_ven_amd_aqlprofile_stop(profile, stop_packet);
|
||||
if (status != HSA_STATUS_SUCCESS) printf("Error: aqlprofile_stop");
|
||||
if (status == HSA_STATUS_ERROR) return nullptr;
|
||||
CHECK_HSA_STATUS("Error: Creating Stop PM4 Packet", status);
|
||||
return profile;
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ namespace Packet {
|
||||
|
||||
typedef hsa_ext_amd_aql_pm4_packet_t packet_t;
|
||||
|
||||
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>*
|
||||
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>
|
||||
InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent,
|
||||
std::vector<std::string>& counter_names, bool is_spm = false);
|
||||
uint8_t* AllocateSysMemory(hsa_agent_t gpu_agent, size_t size, hsa_amd_memory_pool_t* cpu_pool);
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <algorithm>
|
||||
#include <numa.h>
|
||||
|
||||
#include "rocprofiler.h"
|
||||
#include "src/api/rocmtool.h"
|
||||
@@ -33,6 +34,21 @@
|
||||
#include "src/core/hsa/hsa_support.h"
|
||||
#include "src/utils/helper.h"
|
||||
|
||||
// CHECK_HSA_STATUS(msg, status)
//
// Aborts the process when `status` is neither HSA_STATUS_SUCCESS nor
// HSA_STATUS_INFO_BREAK. Before aborting it prints `msg` followed by the text
// obtained from hsa_status_string() (or "<Unknown HSA Error>" when no text is
// provided) to std::cerr.
//
// `status` is evaluated exactly once and stored in a local, so passing an
// expression with side effects (for example a direct HSA call) is safe.
// The try/catch only guards the diagnostic output; abort() is reached either way.
#define CHECK_HSA_STATUS(msg, status)                                                    \
  do {                                                                                   \
    const auto check_hsa_status_value_ = (status);                                       \
    if (check_hsa_status_value_ != HSA_STATUS_SUCCESS &&                                 \
        check_hsa_status_value_ != HSA_STATUS_INFO_BREAK) {                              \
      try {                                                                              \
        const char* emsg = nullptr;                                                      \
        hsa_status_string(check_hsa_status_value_, &emsg);                               \
        if (!emsg) emsg = "<Unknown HSA Error>";                                         \
        std::cerr << msg << std::endl;                                                   \
        std::cerr << emsg << std::endl;                                                  \
      } catch (const std::exception&) {                                                  \
      }                                                                                  \
      abort();                                                                           \
    }                                                                                    \
  } while (0)
|
||||
|
||||
#define __NR_gettid 186
|
||||
#define MAX_ATT_PROFILES 16
|
||||
|
||||
@@ -290,27 +306,41 @@ hsa_status_t attTraceDataCallback(hsa_ven_amd_aqlprofile_info_type_t info_type,
|
||||
return status;
|
||||
}
|
||||
|
||||
void AddRecordCounters(rocprofiler_record_profiler_t* record, const pending_signal_t& pending) {
|
||||
std::vector<rocprofiler_record_counter_instance_t> counters_vec;
|
||||
for (size_t i = 0; i < pending.context->metrics_list.size(); i++) {
|
||||
const rocmtools::Metric* metric = pending.context->metrics_list[i];
|
||||
void AddRecordCounters(rocprofiler_record_profiler_t* record, const pending_signal_t* pending) {
|
||||
record->counters_count =
|
||||
rocprofiler_record_counters_instances_count_t{pending->context->metrics_list.size()};
|
||||
size_t counters_list_size =
|
||||
record->counters_count.value * sizeof(rocprofiler_record_counter_instance_t);
|
||||
rocprofiler_record_counter_instance_t* counters =
|
||||
static_cast<rocprofiler_record_counter_instance_t*>(malloc(counters_list_size));
|
||||
for (size_t i = 0; i < pending->context->metrics_list.size(); i++) {
|
||||
const rocmtools::Metric* metric = pending->context->metrics_list[i];
|
||||
double value = 0;
|
||||
std::string metric_name = metric->GetName();
|
||||
auto it = pending.context->results_map.find(metric_name);
|
||||
if (it != pending.context->results_map.end()) {
|
||||
auto it = pending->context->results_map.find(metric_name);
|
||||
if (it != pending->context->results_map.end()) {
|
||||
value = it->second->val_double;
|
||||
}
|
||||
counters_vec.emplace_back(rocprofiler_record_counter_instance_t{
|
||||
counters[i] = (rocprofiler_record_counter_instance_t{
|
||||
// TODO(aelwazir): Moving to span once C++20 is adopted, strdup can be
|
||||
// removed after that
|
||||
rocprofiler_counter_id_t{rocmtools::profiler::GetCounterID(metric_name)},
|
||||
rocprofiler_record_counter_value_t{value}});
|
||||
}
|
||||
record->counters = static_cast<rocprofiler_record_counter_instance_t*>(
|
||||
malloc(counters_vec.size() * sizeof(rocprofiler_record_counter_instance_t)));
|
||||
::memcpy(record->counters, &(counters_vec)[0],
|
||||
counters_vec.size() * sizeof(rocprofiler_record_counter_instance_t));
|
||||
record->counters_count = rocprofiler_record_counters_instances_count_t{counters_vec.size()};
|
||||
record->counters = counters;
|
||||
rocmtools::Session* session = GetROCMToolObj()->GetSession(pending->session_id);
|
||||
void* initial_handle = const_cast<rocprofiler_record_counter_instance_t*>(record->counters);
|
||||
if (session->FindBuffer(pending->buffer_id)) {
|
||||
Memory::GenericBuffer* buffer = session->GetBuffer(pending->buffer_id);
|
||||
buffer->AddRecord(*record, record->counters, counters_list_size,
|
||||
[initial_handle](auto& record, const void* data) {
|
||||
if (record.counters == initial_handle && data != initial_handle) {
|
||||
free(initial_handle);
|
||||
}
|
||||
record.counters =
|
||||
static_cast<const rocprofiler_record_counter_instance_t*>(data);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void AddAttRecord(rocprofiler_record_att_tracer_t* record, hsa_agent_t gpu_agent,
|
||||
@@ -330,7 +360,6 @@ void AddAttRecord(rocprofiler_record_att_tracer_t* record, hsa_agent_t gpu_agent
|
||||
for (trace_data_it = data.begin(); trace_data_it != data.end(); trace_data_it++) {
|
||||
const void* data_ptr = trace_data_it->trace_data.ptr;
|
||||
const uint32_t data_size = trace_data_it->trace_data.size;
|
||||
// fprintf(arg->file, " SE(%u) size(%u)\n", data.sample_id, data_size);
|
||||
|
||||
void* buffer = NULL;
|
||||
if (data_size != 0) {
|
||||
@@ -359,23 +388,22 @@ bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) {
|
||||
!GetROCMToolObj()->GetSession(queue_info_session->session_id)->GetProfiler())
|
||||
return true;
|
||||
rocmtools::Session* session = GetROCMToolObj()->GetSession(queue_info_session->session_id);
|
||||
std::lock_guard<std::mutex> lock(session->GetSessionLock());
|
||||
rocmtools::profiler::Profiler* profiler = session->GetProfiler();
|
||||
std::vector<pending_signal_t>& pending_signals = const_cast<std::vector<pending_signal_t>&>(
|
||||
std::vector<pending_signal_t*> pending_signals = const_cast<std::vector<pending_signal_t*>&>(
|
||||
profiler->GetPendingSignals(queue_info_session->writer_id));
|
||||
|
||||
if (!pending_signals.empty()) {
|
||||
for (auto it = pending_signals.begin(); it != pending_signals.end();
|
||||
it = pending_signals.erase(it)) {
|
||||
auto& pending = *it;
|
||||
std::lock_guard<std::mutex> lock(session->GetSessionLock());
|
||||
if (hsa_support::GetCoreApiTable().hsa_signal_load_relaxed_fn(pending.signal)) return true;
|
||||
if (hsa_support::GetCoreApiTable().hsa_signal_load_relaxed_fn(pending->signal)) return true;
|
||||
hsa_amd_profiling_dispatch_time_t time;
|
||||
hsa_support::GetAmdExtTable().hsa_amd_profiling_get_dispatch_time_fn(
|
||||
queue_info_session->agent, pending.signal, &time);
|
||||
queue_info_session->agent, pending->signal, &time);
|
||||
uint32_t record_count = 1;
|
||||
bool is_individual_xcc_mode = false;
|
||||
uint32_t xcc_count =
|
||||
hsa_support::GetAgentInfo(queue_info_session->agent.handle).getXccCount();
|
||||
uint32_t xcc_count = queue_info_session->xcc_count;
|
||||
if (xcc_count > 1) { // for MI300
|
||||
const char* str = getenv("ROCPROFILER_INDIVIDUAL_XCC_MODE");
|
||||
if (str != NULL) is_individual_xcc_mode = (atol(str) > 0);
|
||||
@@ -387,70 +415,62 @@ bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) {
|
||||
rocprofiler_record_profiler_t record{};
|
||||
// TODO: (sauverma) gpu-id will need to support xcc like so- 1.1, 1.2, 1.3 ... 1.5 for
|
||||
// different xcc
|
||||
record.gpu_id = rocprofiler_agent_id_t{
|
||||
(uint64_t)hsa_support::GetAgentInfo(queue_info_session->agent.handle).getIndex()};
|
||||
record.kernel_properties = pending.kernel_properties;
|
||||
record.thread_id = rocprofiler_thread_id_t{pending.thread_id};
|
||||
record.queue_idx = rocprofiler_queue_index_t{pending.queue_index};
|
||||
record.gpu_id = rocprofiler_agent_id_t{(uint64_t)queue_info_session->gpu_index};
|
||||
record.kernel_properties = pending->kernel_properties;
|
||||
record.thread_id = rocprofiler_thread_id_t{pending->thread_id};
|
||||
record.queue_idx = rocprofiler_queue_index_t{pending->queue_index};
|
||||
record.timestamps = rocprofiler_record_header_timestamp_t{time.start, time.end};
|
||||
record.queue_id = rocprofiler_queue_id_t{queue_info_session->queue_id};
|
||||
if (pending.counters_count > 0 && pending.context->metrics_list.size() > 0 &&
|
||||
pending.profile) {
|
||||
if (xcc_id == 0) // call to GetCounterData() is required only once for a dispatch
|
||||
rocmtools::metrics::GetCounterData(pending.profile, queue_info_session->agent,
|
||||
pending.context->results_list);
|
||||
if (is_individual_xcc_mode)
|
||||
rocmtools::metrics::GetCountersAndMetricResultsByXcc(
|
||||
xcc_id, pending.context->results_list, pending.context->results_map,
|
||||
pending.context->metrics_list);
|
||||
else
|
||||
rocmtools::metrics::GetMetricsData(pending.context->results_map,
|
||||
pending.context->metrics_list);
|
||||
AddRecordCounters(&record, pending);
|
||||
}
|
||||
// Kernel Descriptor is the right record id generated in the WriteInterceptor function and
|
||||
// will be used to handle the kernel name of that dispatch
|
||||
record.header = {ROCPROFILER_PROFILER_RECORD,
|
||||
rocprofiler_record_id_t{pending.kernel_descriptor}};
|
||||
record.kernel_id = rocprofiler_kernel_id_t{pending.kernel_descriptor};
|
||||
record.header = rocprofiler_record_header_t{
|
||||
ROCPROFILER_PROFILER_RECORD, rocprofiler_record_id_t{pending->kernel_descriptor}};
|
||||
record.kernel_id = rocprofiler_kernel_id_t{pending->kernel_descriptor};
|
||||
record.correlation_id = rocprofiler_correlation_id_t{pending->correlation_id};
|
||||
|
||||
if (pending.session_id.handle == 0) {
|
||||
pending.session_id = GetROCMToolObj()->GetCurrentSessionId();
|
||||
if (pending->session_id.handle == 0) {
|
||||
pending->session_id = GetROCMToolObj()->GetCurrentSessionId();
|
||||
}
|
||||
if (session->FindBuffer(pending.buffer_id)) {
|
||||
Memory::GenericBuffer* buffer = session->GetBuffer(pending.buffer_id);
|
||||
if (pending.profile && pending.counters_count > 0) {
|
||||
rocprofiler_record_counter_instance_t* record_counters = record.counters;
|
||||
buffer->AddRecord(
|
||||
record, record.counters,
|
||||
(record.counters_count.value * (sizeof(rocprofiler_record_counter_instance_t) + 1)),
|
||||
[](auto& record, const void* data) {
|
||||
record.counters = const_cast<rocprofiler_record_counter_instance_t*>(
|
||||
static_cast<const rocprofiler_record_counter_instance_t*>(data));
|
||||
});
|
||||
free(record_counters);
|
||||
} else {
|
||||
if (pending->counters_count > 0 && pending->context->metrics_list.size() > 0 &&
|
||||
pending->profile) {
|
||||
if (xcc_id == 0) // call to GetCounterData() is required only once for a dispatch
|
||||
rocmtools::metrics::GetCounterData(pending->profile, queue_info_session->agent,
|
||||
pending->context->results_list);
|
||||
if (is_individual_xcc_mode)
|
||||
rocmtools::metrics::GetCountersAndMetricResultsByXcc(
|
||||
xcc_id, pending->context->results_list, pending->context->results_map,
|
||||
pending->context->metrics_list);
|
||||
else
|
||||
rocmtools::metrics::GetMetricsData(pending->context->results_map,
|
||||
pending->context->metrics_list);
|
||||
AddRecordCounters(&record, pending);
|
||||
} else {
|
||||
if (session->FindBuffer(pending->buffer_id)) {
|
||||
Memory::GenericBuffer* buffer = session->GetBuffer(pending->buffer_id);
|
||||
buffer->AddRecord(record);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (pending.counters_count > 0 && pending.profile && pending.profile->events) {
|
||||
if (pending->counters_count > 0 && pending->profile && pending->profile->events) {
|
||||
// TODO(aelwazir): we need a better way of distributing events and free them
|
||||
// free(const_cast<hsa_ven_amd_aqlprofile_event_t*>(pending.profile->events));
|
||||
// if (pending->profile->output_buffer.ptr)
|
||||
// numa_free(pending->profile->output_buffer.ptr, pending->profile->output_buffer.size);
|
||||
hsa_status_t status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_free_fn(
|
||||
(pending.profile->output_buffer.ptr));
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
printf("Error: Couldn't free output buffer memory\n");
|
||||
}
|
||||
(pending->profile->output_buffer.ptr));
|
||||
CHECK_HSA_STATUS("Error: Couldn't free output buffer memory", status);
|
||||
// if (pending->profile->command_buffer.ptr)
|
||||
// numa_free(pending->profile->command_buffer.ptr, pending->profile->command_buffer.size);
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_free_fn(
|
||||
(pending.profile->command_buffer.ptr));
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
printf("Error: Couldn't free command buffer memory\n");
|
||||
(pending->profile->command_buffer.ptr));
|
||||
CHECK_HSA_STATUS("Error: Couldn't free command buffer memory", status);
|
||||
delete pending->profile;
|
||||
for (auto& it : pending->context->results_map) {
|
||||
delete it.second;
|
||||
}
|
||||
delete pending.profile;
|
||||
delete pending->context;
|
||||
}
|
||||
if (pending.signal.handle)
|
||||
hsa_support::GetCoreApiTable().hsa_signal_destroy_fn(pending.signal);
|
||||
if (pending->signal.handle)
|
||||
hsa_support::GetCoreApiTable().hsa_signal_destroy_fn(pending->signal);
|
||||
if (queue_info_session->interrupt_signal.handle)
|
||||
hsa_support::GetCoreApiTable().hsa_signal_destroy_fn(queue_info_session->interrupt_signal);
|
||||
}
|
||||
@@ -483,8 +503,7 @@ bool AsyncSignalHandlerATT(hsa_signal_value_t /* signal */, void* data) {
|
||||
if (hsa_support::GetCoreApiTable().hsa_signal_load_relaxed_fn(pending.signal)) return true;
|
||||
rocprofiler_record_att_tracer_t record{};
|
||||
record.kernel_id = rocprofiler_kernel_id_t{pending.kernel_descriptor};
|
||||
record.gpu_id = rocprofiler_agent_id_t{
|
||||
(uint64_t)hsa_support::GetAgentInfo(queue_info_session->agent.handle).getIndex()};
|
||||
record.gpu_id = rocprofiler_agent_id_t{(uint64_t)queue_info_session->gpu_index};
|
||||
record.kernel_properties = pending.kernel_properties;
|
||||
record.thread_id = rocprofiler_thread_id_t{pending.thread_id};
|
||||
record.queue_idx = rocprofiler_queue_index_t{pending.queue_index};
|
||||
@@ -506,14 +525,10 @@ bool AsyncSignalHandlerATT(hsa_signal_value_t /* signal */, void* data) {
|
||||
}
|
||||
hsa_status_t status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_free_fn(
|
||||
(pending.profile->output_buffer.ptr));
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
printf("Error: Couldn't free output buffer memory\n");
|
||||
}
|
||||
CHECK_HSA_STATUS("Error: Couldn't free output buffer memory", status);
|
||||
status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_free_fn(
|
||||
(pending.profile->command_buffer.ptr));
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
printf("Error: Couldn't free command buffer memory\n");
|
||||
}
|
||||
CHECK_HSA_STATUS("Error: Couldn't free command buffer memory", status);
|
||||
delete pending.profile;
|
||||
}
|
||||
}
|
||||
@@ -540,19 +555,19 @@ void AddVendorSpecificPacket(const Packet::packet_t* packet,
|
||||
// Registers AsyncSignalHandler through hsa_amd_signal_async_handler_fn for
// `signal`, with condition HSA_SIGNAL_CONDITION_EQ and compare value 0; `data`
// is forwarded unchanged as the handler argument.
// NOTE(review): two failure checks appear back to back (fatal() and
// CHECK_HSA_STATUS) -- presumably the before/after lines of one change;
// confirm which one is meant to remain.
void SignalAsyncHandler(const hsa_signal_t& signal, void* data) {
|
||||
hsa_status_t status = hsa_support::GetAmdExtTable().hsa_amd_signal_async_handler_fn(
|
||||
signal, HSA_SIGNAL_CONDITION_EQ, 0, AsyncSignalHandler, data);
|
||||
if (status != HSA_STATUS_SUCCESS) fatal("hsa_amd_signal_async_handler failed");
|
||||
CHECK_HSA_STATUS("Error: hsa_amd_signal_async_handler failed", status);
|
||||
}
|
||||
|
||||
// Registers AsyncSignalHandlerATT through hsa_amd_signal_async_handler_fn for
// `signal`, with condition HSA_SIGNAL_CONDITION_EQ and compare value 0; `data`
// is forwarded unchanged as the handler argument.
// NOTE(review): two failure checks appear back to back (fatal() and
// CHECK_HSA_STATUS) -- presumably the before/after lines of one change;
// confirm which one is meant to remain.
void signalAsyncHandlerATT(const hsa_signal_t& signal, void* data) {
|
||||
hsa_status_t status = hsa_support::GetAmdExtTable().hsa_amd_signal_async_handler_fn(
|
||||
signal, HSA_SIGNAL_CONDITION_EQ, 0, AsyncSignalHandlerATT, data);
|
||||
if (status != HSA_STATUS_SUCCESS) fatal("hsa_amd_signal_async_handler failed");
|
||||
CHECK_HSA_STATUS("Error: hsa_amd_signal_async_handler for ATT failed", status);
|
||||
}
|
||||
|
||||
// Creates an HSA signal via hsa_amd_signal_create_fn and stores it in *signal.
// The call passes 1, 0 and nullptr ahead of `attribute`; per the
// hsa_amd_signal_create API these are presumably the initial value and an
// empty consumer list -- TODO confirm against the HSA AMD extension header.
// NOTE(review): two failure checks appear back to back (fatal() and
// CHECK_HSA_STATUS) -- presumably the before/after lines of one change;
// confirm which one is meant to remain.
void CreateSignal(uint32_t attribute, hsa_signal_t* signal) {
|
||||
hsa_status_t status =
|
||||
hsa_support::GetAmdExtTable().hsa_amd_signal_create_fn(1, 0, nullptr, attribute, signal);
|
||||
if (status != HSA_STATUS_SUCCESS) fatal("hsa_amd_signal_create failed");
|
||||
CHECK_HSA_STATUS("Error: hsa_amd_signal_create failed", status);
|
||||
}
|
||||
|
||||
template <typename Integral = uint64_t> constexpr Integral bit_mask(int first, int last) {
|
||||
@@ -659,13 +674,13 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt
|
||||
is_pc_sampling_collection_mode) &&
|
||||
session) {
|
||||
// Getting Queue Data and Information
|
||||
auto& queue_info = *static_cast<Queue*>(data);
|
||||
Queue& queue_info = *reinterpret_cast<Queue*>(data);
|
||||
std::lock_guard<std::mutex> lk(queue_info.qw_mutex);
|
||||
|
||||
|
||||
// hsa_ven_amd_aqlprofile_profile_t* profile;
|
||||
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>*
|
||||
profiles = nullptr;
|
||||
std::vector<std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*>>
|
||||
profiles;
|
||||
|
||||
|
||||
// Searching across all the packets given during this write
|
||||
@@ -686,106 +701,109 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt
|
||||
// Get the PM4 Packets using packets_generator
|
||||
profiles = Packet::InitializeAqlPackets(queue_info.GetCPUAgent(), queue_info.GetGPUAgent(),
|
||||
session_data);
|
||||
replay_mode_count = profiles->size();
|
||||
replay_mode_count = profiles.size();
|
||||
}
|
||||
|
||||
uint32_t profile_id = 0;
|
||||
hsa_signal_t interrupt_signal;
|
||||
do {
|
||||
std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*> profile;
|
||||
if (profiles && replay_mode_count > 0) profile = profiles->at(profile_id);
|
||||
// do {
|
||||
std::pair<rocmtools::profiling_context_t*, hsa_ven_amd_aqlprofile_profile_t*> profile;
|
||||
if (profiles.size() > 0 && replay_mode_count > 0) profile = profiles.at(profile_id);
|
||||
|
||||
uint32_t writer_id = WRITER_ID.fetch_add(1, std::memory_order_release);
|
||||
uint32_t writer_id = WRITER_ID.fetch_add(1, std::memory_order_release);
|
||||
|
||||
if (session_data_count > 0 && is_counter_collection_mode && profiles &&
|
||||
replay_mode_count > 0) {
|
||||
// Adding start packet and its barrier with a dummy signal
|
||||
hsa_signal_t dummy_signal{};
|
||||
dummy_signal.handle = 0;
|
||||
profile.first->start_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
|
||||
<< HSA_PACKET_HEADER_TYPE;
|
||||
AddVendorSpecificPacket(profile.first->start_packet, &transformed_packets, dummy_signal);
|
||||
if (session_data_count > 0 && is_counter_collection_mode && profiles.size() > 0 &&
|
||||
replay_mode_count > 0) {
|
||||
// Adding start packet and its barrier with a dummy signal
|
||||
hsa_signal_t dummy_signal{};
|
||||
dummy_signal.handle = 0;
|
||||
profile.first->start_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
|
||||
<< HSA_PACKET_HEADER_TYPE;
|
||||
AddVendorSpecificPacket(profile.first->start_packet, &transformed_packets, dummy_signal);
|
||||
|
||||
CreateBarrierPacket(profile.first->start_packet->completion_signal, &transformed_packets);
|
||||
}
|
||||
CreateBarrierPacket(profile.first->start_packet->completion_signal, &transformed_packets);
|
||||
}
|
||||
|
||||
auto& packet = transformed_packets.emplace_back(packets_arr[i]);
|
||||
auto& dispatch_packet = reinterpret_cast<hsa_kernel_dispatch_packet_t&>(packet);
|
||||
auto& packet = transformed_packets.emplace_back(packets_arr[i]);
|
||||
auto& dispatch_packet = reinterpret_cast<hsa_kernel_dispatch_packet_t&>(packet);
|
||||
|
||||
/*
|
||||
* Only PC sampling relies on this right now, so it would be better to
|
||||
* only generate an ID if PC sampling is active to conserve IDs, but it's
|
||||
* unlikely 64 bits' worth of identifiers will be exhausted during the
|
||||
* lifetime of the ROCMToolObj.
|
||||
*/
|
||||
dispatch_packet.reserved2 = GetROCMToolObj()->GetUniqueKernelDispatchId();
|
||||
/*
|
||||
* Only PC sampling relies on this right now, so it would be better to
|
||||
* only generate an ID if PC sampling is active to conserve IDs, but it's
|
||||
* unlikely 64 bits' worth of identifiers will be exhausted during the
|
||||
* lifetime of the ROCMToolObj.
|
||||
*/
|
||||
uint64_t correlation_id = dispatch_packet.reserved2;
|
||||
// dispatch_packet.reserved2 = GetROCMToolObj()->GetUniqueKernelDispatchId();
|
||||
|
||||
CreateSignal(HSA_AMD_SIGNAL_AMD_GPU_ONLY, &packet.completion_signal);
|
||||
// Adding the dispatch packet newly created signal to the pending signals
|
||||
// list to be processed by the signal interrupt
|
||||
rocprofiler_kernel_properties_t kernel_properties =
|
||||
set_kernel_properties(dispatch_packet, queue_info.GetGPUAgent());
|
||||
if (session) {
|
||||
uint64_t record_id = GetROCMToolObj()->GetUniqueRecordId();
|
||||
AddKernelNameWithDispatchID(GetKernelNameFromKsymbols(dispatch_packet.kernel_object),
|
||||
record_id);
|
||||
if (profiles && replay_mode_count > 0) {
|
||||
session->GetProfiler()->AddPendingSignals(
|
||||
writer_id, record_id, dispatch_packet.completion_signal, session_id, buffer_id,
|
||||
profile.first, profile.first->metrics_list.size(), profile.second,
|
||||
kernel_properties, (uint32_t)syscall(__NR_gettid), user_pkt_index);
|
||||
} else {
|
||||
session->GetProfiler()->AddPendingSignals(
|
||||
writer_id, record_id, dispatch_packet.completion_signal, session_id, buffer_id,
|
||||
nullptr, 0, nullptr, kernel_properties, (uint32_t)syscall(__NR_gettid),
|
||||
user_pkt_index);
|
||||
}
|
||||
}
|
||||
|
||||
// Make a copy of the original packet, adding its signal to a barrier
|
||||
// packet and create a new signal for it to get timestamps
|
||||
if (original_packet.completion_signal.handle) {
|
||||
hsa_barrier_and_packet_t barrier{0};
|
||||
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
|
||||
Packet::packet_t* __attribute__((__may_alias__)) pkt =
|
||||
(reinterpret_cast<Packet::packet_t*>(&barrier));
|
||||
transformed_packets.emplace_back(*pkt).completion_signal =
|
||||
original_packet.completion_signal;
|
||||
}
|
||||
|
||||
// Adding a barrier packet with the original packet's completion signal.
|
||||
CreateSignal(0, &interrupt_signal);
|
||||
|
||||
// Adding Stop and Read PM4 Packets
|
||||
if (session_data_count > 0 && is_counter_collection_mode && profiles &&
|
||||
replay_mode_count > 0) {
|
||||
hsa_signal_t dummy_signal{};
|
||||
profile.first->stop_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
|
||||
<< HSA_PACKET_HEADER_TYPE;
|
||||
AddVendorSpecificPacket(profile.first->stop_packet, &transformed_packets, dummy_signal);
|
||||
profile.first->read_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
|
||||
<< HSA_PACKET_HEADER_TYPE;
|
||||
AddVendorSpecificPacket(profile.first->read_packet, &transformed_packets,
|
||||
interrupt_signal);
|
||||
|
||||
// Added Interrupt Signal with barrier and provided handler for it
|
||||
CreateBarrierPacket(interrupt_signal, &transformed_packets);
|
||||
CreateSignal(HSA_AMD_SIGNAL_AMD_GPU_ONLY, &packet.completion_signal);
|
||||
// Adding the dispatch packet newly created signal to the pending signals
|
||||
// list to be processed by the signal interrupt
|
||||
rocprofiler_kernel_properties_t kernel_properties =
|
||||
set_kernel_properties(dispatch_packet, queue_info.GetGPUAgent());
|
||||
if (session) {
|
||||
uint64_t record_id = GetROCMToolObj()->GetUniqueRecordId();
|
||||
AddKernelNameWithDispatchID(GetKernelNameFromKsymbols(dispatch_packet.kernel_object),
|
||||
record_id);
|
||||
if (profiles.size() > 0 && replay_mode_count > 0) {
|
||||
session->GetProfiler()->AddPendingSignals(
|
||||
writer_id, record_id, dispatch_packet.completion_signal, session_id, buffer_id,
|
||||
profile.first, profile.first->metrics_list.size(), profile.second, kernel_properties,
|
||||
(uint32_t)syscall(__NR_gettid), user_pkt_index, correlation_id);
|
||||
} else {
|
||||
hsa_barrier_and_packet_t barrier{0};
|
||||
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
|
||||
barrier.completion_signal = interrupt_signal;
|
||||
Packet::packet_t* __attribute__((__may_alias__)) pkt =
|
||||
(reinterpret_cast<Packet::packet_t*>(&barrier));
|
||||
transformed_packets.emplace_back(*pkt);
|
||||
session->GetProfiler()->AddPendingSignals(
|
||||
writer_id, record_id, dispatch_packet.completion_signal, session_id, buffer_id,
|
||||
nullptr, 0, nullptr, kernel_properties, (uint32_t)syscall(__NR_gettid),
|
||||
user_pkt_index, correlation_id);
|
||||
}
|
||||
// Creating Async Handler to be called every time the interrupt signal is
|
||||
// marked complete
|
||||
SignalAsyncHandler(interrupt_signal,
|
||||
new queue_info_session_t{queue_info.GetGPUAgent(), session_id,
|
||||
queue_info.GetQueueID(), writer_id});
|
||||
ACTIVE_INTERRUPT_SIGNAL_COUNT.fetch_add(1, std::memory_order_relaxed);
|
||||
profile_id++;
|
||||
} while (replay_mode_count > 0 && profile_id < replay_mode_count); // Profiles loop end
|
||||
}
|
||||
|
||||
// Make a copy of the original packet, adding its signal to a barrier
|
||||
// packet and create a new signal for it to get timestamps
|
||||
if (original_packet.completion_signal.handle) {
|
||||
hsa_barrier_and_packet_t barrier{0};
|
||||
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
|
||||
Packet::packet_t* __attribute__((__may_alias__)) pkt =
|
||||
(reinterpret_cast<Packet::packet_t*>(&barrier));
|
||||
transformed_packets.emplace_back(*pkt).completion_signal =
|
||||
original_packet.completion_signal;
|
||||
}
|
||||
|
||||
hsa_signal_t interrupt_signal{};
|
||||
// Adding a barrier packet with the original packet's completion signal.
|
||||
CreateSignal(0, &interrupt_signal);
|
||||
|
||||
// Adding Stop and Read PM4 Packets
|
||||
if (session_data_count > 0 && is_counter_collection_mode) {
|
||||
hsa_signal_t dummy_signal{};
|
||||
profile.first->stop_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
|
||||
<< HSA_PACKET_HEADER_TYPE;
|
||||
AddVendorSpecificPacket(profile.first->stop_packet, &transformed_packets, dummy_signal);
|
||||
profile.first->read_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
|
||||
<< HSA_PACKET_HEADER_TYPE;
|
||||
AddVendorSpecificPacket(profile.first->read_packet, &transformed_packets, interrupt_signal);
|
||||
|
||||
// Added Interrupt Signal with barrier and provided handler for it
|
||||
CreateBarrierPacket(interrupt_signal, &transformed_packets);
|
||||
} else {
|
||||
hsa_barrier_and_packet_t barrier{0};
|
||||
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
|
||||
barrier.completion_signal = interrupt_signal;
|
||||
Packet::packet_t* __attribute__((__may_alias__)) pkt =
|
||||
(reinterpret_cast<Packet::packet_t*>(&barrier));
|
||||
transformed_packets.emplace_back(*pkt);
|
||||
}
|
||||
Agent::AgentInfo& agentInfo =
|
||||
rocmtools::hsa_support::GetAgentInfo(queue_info.GetGPUAgent().handle);
|
||||
// Creating Async Handler to be called every time the interrupt signal is
|
||||
// marked complete
|
||||
SignalAsyncHandler(
|
||||
interrupt_signal,
|
||||
new queue_info_session_t{queue_info.GetGPUAgent(), session_id, queue_info.GetQueueID(),
|
||||
writer_id, interrupt_signal, agentInfo.getIndex(),
|
||||
agentInfo.getXccCount()});
|
||||
ACTIVE_INTERRUPT_SIGNAL_COUNT.fetch_add(1, std::memory_order_relaxed);
|
||||
// profile_id++;
|
||||
// } while (replay_mode_count > 0 && profile_id < replay_mode_count); // Profiles loop end
|
||||
}
|
||||
/* Write the transformed packets to the hardware queue. */
|
||||
writer(&transformed_packets[0], transformed_packets.size());
|
||||
@@ -795,7 +813,7 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt
|
||||
// Getting Queue Data and Information
|
||||
auto& queue_info = *static_cast<Queue*>(data);
|
||||
std::lock_guard<std::mutex> lk(queue_info.qw_mutex);
|
||||
Agent::AgentInfo* agentInfo = &(hsa_support::GetAgentInfo(queue_info.GetGPUAgent().handle));
|
||||
Agent::AgentInfo agentInfo = hsa_support::GetAgentInfo(queue_info.GetGPUAgent().handle);
|
||||
|
||||
bool can_profile_anypacket = false;
|
||||
std::vector<bool> can_profile_packet;
|
||||
@@ -858,7 +876,7 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt
|
||||
}
|
||||
|
||||
if (att_counters_names.size() > 0) {
|
||||
MetricsDict* metrics_dict_ = MetricsDict::Create(agentInfo);
|
||||
MetricsDict* metrics_dict_ = MetricsDict::Create(&agentInfo);
|
||||
|
||||
for (const std::string& counter_name : att_counters_names) {
|
||||
const Metric* metric = metrics_dict_->Get(counter_name);
|
||||
@@ -1007,6 +1025,11 @@ Queue::Queue(const hsa_agent_t& cpu_agent, const hsa_agent_t& gpu_agent, uint32_
|
||||
*queue = intercept_queue_;
|
||||
}
|
||||
|
||||
Queue::~Queue() {
|
||||
while (ACTIVE_INTERRUPT_SIGNAL_COUNT.load(std::memory_order_acquire) > 0) {
|
||||
}
|
||||
}
|
||||
|
||||
hsa_queue_t* Queue::GetCurrentInterceptQueue() { return intercept_queue_; }
|
||||
|
||||
hsa_agent_t Queue::GetGPUAgent() { return gpu_agent_; }
|
||||
|
||||
@@ -56,7 +56,7 @@ class Queue {
|
||||
hsa_queue_type32_t type,
|
||||
void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data,
|
||||
uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t** queue);
|
||||
~Queue() {}
|
||||
~Queue();
|
||||
|
||||
hsa_queue_t* GetCurrentInterceptQueue();
|
||||
hsa_agent_t GetGPUAgent();
|
||||
@@ -82,6 +82,8 @@ struct queue_info_session_t {
|
||||
uint64_t queue_id;
|
||||
uint32_t writer_id;
|
||||
hsa_signal_t interrupt_signal;
|
||||
uint64_t gpu_index;
|
||||
uint32_t xcc_count;
|
||||
};
|
||||
|
||||
void AddRecordCounters(rocprofiler_record_profiler_t* record, const pending_signal_t& pending);
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
|
||||
#include "rocprofiler.h"
|
||||
#include "src/api/rocmtool.h"
|
||||
|
||||
namespace Memory {
|
||||
@@ -68,8 +69,8 @@ GenericBuffer::GenericBuffer(rocprofiler_session_id_t session_id, rocprofiler_bu
|
||||
GenericBuffer::~GenericBuffer() {
|
||||
if (is_valid_.load(std::memory_order_release)) {
|
||||
std::lock_guard lock(buffer_lock_);
|
||||
if (rocmtools::GetROCMToolObj()->GetSession(session_id_))
|
||||
rocmtools::GetROCMToolObj()->GetSession(session_id_)->DisableTools(id_);
|
||||
// if (rocmtools::GetROCMToolObj()->GetSession(session_id_))
|
||||
// rocmtools::GetROCMToolObj()->GetSession(session_id_)->DisableTools(id_);
|
||||
|
||||
Flush();
|
||||
|
||||
|
||||
@@ -75,8 +75,7 @@ class GenericBuffer {
|
||||
}
|
||||
|
||||
// Store data in the record. Copy the data first if it fits in the buffer
|
||||
// (reserve_data_size != 0).
|
||||
if (reserve_data_size) {
|
||||
if (reserve_data_size != 0) {
|
||||
data_ptr_ -= data_size;
|
||||
::memcpy(data_ptr_, data, data_size);
|
||||
store_data(record, data_ptr_);
|
||||
@@ -160,7 +159,8 @@ class GenericBuffer {
|
||||
std::mutex buffer_lock_;
|
||||
};
|
||||
|
||||
bool GetNextRecord(const rocprofiler_record_header_t* record, const rocprofiler_record_header_t** next);
|
||||
bool GetNextRecord(const rocprofiler_record_header_t* record,
|
||||
const rocprofiler_record_header_t** next);
|
||||
|
||||
} // namespace Memory
|
||||
#endif // SRC_CORE_MEMORY_GENERIC_BUFFER_H_
|
||||
|
||||
@@ -281,10 +281,12 @@ class MetricsDict {
|
||||
try {
|
||||
expr_obj = new xml::Expr(expr_str, new ExprCache(&cache_));
|
||||
} catch (const xml::exception_t& exc) {
|
||||
if (do_lookup)
|
||||
if (do_lookup) {
|
||||
metrics_list.push_back(node);
|
||||
else
|
||||
throw(exc);
|
||||
} else {
|
||||
std::cerr << "Error: " << exc.what() << std::endl;
|
||||
abort();
|
||||
}
|
||||
}
|
||||
if (expr_obj) {
|
||||
#if 0
|
||||
|
||||
@@ -50,8 +50,17 @@ uint64_t GetCounterID(std::string& counter_name) {
|
||||
|
||||
Profiler::Profiler(rocprofiler_buffer_id_t buffer_id, rocprofiler_filter_id_t filter_id,
|
||||
rocprofiler_session_id_t session_id)
|
||||
: buffer_id_(buffer_id), filter_id_(filter_id), session_id_(session_id) {}
|
||||
Profiler::~Profiler() {}
|
||||
: buffer_id_(buffer_id), filter_id_(filter_id), session_id_(session_id) {
|
||||
sessions_pending_signals_ = new std::map<uint32_t, std::vector<pending_signal_t*>>();
|
||||
}
|
||||
Profiler::~Profiler() {
|
||||
for (auto& [thread_id, pending_signals] : *sessions_pending_signals_) {
|
||||
for (auto& pending_signal : pending_signals) {
|
||||
delete pending_signal;
|
||||
}
|
||||
}
|
||||
delete sessions_pending_signals_;
|
||||
}
|
||||
|
||||
void Profiler::AddCounterName(rocprofiler_counter_id_t counter_id, std::string counter_name) {
|
||||
std::lock_guard<std::mutex> lock(counter_names_lock_);
|
||||
@@ -106,39 +115,39 @@ const char* Profiler::GetCounterInfo(rocprofiler_counter_info_kind_t kind,
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void Profiler::StartReplayPass(rocprofiler_session_id_t session_id) { warning("Not yet supported!"); }
|
||||
void Profiler::StartReplayPass(rocprofiler_session_id_t session_id) {
|
||||
warning("Not yet supported!");
|
||||
}
|
||||
void Profiler::EndReplayPass() { warning("Not yet supported!"); }
|
||||
bool Profiler::HasActivePass() {
|
||||
warning("Not yet supported!");
|
||||
return true;
|
||||
}
|
||||
|
||||
void Profiler::AddPendingSignals(uint32_t writer_id, uint64_t kernel_object,
|
||||
const hsa_signal_t& completion_signal,
|
||||
rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id,
|
||||
rocmtools::profiling_context_t* context,
|
||||
uint64_t session_data_count,
|
||||
hsa_ven_amd_aqlprofile_profile_t* profile,
|
||||
rocprofiler_kernel_properties_t kernel_properties,
|
||||
uint32_t thread_id, uint64_t queue_index) {
|
||||
void Profiler::AddPendingSignals(
|
||||
uint32_t writer_id, uint64_t kernel_object, const hsa_signal_t& completion_signal,
|
||||
rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id,
|
||||
rocmtools::profiling_context_t* context, uint64_t session_data_count,
|
||||
hsa_ven_amd_aqlprofile_profile_t* profile, rocprofiler_kernel_properties_t kernel_properties,
|
||||
uint32_t thread_id, uint64_t queue_index, uint64_t correlation_id) {
|
||||
std::lock_guard<std::mutex> lock(sessions_pending_signals_lock_);
|
||||
if (sessions_pending_signals_.find(writer_id) == sessions_pending_signals_.end())
|
||||
sessions_pending_signals_.emplace(writer_id, std::vector<pending_signal_t>());
|
||||
sessions_pending_signals_.at(writer_id).emplace_back(
|
||||
pending_signal_t{kernel_object, completion_signal, session_id_, buffer_id, context,
|
||||
session_data_count, profile, kernel_properties, thread_id, queue_index});
|
||||
if (sessions_pending_signals_->find(writer_id) == sessions_pending_signals_->end())
|
||||
sessions_pending_signals_->emplace(writer_id, std::vector<pending_signal_t*>());
|
||||
sessions_pending_signals_->at(writer_id).emplace_back(new pending_signal_t{
|
||||
kernel_object, completion_signal, session_id_, buffer_id, context, session_data_count,
|
||||
profile, kernel_properties, thread_id, queue_index, correlation_id});
|
||||
}
|
||||
|
||||
const std::vector<pending_signal_t>& Profiler::GetPendingSignals(uint32_t writer_id) {
|
||||
const std::vector<pending_signal_t*>& Profiler::GetPendingSignals(uint32_t writer_id) {
|
||||
std::lock_guard<std::mutex> lock(sessions_pending_signals_lock_);
|
||||
assert(sessions_pending_signals_.find(writer_id) != sessions_pending_signals_.end() &&
|
||||
assert(sessions_pending_signals_->find(writer_id) != sessions_pending_signals_->end() &&
|
||||
"writer_id is not found in the pending_signals");
|
||||
return sessions_pending_signals_.at(writer_id);
|
||||
return sessions_pending_signals_->at(writer_id);
|
||||
}
|
||||
|
||||
bool Profiler::CheckPendingSignalsIsEmpty() {
|
||||
std::lock_guard<std::mutex> lock(sessions_pending_signals_lock_);
|
||||
return sessions_pending_signals_.empty();
|
||||
return sessions_pending_signals_->empty();
|
||||
}
|
||||
|
||||
} // namespace profiler
|
||||
|
||||
@@ -53,6 +53,7 @@ typedef struct {
|
||||
rocprofiler_kernel_properties_t kernel_properties;
|
||||
uint32_t thread_id;
|
||||
uint64_t queue_index;
|
||||
uint64_t correlation_id;
|
||||
} pending_signal_t;
|
||||
|
||||
namespace profiler {
|
||||
@@ -71,9 +72,9 @@ class Profiler {
|
||||
rocmtools::profiling_context_t* context, uint64_t session_data_count,
|
||||
hsa_ven_amd_aqlprofile_profile_t* profile,
|
||||
rocprofiler_kernel_properties_t kernel_properties, uint32_t thread_id,
|
||||
uint64_t queue_index);
|
||||
uint64_t queue_index, uint64_t correlation_id);
|
||||
|
||||
const std::vector<pending_signal_t>& GetPendingSignals(uint32_t writer_id);
|
||||
const std::vector<pending_signal_t*>& GetPendingSignals(uint32_t writer_id);
|
||||
bool CheckPendingSignalsIsEmpty();
|
||||
|
||||
void AddCounterName(rocprofiler_counter_id_t handler, std::string counter_name);
|
||||
@@ -97,7 +98,7 @@ class Profiler {
|
||||
rocprofiler_session_id_t session_id_;
|
||||
|
||||
std::mutex sessions_pending_signals_lock_;
|
||||
std::map<uint32_t, std::vector<pending_signal_t>> sessions_pending_signals_;
|
||||
std::map<uint32_t, std::vector<pending_signal_t*>>* sessions_pending_signals_;
|
||||
};
|
||||
|
||||
} // namespace profiler
|
||||
|
||||
@@ -41,7 +41,9 @@
|
||||
namespace rocmtools {
|
||||
|
||||
Session::Session(rocprofiler_replay_mode_t replay_mode, rocprofiler_session_id_t session_id)
|
||||
: session_id_(session_id), is_active_(false), replay_mode_(replay_mode) {}
|
||||
: session_id_(session_id), is_active_(false), replay_mode_(replay_mode) {
|
||||
buffers_ = new std::map<uint64_t, Memory::GenericBuffer*>();
|
||||
}
|
||||
|
||||
Session::~Session() {
|
||||
while (GetCurrentActiveInterruptSignalsCount() > 0) {
|
||||
@@ -63,6 +65,7 @@ Session::~Session() {
|
||||
// std::lock_guard<std::mutex> lock(filters_lock_);
|
||||
// buffers_.clear();
|
||||
// }
|
||||
delete buffers_;
|
||||
}
|
||||
|
||||
void Session::DisableTools(rocprofiler_buffer_id_t buffer_id) {
|
||||
@@ -171,6 +174,8 @@ void Session::Start() {
|
||||
|
||||
void Session::Terminate() {
|
||||
if (is_active_) {
|
||||
while (GetCurrentActiveInterruptSignalsCount() > 0) {
|
||||
}
|
||||
rocmtools::queue::ResetSessionID();
|
||||
std::lock_guard<std::mutex> lock(session_lock_);
|
||||
if (FindFilterWithKind(ROCPROFILER_SPM_COLLECTION)) {
|
||||
@@ -204,6 +209,11 @@ void Session::Terminate() {
|
||||
}
|
||||
}
|
||||
|
||||
for (auto& buffer : *buffers_) {
|
||||
buffer.second->Flush();
|
||||
delete buffer.second;
|
||||
}
|
||||
|
||||
is_active_ = false;
|
||||
}
|
||||
}
|
||||
@@ -289,7 +299,7 @@ rocprofiler_filter_id_t Session::GetFilterIdWithKind(rocprofiler_filter_kind_t k
|
||||
return rocprofiler_filter_id_t{0};
|
||||
}
|
||||
|
||||
bool Session::HasBuffer() { return buffers_.size() > 0; }
|
||||
bool Session::HasBuffer() { return buffers_->size() > 0; }
|
||||
|
||||
rocprofiler_buffer_id_t Session::CreateBuffer(rocprofiler_buffer_callback_t buffer_callback,
|
||||
size_t buffer_size) {
|
||||
@@ -297,8 +307,8 @@ rocprofiler_buffer_id_t Session::CreateBuffer(rocprofiler_buffer_callback_t buff
|
||||
rocprofiler_buffer_id_t{buffers_counter_.fetch_add(1, std::memory_order_release)};
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(buffers_lock_);
|
||||
buffers_.emplace(id.value,
|
||||
new Memory::GenericBuffer(session_id_, id, buffer_size, buffer_callback));
|
||||
buffers_->emplace(id.value,
|
||||
new Memory::GenericBuffer(session_id_, id, buffer_size, buffer_callback));
|
||||
}
|
||||
return id;
|
||||
}
|
||||
@@ -306,7 +316,7 @@ rocprofiler_buffer_id_t Session::CreateBuffer(rocprofiler_buffer_callback_t buff
|
||||
bool Session::FindBuffer(rocprofiler_buffer_id_t buffer_id) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(buffers_lock_);
|
||||
return buffers_.find(buffer_id.value) != buffers_.end();
|
||||
return buffers_->find(buffer_id.value) != buffers_->end();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -316,8 +326,8 @@ void Session::DestroyTracer() { /* tracer_.reset(); */
|
||||
void Session::DestroyBuffer(rocprofiler_buffer_id_t buffer_id) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(filters_lock_);
|
||||
delete buffers_.at(buffer_id.value);
|
||||
buffers_.erase(buffer_id.value);
|
||||
delete buffers_->at(buffer_id.value);
|
||||
buffers_->erase(buffer_id.value);
|
||||
// if (buffers_.find(buffer_id.value) != buffers_.end() &&
|
||||
// buffers_.at(buffer_id.value)->IsValid())
|
||||
// buffers_.at(buffer_id.value).reset();
|
||||
@@ -347,7 +357,7 @@ rocprofiler_status_t Session::stopSpm() {
|
||||
Memory::GenericBuffer* Session::GetBuffer(rocprofiler_buffer_id_t buffer_id) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(buffers_lock_);
|
||||
return buffers_.at(buffer_id.value);
|
||||
return buffers_->at(buffer_id.value);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -124,7 +124,7 @@ class Session {
|
||||
|
||||
std::atomic<uint64_t> buffers_counter_{1};
|
||||
std::mutex buffers_lock_;
|
||||
std::map<uint64_t, Memory::GenericBuffer*> buffers_;
|
||||
std::map<uint64_t, Memory::GenericBuffer*>* buffers_;
|
||||
std::atomic<uint64_t> records_counter_{1};
|
||||
|
||||
|
||||
|
||||
@@ -300,13 +300,13 @@ att_parsed_input_t GetATTParams() {
|
||||
} else if (param_name == "PERFCOUNTER") {
|
||||
counters_names.push_back(line.substr(pos + 1));
|
||||
continue;
|
||||
} else { // param_value is a number
|
||||
} else { // param_value is a number
|
||||
try {
|
||||
auto hexa_pos = line.find("0x", pos); // Is it hex?
|
||||
auto hexa_pos = line.find("0x", pos); // Is it hex?
|
||||
if (hexa_pos != std::string::npos)
|
||||
param_value = stoi(line.substr(hexa_pos + 2), 0, 16); // hexadecimal
|
||||
else
|
||||
param_value = stoi(line.substr(pos + 1), 0, 10); // decimal
|
||||
param_value = stoi(line.substr(pos + 1), 0, 10); // decimal
|
||||
} catch (...) {
|
||||
printf("Error: Invalid parameter value %s - (%s)\n",
|
||||
line.substr(pos + 1, line.size()).c_str(), line.c_str());
|
||||
@@ -364,6 +364,9 @@ att_parsed_input_t GetATTParams() {
|
||||
}
|
||||
|
||||
void finish() {
|
||||
for ([[maybe_unused]] rocprofiler_buffer_id_t buffer_id : buffer_ids) {
|
||||
CHECK_ROCPROFILER(rocprofiler_flush_data(session_id, buffer_id));
|
||||
}
|
||||
if (amd_sys_handler.load(std::memory_order_release)) {
|
||||
amd_sys_handler.exchange(false, std::memory_order_release);
|
||||
wait_for_start_shm.join();
|
||||
@@ -371,11 +374,8 @@ void finish() {
|
||||
}
|
||||
if (session_created.load(std::memory_order_relaxed)) {
|
||||
session_created.exchange(false, std::memory_order_release);
|
||||
CHECK_ROCPROFILER(rocprofiler_terminate_session(session_id));
|
||||
rocprofiler::TraceBufferBase::FlushAll();
|
||||
for ([[maybe_unused]] rocprofiler_buffer_id_t buffer_id : buffer_ids) {
|
||||
CHECK_ROCPROFILER(rocprofiler_flush_data(session_id, buffer_id));
|
||||
}
|
||||
CHECK_ROCPROFILER(rocprofiler_terminate_session(session_id));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -646,49 +646,59 @@ ROCPROFILER_EXPORT bool OnLoad(void* table, uint64_t runtime_version, uint64_t f
|
||||
if (apis_requested.size() > 0) filters_requested.emplace_back(ROCPROFILER_API_TRACE);
|
||||
if (parameters.size() > 0) filters_requested.emplace_back(ROCPROFILER_ATT_TRACE_COLLECTION);
|
||||
|
||||
rocprofiler_buffer_id_t buffer_id;
|
||||
CHECK_ROCPROFILER(rocprofiler_create_buffer(
|
||||
session_id,
|
||||
[](const rocprofiler_record_header_t* record, const rocprofiler_record_header_t* end_record,
|
||||
rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id) {
|
||||
if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id);
|
||||
},
|
||||
1 << 20, &buffer_id));
|
||||
buffer_ids.emplace_back(buffer_id);
|
||||
|
||||
rocprofiler_buffer_id_t buffer_id_1;
|
||||
CHECK_ROCPROFILER(rocprofiler_create_buffer(
|
||||
session_id,
|
||||
[](const rocprofiler_record_header_t* record, const rocprofiler_record_header_t* end_record,
|
||||
rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id_1) {
|
||||
if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id_1);
|
||||
},
|
||||
1 << 20, &buffer_id_1));
|
||||
buffer_ids.emplace_back(buffer_id_1);
|
||||
|
||||
for (rocprofiler_filter_kind_t filter_kind : filters_requested) {
|
||||
switch (filter_kind) {
|
||||
case ROCPROFILER_COUNTERS_COLLECTION: {
|
||||
rocprofiler_buffer_id_t buffer_id;
|
||||
CHECK_ROCPROFILER(rocprofiler_create_buffer(
|
||||
session_id,
|
||||
[](const rocprofiler_record_header_t* record,
|
||||
const rocprofiler_record_header_t* end_record, rocprofiler_session_id_t session_id,
|
||||
rocprofiler_buffer_id_t buffer_id) {
|
||||
if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id);
|
||||
},
|
||||
1 << 20, &buffer_id));
|
||||
buffer_ids.emplace_back(buffer_id);
|
||||
printf("Enabling Counter Collection\n");
|
||||
rocprofiler_filter_id_t filter_id;
|
||||
[[maybe_unused]] rocprofiler_filter_property_t property = {};
|
||||
CHECK_ROCPROFILER(rocprofiler_create_filter(
|
||||
session_id, filter_kind, rocprofiler_filter_data_t{.counters_names = &counters_[0]},
|
||||
counters_.size(), &filter_id, property));
|
||||
CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id_1));
|
||||
CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id));
|
||||
filter_ids.emplace_back(filter_id);
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_DISPATCH_TIMESTAMPS_COLLECTION: {
|
||||
rocprofiler_buffer_id_t buffer_id;
|
||||
CHECK_ROCPROFILER(rocprofiler_create_buffer(
|
||||
session_id,
|
||||
[](const rocprofiler_record_header_t* record,
|
||||
const rocprofiler_record_header_t* end_record, rocprofiler_session_id_t session_id,
|
||||
rocprofiler_buffer_id_t buffer_id) {
|
||||
if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id);
|
||||
},
|
||||
1 << 20, &buffer_id));
|
||||
buffer_ids.emplace_back(buffer_id);
|
||||
rocprofiler_filter_id_t filter_id;
|
||||
[[maybe_unused]] rocprofiler_filter_property_t property = {};
|
||||
CHECK_ROCPROFILER(rocprofiler_create_filter(
|
||||
session_id, filter_kind, rocprofiler_filter_data_t{}, 0, &filter_id, property));
|
||||
CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id_1));
|
||||
CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id));
|
||||
filter_ids.emplace_back(filter_id);
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_API_TRACE: {
|
||||
rocprofiler_buffer_id_t buffer_id;
|
||||
CHECK_ROCPROFILER(rocprofiler_create_buffer(
|
||||
session_id,
|
||||
[](const rocprofiler_record_header_t* record,
|
||||
const rocprofiler_record_header_t* end_record, rocprofiler_session_id_t session_id,
|
||||
rocprofiler_buffer_id_t buffer_id) {
|
||||
if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id);
|
||||
},
|
||||
1 << 20, &buffer_id));
|
||||
buffer_ids.emplace_back(buffer_id);
|
||||
printf("Enabling API Tracing\n");
|
||||
rocprofiler_filter_id_t filter_id;
|
||||
[[maybe_unused]] rocprofiler_filter_property_t property = {};
|
||||
@@ -702,6 +712,16 @@ ROCPROFILER_EXPORT bool OnLoad(void* table, uint64_t runtime_version, uint64_t f
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_ATT_TRACE_COLLECTION: {
|
||||
rocprofiler_buffer_id_t buffer_id;
|
||||
CHECK_ROCPROFILER(rocprofiler_create_buffer(
|
||||
session_id,
|
||||
[](const rocprofiler_record_header_t* record,
|
||||
const rocprofiler_record_header_t* end_record, rocprofiler_session_id_t session_id,
|
||||
rocprofiler_buffer_id_t buffer_id) {
|
||||
if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id);
|
||||
},
|
||||
1 << 20, &buffer_id));
|
||||
buffer_ids.emplace_back(buffer_id);
|
||||
printf("Enabling ATT Tracing\n");
|
||||
rocprofiler_filter_id_t filter_id;
|
||||
|
||||
@@ -717,11 +737,21 @@ ROCPROFILER_EXPORT bool OnLoad(void* table, uint64_t runtime_version, uint64_t f
|
||||
rocprofiler_create_filter(session_id, ROCPROFILER_ATT_TRACE_COLLECTION,
|
||||
rocprofiler_filter_data_t{.att_parameters = &parameters[0]},
|
||||
parameters.size(), &filter_id, property));
|
||||
CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id_1));
|
||||
CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id));
|
||||
filter_ids.emplace_back(filter_id);
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_PC_SAMPLING_COLLECTION: {
|
||||
rocprofiler_buffer_id_t buffer_id;
|
||||
CHECK_ROCPROFILER(rocprofiler_create_buffer(
|
||||
session_id,
|
||||
[](const rocprofiler_record_header_t* record,
|
||||
const rocprofiler_record_header_t* end_record, rocprofiler_session_id_t session_id,
|
||||
rocprofiler_buffer_id_t buffer_id) {
|
||||
if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id);
|
||||
},
|
||||
1 << 20, &buffer_id));
|
||||
buffer_ids.emplace_back(buffer_id);
|
||||
puts("Enabling PC sampling");
|
||||
rocprofiler_filter_id_t filter_id;
|
||||
[[maybe_unused]] rocprofiler_filter_property_t property = {};
|
||||
|
||||
@@ -145,7 +145,8 @@ std::string string_printf(const char* format, ...) {
|
||||
std::string errmsg("ROCMTools: fatal error: " + message);
|
||||
fputs(errmsg.c_str(), stderr);
|
||||
|
||||
throw(errmsg);
|
||||
std::cerr << errmsg << std::endl;
|
||||
abort();
|
||||
}
|
||||
|
||||
/* The function extracts the kernel name from
|
||||
|
||||
Ссылка в новой задаче
Block a user