Added option to control how codeobj is dumped from ATT

Change-Id: Ie76aeea1193c7ba8fe7f51be159516f8a9eab55f


[ROCm/rocprofiler commit: 99b14fc9f8]
Этот коммит содержится в:
Giovanni LB
2023-11-08 17:35:43 -03:00
родитель 129c7e9d4e
Коммит ec895dfb2a
11 изменённых файлов: 84 добавлений и 37 удалений
+6 -1
Просмотреть файл
@@ -392,10 +392,15 @@ Tool used to collect fine-grained hardware metrics. Provides ISA-level instructi
- PERFCOUNTER_MASK=0xFFF // Bitmask for perfcounter collection. GFX9 only.
- PERFCOUNTER=counter_name // Add a SQ counter to be collected with ATT; period defined by PERFCOUNTERS_CTRL. GFX9 only.
- BUFFER_SIZE=[size] // Sets size of the ATT buffer collection, per dispatch, in megabytes (shared among all shader engines).
- ISA_CAPTURE_MODE=[0,1,2] // Set capture mode during kernel dispatch.
- ISA_CAPTURE_MODE=[0,1,2] // Set codeobj capture mode during kernel dispatch.
- 0 = capture symbols only.
- 1 = capture symbols for file:// and make a copy of memory://
- 2 = Copy file:// and memory://
- ISA_DUMP_MODE=[0,1,2,3] // Set how captured codeobj information is dumped when a trace record arrives.
- 0 = Default. Dump everything.
- 1 = Dump only the code object containing the kernel address in the kernel dispatch packet.
- 2 = Dump a single kernel symbol matching the kernel dispatch packet.
- 3 = Disables ISA Dumping.
- By default, kernel names are truncated for ATT. To disable truncation, please see the kernel name truncation section below.
- Example for vectoradd.
+5 -1
Просмотреть файл
@@ -1185,6 +1185,10 @@ typedef struct {
* Addr where codeobj is loaded
*/
uint64_t base_address;
/**
* Maximum offset from base address
*/
uint64_t mem_size;
/**
* If a copy of the codeobj is made, contains the data. Nullptr otherwise.
*/
@@ -1192,7 +1196,7 @@ typedef struct {
/**
* If a copy of the codeobj is made, contains the size of the data. 0 otherwise.
*/
uint64_t size;
uint64_t data_size;
/**
* Timestamp for the time point this codeobj was loaded.
*/
+26 -3
Просмотреть файл
@@ -65,6 +65,9 @@ class att_plugin_t {
header.raw = reinterpret_cast<uint64_t>(data);
header.reserved = 0x11;
isa_mode = static_cast<decltype(isa_mode)>(header.isadumpmode);
header.isadumpmode = 0;
}
bool MPI_ENABLE = false;
@@ -72,6 +75,14 @@ class att_plugin_t {
std::mutex writing_lock;
bool is_valid_{true};
rocprofiler::att_header_packet_t header{.raw = 0};
rocprofiler::rocprofiler_att_isa_dump_mode isa_mode = rocprofiler::ISA_MODE_DUMP_ALL;
/**
 * Decides whether a code object should be dumped for the current dispatch.
 *
 * @param kernel_addr  Address of the dispatched kernel.
 * @param base_address Address where the code object is loaded.
 * @param size         Extent of the code object measured from base_address.
 * @return true when the dump mode is ISA_MODE_DUMP_ALL, or when kernel_addr
 *         lies in the half-open range [base_address, base_address + size).
 */
bool CheckAddrMatches(uint64_t kernel_addr, uint64_t base_address, uint64_t size)
{
  if (isa_mode == rocprofiler::ISA_MODE_DUMP_ALL)
    return true;
  // Reject addresses below the base first so the subtraction below cannot wrap.
  if (kernel_addr < base_address)
    return false;
  // Compare offsets instead of computing base_address + size: that sum wraps
  // around in 64-bit arithmetic when the object sits near the top of the
  // address space, which would wrongly reject every address.
  return (kernel_addr - base_address) < size;
}
inline bool att_file_exists(const std::string& name) {
struct stat buffer;
@@ -133,7 +144,7 @@ class att_plugin_t {
<< '\n';
// iterate over each shader engine att trace
header.navi = !att_tracer_record->intercept_list.userdata;
// Bit 0 of userdata is the gfx9 flag; the remaining bits hold the kernel address.
// Mask before negating: unary ! binds tighter than &, so the unparenthesised
// form negated the whole word and produced 0 whenever any bit was set.
header.navi = !(att_tracer_record->intercept_list.userdata & 0x1);
int se_num = att_tracer_record->shader_engine_data_count;
for (int i = 0; i < se_num; i++) {
if (!att_tracer_record->shader_engine_data ||
@@ -155,6 +166,11 @@ class att_plugin_t {
out.write(data_buffer_ptr, se_att_trace->buffer_size);
}
if (isa_mode == rocprofiler::ISA_MODE_DUMP_NONE)
return 0;
uint64_t kernel_addr = att_tracer_record->intercept_list.userdata >> 1;
std::ofstream isafile(outfilepath + "_isa.s");
if (!isafile.is_open()) {
std::cerr << "Could not open ISA file: " << outfilepath << "_isa.s" << std::endl;
@@ -166,11 +182,13 @@ class att_plugin_t {
const rocprofiler_intercepted_codeobj_t& symbol =
att_tracer_record->intercept_list.symbols[i];
if (!CheckAddrMatches(kernel_addr, symbol.base_address, symbol.mem_size)) continue;
std::unique_ptr<CodeObjectBinary> binary;
std::unique_ptr<code_object_decoder_t> decoder;
if (symbol.data && symbol.size) {
decoder = std::make_unique<code_object_decoder_t>(symbol.data, symbol.size);
if (symbol.data && symbol.data_size) {
decoder = std::make_unique<code_object_decoder_t>(symbol.data, symbol.data_size);
} else if (std::string(symbol.filepath).find("file://") != std::string::npos) {
binary = std::make_unique<CodeObjectBinary>(symbol.filepath);
decoder =
@@ -179,6 +197,11 @@ class att_plugin_t {
continue;
}
if (isa_mode == rocprofiler::ISA_MODE_DUMP_KERNEL)
decoder->disassemble_single_kernel(kernel_addr-symbol.base_address);
else
decoder->disassemble_kernels();
for (auto& instance : decoder->instructions) {
uint64_t addr = instance.address + symbol.base_address;
+11 -4
Просмотреть файл
@@ -109,7 +109,11 @@ code_object_decoder_t::code_object_decoder_t(const char* codeobj_data, uint64_t
}
// load_symbol_map();
}
disassemble_kernels();
disassembly = std::make_unique<DisassemblyInstance>(*this);
m_symbol_map = disassembly->GetKernelMap();
//disassemble_kernels();
}
@@ -170,8 +174,11 @@ void code_object_decoder_t::disassemble_kernel(uint64_t faddr, uint64_t vaddr) {
}
// Disassembles every kernel listed in m_symbol_map.
void code_object_decoder_t::disassemble_kernels() {
// Create a DisassemblyInstance for this decoder and take its kernel map.
disassembly = std::make_unique<DisassemblyInstance>(*this);
m_symbol_map = disassembly->GetKernelMap();
// Each map entry is keyed by vaddr and carries the matching faddr.
for (auto& [vaddr, v] : m_symbol_map) disassemble_kernel(v.faddr, vaddr);
}
// Disassembles only the symbol-map entries whose range
// [vaddr, vaddr + mem_size) contains kaddr.
// kaddr is compared directly against the map keys, so it must be expressed
// in the same address space as those keys.
void code_object_decoder_t::disassemble_single_kernel(uint64_t kaddr) {
  for (auto& [start, info] : m_symbol_map) {
    if (kaddr < start) continue;                  // before this kernel
    if (kaddr >= start + info.mem_size) continue; // past its end
    disassemble_kernel(info.faddr, start);
  }
}
+2
Просмотреть файл
@@ -39,7 +39,9 @@ class code_object_decoder_t {
~code_object_decoder_t();
void disassemble_kernel(uint64_t faddr, uint64_t vaddr);
void disassemble_single_kernel(uint64_t kaddr);
void disassemble_kernels();
int m_fd;
std::map<uint64_t, std::pair<std::string, size_t>> m_line_number_map;
+13 -24
Просмотреть файл
@@ -238,9 +238,7 @@ class perfetto_plugin_t {
uint64_t device_id = profiler_record.gpu_id.handle;
std::unordered_map<uint64_t, perfetto::Track>::iterator device_track_it;
{
std::lock_guard<std::mutex> lock(device_tracks_lock_);
uint64_t device_track_id = getTrackID(machine_id_, TrackType::DEVICE, device_id);
device_track_it = device_tracks.find(device_track_id);
if (device_track_it == device_tracks.end()) {
/* Create a new perfetto::Track (Sub-Track) */
@@ -250,30 +248,25 @@ class perfetto_plugin_t {
gpu_desc.mutable_process()->set_pid(device_id);
gpu_desc.mutable_process()->set_chrome_process_type(
perfetto::protos::gen::ProcessDescriptor::PROCESS_GPU);
gpu_desc.mutable_process()->set_process_name("Node: " + std::string(hostname_) +
" Device: ");
gpu_desc.mutable_process()->set_process_name("Node: " + std::string(hostname_)
+ std::to_string(GetPid()) + " Device: ");
perfetto::TrackEvent::SetTrackDescriptor(device_track_it->second, gpu_desc);
track_ids_used_.emplace_back(device_track_id);
}
}
auto& gpu_track = device_track_it->second;
std::pair<int, uint64_t> gpu_queue_id =
std::make_pair(device_id, profiler_record.queue_id.handle);
auto queue_track_it = queue_tracks_.find(gpu_queue_id.first);
{
std::lock_guard<std::mutex> lock(stream_tracks_lock_);
uint64_t queue_track_id = getTrackID(machine_id_, device_id+TrackType::DEVICE_ID, gpu_queue_id.first);
queue_track_it = queue_tracks_.find(queue_track_id);
if (queue_track_it == queue_tracks_.end()) {
/* Create a new perfetto::Track */
queue_track_it =
queue_tracks_.emplace(queue_track_id, perfetto::Track(queue_track_id, gpu_track)).first;
uint64_t queue_track_id
= getTrackID(machine_id_, device_id+TrackType::DEVICE_ID, profiler_record.queue_id.handle);
auto queue_track_it = queue_tracks_.find(queue_track_id);
if (queue_track_it == queue_tracks_.end()) {
/* Create a new perfetto::Track */
queue_track_it =
queue_tracks_.emplace(queue_track_id, perfetto::Track(queue_track_id, gpu_track)).first;
auto queue_desc = queue_track_it->second.Serialize();
std::string queue_str = rocprofiler::string_printf("Queue %ld", gpu_queue_id.second);
queue_desc.set_name(queue_str);
perfetto::TrackEvent::SetTrackDescriptor(queue_track_it->second, queue_desc);
}
auto queue_desc = queue_track_it->second.Serialize();
std::string queue_str = rocprofiler::string_printf("Queue %ld", profiler_record.queue_id.handle);
queue_desc.set_name(queue_str);
perfetto::TrackEvent::SetTrackDescriptor(queue_track_it->second, queue_desc);
track_ids_used_.emplace_back(queue_track_id);
}
auto& queue_track = queue_track_it->second;
@@ -306,7 +299,6 @@ class perfetto_plugin_t {
std::pair<int, std::string> gpu_counter_track_id = std::make_pair(device_id, counter_name);
std::unordered_map<std::string, perfetto::CounterTrack>::iterator counters_track_it;
{
std::lock_guard<std::mutex> lock(counter_tracks_lock_);
counters_track_it = counter_tracks_.find(gpu_counter_track_id.second);
if (counters_track_it == counter_tracks_.end()) {
/* Create a new perfetto::Track */
@@ -643,9 +635,6 @@ class perfetto_plugin_t {
std::atomic<uint64_t> track_counter_{GetPid()};
std::vector<uint64_t> track_ids_used_;
std::mutex roctx_tracks_lock_, hsa_tracks_lock_, hip_tracks_lock_, device_tracks_lock_;
std::mutex stream_tracks_lock_, counter_tracks_lock_, mem_copies_tracks_lock_;
char hostname_[1024];
uint64_t machine_id_;
+6 -2
Просмотреть файл
@@ -1152,12 +1152,16 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u
session_id_snapshot, buffer_id, profile, kernel_properties,
(uint32_t)syscall(__NR_gettid), user_pkt_index);
uint64_t userdata = HSASupport_Singleton::GetInstance()
uint64_t off = dispatch_packet.kernel_object +
GetKernelCode(dispatch_packet.kernel_object)->kernel_code_entry_byte_offset;
codeobj_record::make_capture(rocprofiler_record_id_t{record_id}, capture_mode, off);
uint64_t IsGFX9 = HSASupport_Singleton::GetInstance()
.GetHSAAgentInfo(queue_info.GetGPUAgent().handle)
.GetDeviceInfo()
.getName()
.find("gfx9") != std::string::npos;
codeobj_record::make_capture(rocprofiler_record_id_t{record_id}, capture_mode, userdata);
codeobj_record::make_capture(rocprofiler_record_id_t{record_id}, capture_mode, IsGFX9 | (off<<1));
codeobj_record::start_capture(rocprofiler_record_id_t{record_id});
codeobj_record::stop_capture(rocprofiler_record_id_t{record_id});
+1 -1
Просмотреть файл
@@ -41,7 +41,7 @@ class codeobj_capture_instance {
rocprofiler_intercepted_codeobj_t get() const {
const char* buf_ptr = buffer.size() ? buffer.data() : nullptr;
return {URI.c_str(), addr, buf_ptr, buffer.size(), start_time, end_time};
return {URI.c_str(), addr, mem_size, buf_ptr, buffer.size(), start_time, end_time};
};
const uint64_t addr;
+8
Просмотреть файл
@@ -44,6 +44,13 @@ typedef struct {
uint64_t queue_index;
} att_pending_signal_t;
/**
 * Controls how captured code-object information is dumped when a trace
 * record arrives. The numeric values match the ISA_DUMP_MODE input option.
 */
enum rocprofiler_att_isa_dump_mode {
  ISA_MODE_DUMP_ALL = 0,     // Default: dump everything.
  ISA_MODE_DUMP_OBJ = 1,     // Dump only the code object containing the dispatched kernel address.
  ISA_MODE_DUMP_KERNEL = 2,  // Dump the single kernel symbol matching the dispatch packet.
  ISA_MODE_DUMP_NONE = 3     // Disable ISA dumping.
};
union att_header_packet_t {
struct {
uint64_t reserved : 14;
@@ -53,6 +60,7 @@ union att_header_packet_t {
uint64_t DCU : 5;
uint64_t DSA : 1;
uint64_t SEID : 6;
uint64_t isadumpmode : 3;
};
uint64_t raw;
};
+5
Просмотреть файл
@@ -380,6 +380,11 @@ att_parsed_input_t GetATTParams() {
continue;
}
if (param_name.find("ISA_DUMP_MODE") != std::string::npos) {
header.isadumpmode = param_value;
continue;
}
if (ATT_PARAM_NAMES.find(param_name) != ATT_PARAM_NAMES.end()) {
parameters.push_back(std::make_pair(ATT_PARAM_NAMES[param_name], param_value));
try {
+1 -1
Просмотреть файл
@@ -661,7 +661,7 @@ TEST_F(CodeobjTest, WhenRunningProfilerWithMultipleCaptureAndCopy) {
EXPECT_NE(capture.symbols[i].base_address, 0);
EXPECT_NE(capture.symbols[i].clock_start.value, 0);
EXPECT_NE(capture.symbols[i].data, nullptr);
EXPECT_NE(capture.symbols[i].size, 0);
EXPECT_NE(capture.symbols[i].data_size, 0);
}
result = rocprofiler_codeobj_capture_stop(id);