From ec895dfb2a5da9ec78040e159e0c168dfa283f71 Mon Sep 17 00:00:00 2001 From: Giovanni LB Date: Wed, 8 Nov 2023 17:35:43 -0300 Subject: [PATCH] Added option to control how codeobj is dumped from ATT Change-Id: Ie76aeea1193c7ba8fe7f51be159516f8a9eab55f [ROCm/rocprofiler commit: 99b14fc9f8176fc058577acc8bc2444a5801aef0] --- projects/rocprofiler/README.md | 7 +++- .../include/rocprofiler/v2/rocprofiler.h | 6 ++- projects/rocprofiler/plugin/att/att.cpp | 29 +++++++++++++-- .../rocprofiler/plugin/att/code_printing.cpp | 15 ++++++-- .../rocprofiler/plugin/att/code_printing.hpp | 2 + .../rocprofiler/plugin/perfetto/perfetto.cpp | 37 +++++++------------ .../rocprofiler/src/core/hsa/queues/queue.cpp | 8 +++- .../core/isa_capture/code_object_track.hpp | 2 +- .../rocprofiler/src/core/session/att/att.h | 8 ++++ projects/rocprofiler/src/tools/tool.cpp | 5 +++ .../featuretests/profiler/profiler_gtest.cpp | 2 +- 11 files changed, 84 insertions(+), 37 deletions(-) diff --git a/projects/rocprofiler/README.md b/projects/rocprofiler/README.md index 32df0c36a6..9e7900a220 100644 --- a/projects/rocprofiler/README.md +++ b/projects/rocprofiler/README.md @@ -392,10 +392,15 @@ Tool used to collect fine-grained hardware metrics. Provides ISA-level instructi - PERFCOUNTER_MASK=0xFFF // Bitmask for perfcounter collection. GFX9 only. - PERFCOUNTER=counter_name // Add a SQ counter to be collected with ATT; period defined by PERFCOUNTERS_CTRL. GFX9 only. - BUFFER_SIZE=[size] // Sets size of the ATT buffer collection, per dispatch, in megabytes (shared among all shader engines). - - ISA_CAPTURE_MODE=[0,1,2] // Set capture mode during kernel dispatch. + - ISA_CAPTURE_MODE=[0,1,2] // Set codeobj capture mode during kernel dispatch. - 0 = capture symbols only. - 1 = capture symbols for file:// and make a copy of memory:// - 2 = Copy file:// and memory:// + - ISA_DUMP_MODE=[0,1,2,3] // Set how captured codeobj information is dumped when a trace record arrives. + - 0 = Default. Dump everything. 
+ - 1 = Dump only the code object containing the kernel address in the kernel dispatch packet. + - 2 = Dump a single kernel symbol matching the kernel dispatch packet. + - 3 = Disable ISA dumping. - By default, kernel names are truncated for ATT.To disable, please see the kernel name truncation section below. - Example for vectoradd. diff --git a/projects/rocprofiler/include/rocprofiler/v2/rocprofiler.h b/projects/rocprofiler/include/rocprofiler/v2/rocprofiler.h index 0e373969ed..95b8f2b277 100644 --- a/projects/rocprofiler/include/rocprofiler/v2/rocprofiler.h +++ b/projects/rocprofiler/include/rocprofiler/v2/rocprofiler.h @@ -1185,6 +1185,10 @@ typedef struct { * Addr where codeobj is loaded */ uint64_t base_address; + /** + * Maximum offset from base address + */ + uint64_t mem_size; /** * If a copy of the codeobj is made, contains the data. Nullptr otherwise. */ @@ -1192,7 +1196,7 @@ typedef struct { /** * If a copy of the codeobj is made, contains the size of the data. 0 otherwise. */ - uint64_t size; + uint64_t data_size; /** * Timestamp for the time point this codeobj was loaded. 
*/ diff --git a/projects/rocprofiler/plugin/att/att.cpp b/projects/rocprofiler/plugin/att/att.cpp index 4275f8a81a..b686e1f79d 100644 --- a/projects/rocprofiler/plugin/att/att.cpp +++ b/projects/rocprofiler/plugin/att/att.cpp @@ -65,6 +65,9 @@ class att_plugin_t { header.raw = reinterpret_cast(data); header.reserved = 0x11; + + isa_mode = static_cast(header.isadumpmode); + header.isadumpmode = 0; } bool MPI_ENABLE = false; @@ -72,6 +75,14 @@ class att_plugin_t { std::mutex writing_lock; bool is_valid_{true}; rocprofiler::att_header_packet_t header{.raw = 0}; + rocprofiler::rocprofiler_att_isa_dump_mode isa_mode = rocprofiler::ISA_MODE_DUMP_ALL; + + bool CheckAddrMatches(uint64_t kernel_addr, uint64_t base_address, uint64_t size) + { + if (isa_mode == rocprofiler::ISA_MODE_DUMP_ALL) + return true; + return (kernel_addr >= base_address) && (kernel_addr < base_address + size); + } inline bool att_file_exists(const std::string& name) { struct stat buffer; @@ -133,7 +144,7 @@ class att_plugin_t { << '\n'; // iterate over each shader engine att trace - header.navi = !att_tracer_record->intercept_list.userdata; + header.navi = !att_tracer_record->intercept_list.userdata & 0x1; int se_num = att_tracer_record->shader_engine_data_count; for (int i = 0; i < se_num; i++) { if (!att_tracer_record->shader_engine_data || @@ -155,6 +166,11 @@ class att_plugin_t { out.write(data_buffer_ptr, se_att_trace->buffer_size); } + if (isa_mode == rocprofiler::ISA_MODE_DUMP_NONE) + return 0; + + uint64_t kernel_addr = att_tracer_record->intercept_list.userdata >> 1; + std::ofstream isafile(outfilepath + "_isa.s"); if (!isafile.is_open()) { std::cerr << "Could not open ISA file: " << outfilepath << "_isa.s" << std::endl; @@ -166,11 +182,13 @@ class att_plugin_t { const rocprofiler_intercepted_codeobj_t& symbol = att_tracer_record->intercept_list.symbols[i]; + if (!CheckAddrMatches(kernel_addr, symbol.base_address, symbol.mem_size)) continue; + std::unique_ptr binary; std::unique_ptr decoder; 
- if (symbol.data && symbol.size) { - decoder = std::make_unique(symbol.data, symbol.size); + if (symbol.data && symbol.data_size) { + decoder = std::make_unique(symbol.data, symbol.data_size); } else if (std::string(symbol.filepath).find("file://") != std::string::npos) { binary = std::make_unique(symbol.filepath); decoder = @@ -179,6 +197,11 @@ class att_plugin_t { continue; } + if (isa_mode == rocprofiler::ISA_MODE_DUMP_KERNEL) + decoder->disassemble_single_kernel(kernel_addr-symbol.base_address); + else + decoder->disassemble_kernels(); + for (auto& instance : decoder->instructions) { uint64_t addr = instance.address + symbol.base_address; diff --git a/projects/rocprofiler/plugin/att/code_printing.cpp b/projects/rocprofiler/plugin/att/code_printing.cpp index 87727fd314..4bb509f890 100644 --- a/projects/rocprofiler/plugin/att/code_printing.cpp +++ b/projects/rocprofiler/plugin/att/code_printing.cpp @@ -109,7 +109,11 @@ code_object_decoder_t::code_object_decoder_t(const char* codeobj_data, uint64_t } // load_symbol_map(); } - disassemble_kernels(); + + disassembly = std::make_unique(*this); + m_symbol_map = disassembly->GetKernelMap(); + + //disassemble_kernels(); } @@ -170,8 +174,11 @@ void code_object_decoder_t::disassemble_kernel(uint64_t faddr, uint64_t vaddr) { } void code_object_decoder_t::disassemble_kernels() { - disassembly = std::make_unique(*this); - m_symbol_map = disassembly->GetKernelMap(); - for (auto& [vaddr, v] : m_symbol_map) disassemble_kernel(v.faddr, vaddr); } + +void code_object_decoder_t::disassemble_single_kernel(uint64_t kaddr) { + for (auto& [vaddr, v] : m_symbol_map) + if (kaddr >= vaddr && kaddr < vaddr + v.mem_size) + disassemble_kernel(v.faddr, vaddr); +} diff --git a/projects/rocprofiler/plugin/att/code_printing.hpp b/projects/rocprofiler/plugin/att/code_printing.hpp index a4193694e9..02df1deee9 100644 --- a/projects/rocprofiler/plugin/att/code_printing.hpp +++ b/projects/rocprofiler/plugin/att/code_printing.hpp @@ -39,7 +39,9 @@ 
class code_object_decoder_t { ~code_object_decoder_t(); void disassemble_kernel(uint64_t faddr, uint64_t vaddr); + void disassemble_single_kernel(uint64_t kaddr); void disassemble_kernels(); + int m_fd; std::map> m_line_number_map; diff --git a/projects/rocprofiler/plugin/perfetto/perfetto.cpp b/projects/rocprofiler/plugin/perfetto/perfetto.cpp index 318e800ca2..439e9d2533 100644 --- a/projects/rocprofiler/plugin/perfetto/perfetto.cpp +++ b/projects/rocprofiler/plugin/perfetto/perfetto.cpp @@ -238,9 +238,7 @@ class perfetto_plugin_t { uint64_t device_id = profiler_record.gpu_id.handle; std::unordered_map::iterator device_track_it; { - std::lock_guard lock(device_tracks_lock_); uint64_t device_track_id = getTrackID(machine_id_, TrackType::DEVICE, device_id); - device_track_it = device_tracks.find(device_track_id); if (device_track_it == device_tracks.end()) { /* Create a new perfetto::Track (Sub-Track) */ @@ -250,30 +248,25 @@ class perfetto_plugin_t { gpu_desc.mutable_process()->set_pid(device_id); gpu_desc.mutable_process()->set_chrome_process_type( perfetto::protos::gen::ProcessDescriptor::PROCESS_GPU); - gpu_desc.mutable_process()->set_process_name("Node: " + std::string(hostname_) + - " Device: "); + gpu_desc.mutable_process()->set_process_name("Node: " + std::string(hostname_) + + std::to_string(GetPid()) + " Device: "); perfetto::TrackEvent::SetTrackDescriptor(device_track_it->second, gpu_desc); track_ids_used_.emplace_back(device_track_id); } } auto& gpu_track = device_track_it->second; - std::pair gpu_queue_id = - std::make_pair(device_id, profiler_record.queue_id.handle); - auto queue_track_it = queue_tracks_.find(gpu_queue_id.first); - { - std::lock_guard lock(stream_tracks_lock_); - uint64_t queue_track_id = getTrackID(machine_id_, device_id+TrackType::DEVICE_ID, gpu_queue_id.first); - queue_track_it = queue_tracks_.find(queue_track_id); - if (queue_track_it == queue_tracks_.end()) { - /* Create a new perfetto::Track */ - queue_track_it = - 
queue_tracks_.emplace(queue_track_id, perfetto::Track(queue_track_id, gpu_track)).first; + uint64_t queue_track_id + = getTrackID(machine_id_, device_id+TrackType::DEVICE_ID, profiler_record.queue_id.handle); + auto queue_track_it = queue_tracks_.find(queue_track_id); + if (queue_track_it == queue_tracks_.end()) { + /* Create a new perfetto::Track */ + queue_track_it = + queue_tracks_.emplace(queue_track_id, perfetto::Track(queue_track_id, gpu_track)).first; - auto queue_desc = queue_track_it->second.Serialize(); - std::string queue_str = rocprofiler::string_printf("Queue %ld", gpu_queue_id.second); - queue_desc.set_name(queue_str); - perfetto::TrackEvent::SetTrackDescriptor(queue_track_it->second, queue_desc); - } + auto queue_desc = queue_track_it->second.Serialize(); + std::string queue_str = rocprofiler::string_printf("Queue %ld", profiler_record.queue_id.handle); + queue_desc.set_name(queue_str); + perfetto::TrackEvent::SetTrackDescriptor(queue_track_it->second, queue_desc); track_ids_used_.emplace_back(queue_track_id); } auto& queue_track = queue_track_it->second; @@ -306,7 +299,6 @@ class perfetto_plugin_t { std::pair gpu_counter_track_id = std::make_pair(device_id, counter_name); std::unordered_map::iterator counters_track_it; { - std::lock_guard lock(counter_tracks_lock_); counters_track_it = counter_tracks_.find(gpu_counter_track_id.second); if (counters_track_it == counter_tracks_.end()) { /* Create a new perfetto::Track */ @@ -643,9 +635,6 @@ class perfetto_plugin_t { std::atomic track_counter_{GetPid()}; std::vector track_ids_used_; - std::mutex roctx_tracks_lock_, hsa_tracks_lock_, hip_tracks_lock_, device_tracks_lock_; - std::mutex stream_tracks_lock_, counter_tracks_lock_, mem_copies_tracks_lock_; - char hostname_[1024]; uint64_t machine_id_; diff --git a/projects/rocprofiler/src/core/hsa/queues/queue.cpp b/projects/rocprofiler/src/core/hsa/queues/queue.cpp index 1c5b031515..7897d959e1 100644 --- a/projects/rocprofiler/src/core/hsa/queues/queue.cpp 
+++ b/projects/rocprofiler/src/core/hsa/queues/queue.cpp @@ -1152,12 +1152,16 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u session_id_snapshot, buffer_id, profile, kernel_properties, (uint32_t)syscall(__NR_gettid), user_pkt_index); - uint64_t userdata = HSASupport_Singleton::GetInstance() + uint64_t off = dispatch_packet.kernel_object + + GetKernelCode(dispatch_packet.kernel_object)->kernel_code_entry_byte_offset; + codeobj_record::make_capture(rocprofiler_record_id_t{record_id}, capture_mode, off); + + uint64_t IsGFX9 = HSASupport_Singleton::GetInstance() .GetHSAAgentInfo(queue_info.GetGPUAgent().handle) .GetDeviceInfo() .getName() .find("gfx9") != std::string::npos; - codeobj_record::make_capture(rocprofiler_record_id_t{record_id}, capture_mode, userdata); + codeobj_record::make_capture(rocprofiler_record_id_t{record_id}, capture_mode, IsGFX9 | (off<<1)); codeobj_record::start_capture(rocprofiler_record_id_t{record_id}); codeobj_record::stop_capture(rocprofiler_record_id_t{record_id}); diff --git a/projects/rocprofiler/src/core/isa_capture/code_object_track.hpp b/projects/rocprofiler/src/core/isa_capture/code_object_track.hpp index 9adb703ce8..96add163b8 100644 --- a/projects/rocprofiler/src/core/isa_capture/code_object_track.hpp +++ b/projects/rocprofiler/src/core/isa_capture/code_object_track.hpp @@ -41,7 +41,7 @@ class codeobj_capture_instance { rocprofiler_intercepted_codeobj_t get() const { const char* buf_ptr = buffer.size() ? 
buffer.data() : nullptr; - return {URI.c_str(), addr, buf_ptr, buffer.size(), start_time, end_time}; + return {URI.c_str(), addr, mem_size, buf_ptr, buffer.size(), start_time, end_time}; }; const uint64_t addr; diff --git a/projects/rocprofiler/src/core/session/att/att.h b/projects/rocprofiler/src/core/session/att/att.h index f0a44ca7ea..4da130dd92 100644 --- a/projects/rocprofiler/src/core/session/att/att.h +++ b/projects/rocprofiler/src/core/session/att/att.h @@ -44,6 +44,13 @@ typedef struct { uint64_t queue_index; } att_pending_signal_t; +enum rocprofiler_att_isa_dump_mode { + ISA_MODE_DUMP_ALL=0, + ISA_MODE_DUMP_OBJ, + ISA_MODE_DUMP_KERNEL, + ISA_MODE_DUMP_NONE +}; + union att_header_packet_t { struct { uint64_t reserved : 14; @@ -53,6 +60,7 @@ union att_header_packet_t { uint64_t DCU : 5; uint64_t DSA : 1; uint64_t SEID : 6; + uint64_t isadumpmode : 3; }; uint64_t raw; }; diff --git a/projects/rocprofiler/src/tools/tool.cpp b/projects/rocprofiler/src/tools/tool.cpp index 7289b305a3..dd1be29bde 100644 --- a/projects/rocprofiler/src/tools/tool.cpp +++ b/projects/rocprofiler/src/tools/tool.cpp @@ -380,6 +380,11 @@ att_parsed_input_t GetATTParams() { continue; } + if (param_name.find("ISA_DUMP_MODE") != std::string::npos) { + header.isadumpmode = param_value; + continue; + } + if (ATT_PARAM_NAMES.find(param_name) != ATT_PARAM_NAMES.end()) { parameters.push_back(std::make_pair(ATT_PARAM_NAMES[param_name], param_value)); try { diff --git a/projects/rocprofiler/tests-v2/featuretests/profiler/profiler_gtest.cpp b/projects/rocprofiler/tests-v2/featuretests/profiler/profiler_gtest.cpp index 93d6ebafba..7b699dcf55 100644 --- a/projects/rocprofiler/tests-v2/featuretests/profiler/profiler_gtest.cpp +++ b/projects/rocprofiler/tests-v2/featuretests/profiler/profiler_gtest.cpp @@ -661,7 +661,7 @@ TEST_F(CodeobjTest, WhenRunningProfilerWithMultipleCaptureAndCopy) { EXPECT_NE(capture.symbols[i].base_address, 0); EXPECT_NE(capture.symbols[i].clock_start.value, 0); 
EXPECT_NE(capture.symbols[i].data, nullptr); - EXPECT_NE(capture.symbols[i].size, 0); + EXPECT_NE(capture.symbols[i].data_size, 0); } result = rocprofiler_codeobj_capture_stop(id);