Added option to control how codeobj is dumped from ATT
Change-Id: Ie76aeea1193c7ba8fe7f51be159516f8a9eab55f
[ROCm/rocprofiler commit: 99b14fc9f8]
Этот коммит содержится в:
@@ -392,10 +392,15 @@ Tool used to collect fine-grained hardware metrics. Provides ISA-level instructi
|
||||
- PERFCOUNTER_MASK=0xFFF // Bitmask for perfcounter collection. GFX9 only.
|
||||
- PERFCOUNTER=counter_name // Add a SQ counter to be collected with ATT; period defined by PERFCOUNTERS_CTRL. GFX9 only.
|
||||
- BUFFER_SIZE=[size] // Sets size of the ATT buffer collection, per dispatch, in megabytes (shared among all shader engines).
|
||||
- ISA_CAPTURE_MODE=[0,1,2] // Set capture mode during kernel dispatch.
|
||||
- ISA_CAPTURE_MODE=[0,1,2] // Set codeobj capture mode during kernel dispatch.
|
||||
- 0 = capture symbols only.
|
||||
- 1 = capture symbols for file:// and make a copy of memory://
|
||||
- 2 = Copy file:// and memory://
|
||||
- ISA_DUMP_MODE=[0,1,2,3] // Set how captured codeobj information is dumped when a trace record arrives.
|
||||
- 0 = Default. Dump everything.
|
||||
- 1 = Dump only the code object containing the kernel address in the kernel dispatch packet.
|
||||
- 2 = Dump a single kernel symbol matching the kernel dispatch packet.
|
||||
- 3 = Disables ISA Dumping.
|
||||
- By default, kernel names are truncated for ATT. To disable truncation, see the kernel name truncation section below.
|
||||
|
||||
- Example for vectoradd.
|
||||
|
||||
@@ -1185,6 +1185,10 @@ typedef struct {
|
||||
* Addr where codeobj is loaded
|
||||
*/
|
||||
uint64_t base_address;
|
||||
/**
|
||||
* Maximum offset from base address
|
||||
*/
|
||||
uint64_t mem_size;
|
||||
/**
|
||||
* If a copy of the codeobj is made, contains the data. Nullptr otherwise.
|
||||
*/
|
||||
@@ -1192,7 +1196,7 @@ typedef struct {
|
||||
/**
|
||||
* If a copy of the codeobj is made, contains the size of the data. 0 otherwise.
|
||||
*/
|
||||
uint64_t size;
|
||||
uint64_t data_size;
|
||||
/**
|
||||
* Timestamp for the time point this codeobj was loaded.
|
||||
*/
|
||||
|
||||
@@ -65,6 +65,9 @@ class att_plugin_t {
|
||||
|
||||
header.raw = reinterpret_cast<uint64_t>(data);
|
||||
header.reserved = 0x11;
|
||||
|
||||
isa_mode = static_cast<decltype(isa_mode)>(header.isadumpmode);
|
||||
header.isadumpmode = 0;
|
||||
}
|
||||
|
||||
bool MPI_ENABLE = false;
|
||||
@@ -72,6 +75,14 @@ class att_plugin_t {
|
||||
std::mutex writing_lock;
|
||||
bool is_valid_{true};
|
||||
rocprofiler::att_header_packet_t header{.raw = 0};
|
||||
rocprofiler::rocprofiler_att_isa_dump_mode isa_mode = rocprofiler::ISA_MODE_DUMP_ALL;
|
||||
|
||||
bool CheckAddrMatches(uint64_t kernel_addr, uint64_t base_address, uint64_t size)
|
||||
{
|
||||
if (isa_mode == rocprofiler::ISA_MODE_DUMP_ALL)
|
||||
return true;
|
||||
return (kernel_addr >= base_address) && (kernel_addr < base_address + size);
|
||||
}
|
||||
|
||||
inline bool att_file_exists(const std::string& name) {
|
||||
struct stat buffer;
|
||||
@@ -133,7 +144,7 @@ class att_plugin_t {
|
||||
<< '\n';
|
||||
|
||||
// iterate over each shader engine att trace
|
||||
// Bit 0 of userdata is the GFX9 flag; the remaining bits hold the kernel address
// (the address is recovered below with userdata >> 1). The parentheses are required:
// `!x & 0x1` parses as `(!x) & 0x1`, which is 1 only when the whole word is zero.
header.navi = !(att_tracer_record->intercept_list.userdata & 0x1);
|
||||
int se_num = att_tracer_record->shader_engine_data_count;
|
||||
for (int i = 0; i < se_num; i++) {
|
||||
if (!att_tracer_record->shader_engine_data ||
|
||||
@@ -155,6 +166,11 @@ class att_plugin_t {
|
||||
out.write(data_buffer_ptr, se_att_trace->buffer_size);
|
||||
}
|
||||
|
||||
if (isa_mode == rocprofiler::ISA_MODE_DUMP_NONE)
|
||||
return 0;
|
||||
|
||||
uint64_t kernel_addr = att_tracer_record->intercept_list.userdata >> 1;
|
||||
|
||||
std::ofstream isafile(outfilepath + "_isa.s");
|
||||
if (!isafile.is_open()) {
|
||||
std::cerr << "Could not open ISA file: " << outfilepath << "_isa.s" << std::endl;
|
||||
@@ -166,11 +182,13 @@ class att_plugin_t {
|
||||
const rocprofiler_intercepted_codeobj_t& symbol =
|
||||
att_tracer_record->intercept_list.symbols[i];
|
||||
|
||||
if (!CheckAddrMatches(kernel_addr, symbol.base_address, symbol.mem_size)) continue;
|
||||
|
||||
std::unique_ptr<CodeObjectBinary> binary;
|
||||
std::unique_ptr<code_object_decoder_t> decoder;
|
||||
|
||||
if (symbol.data && symbol.size) {
|
||||
decoder = std::make_unique<code_object_decoder_t>(symbol.data, symbol.size);
|
||||
if (symbol.data && symbol.data_size) {
|
||||
decoder = std::make_unique<code_object_decoder_t>(symbol.data, symbol.data_size);
|
||||
} else if (std::string(symbol.filepath).find("file://") != std::string::npos) {
|
||||
binary = std::make_unique<CodeObjectBinary>(symbol.filepath);
|
||||
decoder =
|
||||
@@ -179,6 +197,11 @@ class att_plugin_t {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isa_mode == rocprofiler::ISA_MODE_DUMP_KERNEL)
|
||||
decoder->disassemble_single_kernel(kernel_addr-symbol.base_address);
|
||||
else
|
||||
decoder->disassemble_kernels();
|
||||
|
||||
for (auto& instance : decoder->instructions) {
|
||||
uint64_t addr = instance.address + symbol.base_address;
|
||||
|
||||
|
||||
@@ -109,7 +109,11 @@ code_object_decoder_t::code_object_decoder_t(const char* codeobj_data, uint64_t
|
||||
}
|
||||
// load_symbol_map();
|
||||
}
|
||||
disassemble_kernels();
|
||||
|
||||
disassembly = std::make_unique<DisassemblyInstance>(*this);
|
||||
m_symbol_map = disassembly->GetKernelMap();
|
||||
|
||||
//disassemble_kernels();
|
||||
}
|
||||
|
||||
|
||||
@@ -170,8 +174,11 @@ void code_object_decoder_t::disassemble_kernel(uint64_t faddr, uint64_t vaddr) {
|
||||
}
|
||||
|
||||
void code_object_decoder_t::disassemble_kernels() {
|
||||
disassembly = std::make_unique<DisassemblyInstance>(*this);
|
||||
m_symbol_map = disassembly->GetKernelMap();
|
||||
|
||||
for (auto& [vaddr, v] : m_symbol_map) disassemble_kernel(v.faddr, vaddr);
|
||||
}
|
||||
|
||||
void code_object_decoder_t::disassemble_single_kernel(uint64_t kaddr) {
|
||||
for (auto& [vaddr, v] : m_symbol_map)
|
||||
if (kaddr >= vaddr && kaddr < vaddr + v.mem_size)
|
||||
disassemble_kernel(v.faddr, vaddr);
|
||||
}
|
||||
|
||||
@@ -39,7 +39,9 @@ class code_object_decoder_t {
|
||||
~code_object_decoder_t();
|
||||
|
||||
void disassemble_kernel(uint64_t faddr, uint64_t vaddr);
|
||||
void disassemble_single_kernel(uint64_t kaddr);
|
||||
void disassemble_kernels();
|
||||
|
||||
int m_fd;
|
||||
|
||||
std::map<uint64_t, std::pair<std::string, size_t>> m_line_number_map;
|
||||
|
||||
@@ -238,9 +238,7 @@ class perfetto_plugin_t {
|
||||
uint64_t device_id = profiler_record.gpu_id.handle;
|
||||
std::unordered_map<uint64_t, perfetto::Track>::iterator device_track_it;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(device_tracks_lock_);
|
||||
uint64_t device_track_id = getTrackID(machine_id_, TrackType::DEVICE, device_id);
|
||||
|
||||
device_track_it = device_tracks.find(device_track_id);
|
||||
if (device_track_it == device_tracks.end()) {
|
||||
/* Create a new perfetto::Track (Sub-Track) */
|
||||
@@ -250,30 +248,25 @@ class perfetto_plugin_t {
|
||||
gpu_desc.mutable_process()->set_pid(device_id);
|
||||
gpu_desc.mutable_process()->set_chrome_process_type(
|
||||
perfetto::protos::gen::ProcessDescriptor::PROCESS_GPU);
|
||||
gpu_desc.mutable_process()->set_process_name("Node: " + std::string(hostname_) +
|
||||
" Device: ");
|
||||
gpu_desc.mutable_process()->set_process_name("Node: " + std::string(hostname_)
|
||||
+ std::to_string(GetPid()) + " Device: ");
|
||||
perfetto::TrackEvent::SetTrackDescriptor(device_track_it->second, gpu_desc);
|
||||
track_ids_used_.emplace_back(device_track_id);
|
||||
}
|
||||
}
|
||||
auto& gpu_track = device_track_it->second;
|
||||
std::pair<int, uint64_t> gpu_queue_id =
|
||||
std::make_pair(device_id, profiler_record.queue_id.handle);
|
||||
auto queue_track_it = queue_tracks_.find(gpu_queue_id.first);
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(stream_tracks_lock_);
|
||||
uint64_t queue_track_id = getTrackID(machine_id_, device_id+TrackType::DEVICE_ID, gpu_queue_id.first);
|
||||
queue_track_it = queue_tracks_.find(queue_track_id);
|
||||
if (queue_track_it == queue_tracks_.end()) {
|
||||
/* Create a new perfetto::Track */
|
||||
queue_track_it =
|
||||
queue_tracks_.emplace(queue_track_id, perfetto::Track(queue_track_id, gpu_track)).first;
|
||||
uint64_t queue_track_id
|
||||
= getTrackID(machine_id_, device_id+TrackType::DEVICE_ID, profiler_record.queue_id.handle);
|
||||
auto queue_track_it = queue_tracks_.find(queue_track_id);
|
||||
if (queue_track_it == queue_tracks_.end()) {
|
||||
/* Create a new perfetto::Track */
|
||||
queue_track_it =
|
||||
queue_tracks_.emplace(queue_track_id, perfetto::Track(queue_track_id, gpu_track)).first;
|
||||
|
||||
auto queue_desc = queue_track_it->second.Serialize();
|
||||
std::string queue_str = rocprofiler::string_printf("Queue %ld", gpu_queue_id.second);
|
||||
queue_desc.set_name(queue_str);
|
||||
perfetto::TrackEvent::SetTrackDescriptor(queue_track_it->second, queue_desc);
|
||||
}
|
||||
auto queue_desc = queue_track_it->second.Serialize();
|
||||
std::string queue_str = rocprofiler::string_printf("Queue %ld", profiler_record.queue_id.handle);
|
||||
queue_desc.set_name(queue_str);
|
||||
perfetto::TrackEvent::SetTrackDescriptor(queue_track_it->second, queue_desc);
|
||||
track_ids_used_.emplace_back(queue_track_id);
|
||||
}
|
||||
auto& queue_track = queue_track_it->second;
|
||||
@@ -306,7 +299,6 @@ class perfetto_plugin_t {
|
||||
std::pair<int, std::string> gpu_counter_track_id = std::make_pair(device_id, counter_name);
|
||||
std::unordered_map<std::string, perfetto::CounterTrack>::iterator counters_track_it;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(counter_tracks_lock_);
|
||||
counters_track_it = counter_tracks_.find(gpu_counter_track_id.second);
|
||||
if (counters_track_it == counter_tracks_.end()) {
|
||||
/* Create a new perfetto::Track */
|
||||
@@ -643,9 +635,6 @@ class perfetto_plugin_t {
|
||||
std::atomic<uint64_t> track_counter_{GetPid()};
|
||||
std::vector<uint64_t> track_ids_used_;
|
||||
|
||||
std::mutex roctx_tracks_lock_, hsa_tracks_lock_, hip_tracks_lock_, device_tracks_lock_;
|
||||
std::mutex stream_tracks_lock_, counter_tracks_lock_, mem_copies_tracks_lock_;
|
||||
|
||||
char hostname_[1024];
|
||||
uint64_t machine_id_;
|
||||
|
||||
|
||||
@@ -1152,12 +1152,16 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u
|
||||
session_id_snapshot, buffer_id, profile, kernel_properties,
|
||||
(uint32_t)syscall(__NR_gettid), user_pkt_index);
|
||||
|
||||
uint64_t userdata = HSASupport_Singleton::GetInstance()
|
||||
uint64_t off = dispatch_packet.kernel_object +
|
||||
GetKernelCode(dispatch_packet.kernel_object)->kernel_code_entry_byte_offset;
|
||||
codeobj_record::make_capture(rocprofiler_record_id_t{record_id}, capture_mode, off);
|
||||
|
||||
uint64_t IsGFX9 = HSASupport_Singleton::GetInstance()
|
||||
.GetHSAAgentInfo(queue_info.GetGPUAgent().handle)
|
||||
.GetDeviceInfo()
|
||||
.getName()
|
||||
.find("gfx9") != std::string::npos;
|
||||
codeobj_record::make_capture(rocprofiler_record_id_t{record_id}, capture_mode, userdata);
|
||||
codeobj_record::make_capture(rocprofiler_record_id_t{record_id}, capture_mode, IsGFX9 | (off<<1));
|
||||
codeobj_record::start_capture(rocprofiler_record_id_t{record_id});
|
||||
codeobj_record::stop_capture(rocprofiler_record_id_t{record_id});
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ class codeobj_capture_instance {
|
||||
|
||||
/**
 * Builds the record describing this captured code object.
 *
 * The data pointer is nullptr when the internal buffer is empty; the data size is
 * the buffer's size in either case.
 *
 * @return Record initialized, in order, from: URI, addr, mem_size, buffer pointer,
 *         buffer size, start_time, end_time.
 */
rocprofiler_intercepted_codeobj_t get() const {
  const char* buf_ptr = buffer.size() ? buffer.data() : nullptr;
  // mem_size must sit between the base address and the data pointer to match the
  // field order of rocprofiler_intercepted_codeobj_t.
  return {URI.c_str(), addr, mem_size, buf_ptr, buffer.size(), start_time, end_time};
}
|
||||
|
||||
const uint64_t addr;
|
||||
|
||||
@@ -44,6 +44,13 @@ typedef struct {
|
||||
uint64_t queue_index;
|
||||
} att_pending_signal_t;
|
||||
|
||||
/**
 * Selects how captured code object information is dumped when an ATT trace record
 * arrives (ISA_DUMP_MODE input parameter). The numeric values are part of the
 * user-facing parameter and are carried in the 3-bit isadumpmode header field,
 * so they are spelled out explicitly.
 */
enum rocprofiler_att_isa_dump_mode {
  ISA_MODE_DUMP_ALL = 0,     // Default. Dump everything.
  ISA_MODE_DUMP_OBJ = 1,     // Dump only the code object containing the dispatched kernel address.
  ISA_MODE_DUMP_KERNEL = 2,  // Dump a single kernel symbol matching the kernel dispatch packet.
  ISA_MODE_DUMP_NONE = 3     // Disable ISA dumping.
};
|
||||
|
||||
union att_header_packet_t {
|
||||
struct {
|
||||
uint64_t reserved : 14;
|
||||
@@ -53,6 +60,7 @@ union att_header_packet_t {
|
||||
uint64_t DCU : 5;
|
||||
uint64_t DSA : 1;
|
||||
uint64_t SEID : 6;
|
||||
uint64_t isadumpmode : 3;
|
||||
};
|
||||
uint64_t raw;
|
||||
};
|
||||
|
||||
@@ -380,6 +380,11 @@ att_parsed_input_t GetATTParams() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (param_name.find("ISA_DUMP_MODE") != std::string::npos) {
|
||||
header.isadumpmode = param_value;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ATT_PARAM_NAMES.find(param_name) != ATT_PARAM_NAMES.end()) {
|
||||
parameters.push_back(std::make_pair(ATT_PARAM_NAMES[param_name], param_value));
|
||||
try {
|
||||
|
||||
@@ -661,7 +661,7 @@ TEST_F(CodeobjTest, WhenRunningProfilerWithMultipleCaptureAndCopy) {
|
||||
EXPECT_NE(capture.symbols[i].base_address, 0);
|
||||
EXPECT_NE(capture.symbols[i].clock_start.value, 0);
|
||||
EXPECT_NE(capture.symbols[i].data, nullptr);
|
||||
EXPECT_NE(capture.symbols[i].size, 0);
|
||||
EXPECT_NE(capture.symbols[i].data_size, 0);
|
||||
}
|
||||
|
||||
result = rocprofiler_codeobj_capture_stop(id);
|
||||
|
||||
Ссылка в новой задаче
Block a user