From f275cdd602109b5c851b0e4cdaa041a029368a76 Mon Sep 17 00:00:00 2001 From: Giovanni LB Date: Thu, 30 Nov 2023 14:16:29 -0300 Subject: [PATCH] SWDEV-432445: ATT continuous mode update part2. Added codeobj tracking. Change-Id: I1b58af70d221bbeb9b4cab960d26357a504045dd [ROCm/rocprofiler commit: edf93d48ab0ef6c1d32aa50478973433b5f21cdb] --- .../rocprofiler/plugin/att/CMakeLists.txt | 1 + .../rocprofiler/plugin/att/code_printing.cpp | 131 ++++++------- .../rocprofiler/plugin/att/code_printing.hpp | 127 +++++++++++-- .../rocprofiler/plugin/att/disassembly.cpp | 176 ++++++++++++------ .../rocprofiler/plugin/att/disassembly.hpp | 15 +- projects/rocprofiler/plugin/att/segment.hpp | 139 ++++++++++++++ projects/rocprofiler/plugin/att/service.py | 153 +++++++++------ projects/rocprofiler/plugin/att/stitch.py | 22 ++- projects/rocprofiler/plugin/att/ui/index.html | 47 ++--- projects/rocprofiler/plugin/exportmap | 9 +- .../rocprofiler/src/core/hsa/hsa_support.cpp | 2 +- .../core/hsa/packets/packets_generator.cpp | 15 +- .../src/core/hsa/packets/packets_generator.h | 7 +- .../core/isa_capture/code_object_track.cpp | 10 +- .../core/isa_capture/code_object_track.hpp | 28 ++- .../rocprofiler/src/core/session/att/att.h | 16 +- .../src/core/session/att/continuous.cpp | 120 ++++++++++-- .../rocprofiler/src/util/hsa_rsrc_factory.cpp | 2 - 18 files changed, 731 insertions(+), 289 deletions(-) create mode 100644 projects/rocprofiler/plugin/att/segment.hpp diff --git a/projects/rocprofiler/plugin/att/CMakeLists.txt b/projects/rocprofiler/plugin/att/CMakeLists.txt index f4af4f6795..eda2cb5c67 100644 --- a/projects/rocprofiler/plugin/att/CMakeLists.txt +++ b/projects/rocprofiler/plugin/att/CMakeLists.txt @@ -56,6 +56,7 @@ configure_file(trace_view.py att/trace_view.py COPYONLY) configure_file(stitch.py att/stitch.py COPYONLY) configure_file(drawing.py att/drawing.py COPYONLY) configure_file(att_to_csv.py att/att_to_csv.py COPYONLY) +configure_file(service.py att/service.py COPYONLY) configure_file(ui/index.html att/ui/index.html COPYONLY) configure_file(ui/logo.svg att/ui/logo.svg COPYONLY) configure_file(ui/styles.css att/ui/styles.css COPYONLY) diff --git a/projects/rocprofiler/plugin/att/code_printing.cpp b/projects/rocprofiler/plugin/att/code_printing.cpp index dd089d2837..f6ab9c2a10 100644 --- a/projects/rocprofiler/plugin/att/code_printing.cpp +++ b/projects/rocprofiler/plugin/att/code_printing.cpp @@ -69,11 +69,8 @@ catch (...) \ return returndata; \ } -code_object_decoder_t::code_object_decoder_t(const char* codeobj_data, uint64_t codeobj_size) { - buffer = std::vector{}; - buffer.resize(codeobj_size); - std::memcpy(buffer.data(), codeobj_data, codeobj_size); - +CodeObjDecoderComponent::CodeObjDecoderComponent(const char* codeobj_data, uint64_t codeobj_size) +{ m_fd = -1; #if defined(_GNU_SOURCE) && defined(MFD_ALLOW_SEALING) && defined(MFD_CLOEXEC) m_fd = ::memfd_create(m_uri.c_str(), MFD_ALLOW_SEALING | MFD_CLOEXEC); @@ -86,7 +83,7 @@ code_object_decoder_t::code_object_decoder_t(const char* codeobj_data, uint64_t return; } - if (size_t size = ::write(m_fd, buffer.data(), buffer.size()); size != buffer.size()) { + if (size_t size = ::write(m_fd, codeobj_data, codeobj_size); size != codeobj_size) { printf("could not write to the temporary file\n"); return; } @@ -130,26 +127,20 @@ code_object_decoder_t::code_object_decoder_t(const char* codeobj_data, uint64_t // load_symbol_map(); } - try { - disassembly = std::make_unique(*this); // Can throw - } catch(std::exception& e) { - return; - } + disassembly = std::make_unique(codeobj_data, codeobj_size, std::nullopt); // Can throw try { m_symbol_map = disassembly->GetKernelMap(); // Can throw - } catch(std::exception& e) { - return; - } + } catch(...) {} //disassemble_kernels(); } -code_object_decoder_t::~code_object_decoder_t() { +CodeObjDecoderComponent::~CodeObjDecoderComponent() { if (m_fd) ::close(m_fd); } -std::optional code_object_decoder_t::find_symbol(uint64_t vaddr) { +std::optional CodeObjDecoderComponent::find_symbol(uint64_t vaddr) { /* Load the symbol table. */ auto it = m_symbol_map.upper_bound(vaddr); if (it == m_symbol_map.begin()) @@ -172,7 +163,7 @@ std::optional code_object_decoder_t::find_symbol(uint64_t vaddr) { } std::pair -code_object_decoder_t::disassemble_instruction(uint64_t faddr, uint64_t vaddr) +CodeObjDecoderComponent::disassemble_instruction(uint64_t faddr, uint64_t vaddr) { if (!disassembly) throw std::exception(); @@ -191,7 +182,7 @@ code_object_decoder_t::disassemble_instruction(uint64_t faddr, uint64_t vaddr) return {disassembly->last_instruction, size}; } -void code_object_decoder_t::disassemble_kernel(uint64_t faddr, uint64_t vaddr) +void CodeObjDecoderComponent::disassemble_kernel(uint64_t faddr, uint64_t vaddr) { if (!disassembly) return; auto symbol = find_symbol(vaddr); @@ -216,17 +207,18 @@ void code_object_decoder_t::disassemble_kernel(uint64_t faddr, uint64_t vaddr) } } -void code_object_decoder_t::disassemble_kernels() { +void CodeObjDecoderComponent::disassemble_kernels() { for (auto& [vaddr, v] : m_symbol_map) disassemble_kernel(v.faddr, vaddr); } -void code_object_decoder_t::disassemble_single_kernel(uint64_t kaddr) { +void CodeObjDecoderComponent::disassemble_single_kernel(uint64_t kaddr) { for (auto& [vaddr, v] : m_symbol_map) if (kaddr >= vaddr && kaddr < vaddr + v.mem_size) disassemble_kernel(v.faddr, vaddr); } -CodeobjService::CodeobjService(const char* filepath, uint64_t load_base): load_base(load_base) +CodeobjDecoder::CodeobjDecoder(const char* filepath, uint64_t loadbase, uint64_t mem_size): + loadbase(loadbase), load_end(loadbase + mem_size) { if (!filepath) throw "Empty filepath."; @@ -245,22 +237,23 @@ CodeobjService::CodeobjService(const char* filepath, uint64_t load_base): load_b file.seekg(0, file.beg); file.read(buffer.data(), buffer.size()); - decoder = std::make_unique(buffer.data(), buffer.size()); + decoder = std::make_unique(buffer.data(), buffer.size()); } else { std::unique_ptr binary = std::make_unique(filepath); - decoder = std::make_unique(binary->buffer.data(), binary->buffer.size()); + auto& buffer = binary->buffer; + decoder = std::make_unique(buffer.data(), buffer.size()); } + + auto elf_segments = decoder->disassembly->getSegments(); } -bool CodeobjService::decode_single(uint64_t vaddr, uint64_t faddr) +bool CodeobjDecoder::add_to_map(uint64_t faddr, uint64_t vaddr, uint64_t voffset) { - if (!decoder->disassembly) return false; - try { - decoded_map[vaddr] = decoder->disassemble_instruction(faddr, vaddr-load_base); + decoded_map[vaddr] = decoder->disassemble_instruction(faddr, voffset); } catch(std::exception& e) { @@ -269,28 +262,27 @@ bool CodeobjService::decode_single(uint64_t vaddr, uint64_t faddr) return true; } -std::pair& CodeobjService::getDecoded(uint64_t addr) +bool CodeobjDecoder::decode_single_at_offset(uint64_t vaddr, uint64_t voffset) +{ + auto faddr = decoder->disassembly->va2fo(voffset); + if (!faddr) + return false; + + return add_to_map(*faddr, vaddr, voffset); +} + +bool CodeobjDecoder::decode_single(uint64_t vaddr) +{ + if (!decoder || vaddr < loadbase) return false; + return decode_single_at_offset(vaddr, vaddr-loadbase); +} + +std::pair& CodeobjDecoder::getDecoded(uint64_t addr) { if (decoded_map.find(addr) != decoded_map.end()) return decoded_map[addr]; - std::optional faddr{}; - - if (!bNotElfFILE) - { - faddr = DisassemblyInstance::va2fo(decoder->buffer.data(), addr-load_base); - if (!faddr) - bNotElfFILE = true; - } - - if (bNotElfFILE && decoder->buffer.size() > 0x100) { - uint64_t f_offset = *reinterpret_cast(decoder->buffer.data()+0xb8); - uint64_t v_offset = *reinterpret_cast(decoder->buffer.data()+0xc8); - - faddr = addr+f_offset-load_base-v_offset; - } - - if (!faddr || !decode_single(addr, *faddr)) + if (!decode_single(addr)) { std::cerr << "Invalid addr: " << std::hex << addr << std::dec << std::endl; throw std::exception(); @@ -299,62 +291,49 @@ std::pair& CodeobjService::getDecoded(uint64_t a return decoded_map[addr]; } -std::unordered_map> services{}; -std::atomic shandles{1}; - #define PUBLIC_API __attribute__((visibility("default"))) +CodeobjTableTranslation table; + extern "C" { - PUBLIC_API uint64_t createService(const char* filename, uint64_t load_base) + PUBLIC_API int addDecoder(const char* filename, uint32_t id, uint64_t loadbase, uint64_t memsize) { C_API_BEGIN - uint64_t handle = shandles.fetch_add(1); - services[handle] = std::make_unique(filename, load_base); - return handle; + table.addDecoder(filename, id, loadbase, memsize); + return 0; - C_API_END(0) + C_API_END(1) } - PUBLIC_API int deleteService(uint64_t handle) + PUBLIC_API int removeDecoder(uint32_t id, uint64_t loadbase) { - return services.erase(handle); + return table.removeDecoder(id, loadbase) != false; } - PUBLIC_API const char* getInstruction(uint64_t handle, uint64_t addr) + PUBLIC_API instruction_info_t getInstructionFromAddr(uint64_t vaddr) { + static instruction_info_t default_info{nullptr, nullptr, 0}; C_API_BEGIN - return services.at(handle)->getInstruction(addr); + return table.get(vaddr); - C_API_END(nullptr) + C_API_END(default_info) } - PUBLIC_API const char* getCppref(uint64_t handle, uint64_t addr) - { - C_API_BEGIN - - return services.at(handle)->getCppref(addr); - - C_API_END(nullptr) - } - PUBLIC_API size_t getInstSize(uint64_t handle, uint64_t addr) + PUBLIC_API instruction_info_t getInstructionFromID(uint32_t id, uint64_t offset) { + static instruction_info_t default_info{nullptr, nullptr, 0}; C_API_BEGIN - return services.at(handle)->getSize(addr); + return table.get(id, offset); - C_API_END(0) + C_API_END(default_info) } PUBLIC_API const char* getSymbolName(uint64_t addr) { C_API_BEGIN - for (auto& [handle, service] : services) - { - if (!service->inrange(addr)) continue; - return service->getSymbolName(addr); - } - return nullptr; + return table.getSymbolName(addr); C_API_END(nullptr) } -} \ No newline at end of file +} diff --git a/projects/rocprofiler/plugin/att/code_printing.hpp b/projects/rocprofiler/plugin/att/code_printing.hpp index 17c9a7463f..5e5a294080 100644 --- a/projects/rocprofiler/plugin/att/code_printing.hpp +++ b/projects/rocprofiler/plugin/att/code_printing.hpp @@ -30,13 +30,15 @@ #include #include "disassembly.hpp" +#include "segment.hpp" -class code_object_decoder_t { +class CodeObjDecoderComponent +{ public: std::optional find_symbol(uint64_t address); - code_object_decoder_t(const char* codeobj_data, uint64_t codeobj_size); - ~code_object_decoder_t(); + CodeObjDecoderComponent(const char* codeobj_data, uint64_t codeobj_size); + ~CodeObjDecoderComponent(); std::pair disassemble_instruction(uint64_t faddr, uint64_t vaddr); @@ -50,44 +52,133 @@ public: std::map m_symbol_map{}; std::string m_uri; - std::vector buffer; std::vector instructions{}; std::unique_ptr disassembly{}; }; -class CodeobjService +typedef struct { + const char* inst; + const char* cpp; + size_t size; +} instruction_info_t; + +class CodeobjDecoder { public: - CodeobjService(const char* filepath, uint64_t load_base); - bool decode_single(uint64_t vaddr, uint64_t faddr); + CodeobjDecoder(const char* filepath, uint64_t loadbase, uint64_t memsize); + bool decode_single(uint64_t vaddr); + bool decode_single_at_offset(uint64_t vaddr, uint64_t voffset); + bool add_to_map(uint64_t faddr, uint64_t vaddr, uint64_t voffset); std::pair& getDecoded(uint64_t addr); const char* getInstruction(uint64_t addr) { return getDecoded(addr).first.instruction; } const char* getCppref(uint64_t addr) { return getDecoded(addr).first.cpp_reference; } size_t getSize(uint64_t addr) { return getDecoded(addr).second; } - - uint64_t size() const { - if (!decoder) return 0; - return decoder->buffer.size(); + instruction_info_t get(uint64_t addr) { + auto& inst = getDecoded(addr); + return {inst.first.instruction, inst.first.cpp_reference, inst.second}; } - uint64_t begin() const { return load_base; }; - uint64_t end() const { return begin() + size(); } + + uint64_t begin() const { return loadbase; }; + uint64_t end() const { return load_end; } + uint64_t size() const { return load_end-loadbase; } bool inrange(uint64_t addr) const { return addr >= begin() && addr < end(); } const char* getSymbolName(uint64_t addr) const { if (!decoder) return nullptr; - auto it = decoder->m_symbol_map.find(addr-load_base); + auto it = decoder->m_symbol_map.find(addr-loadbase); if (it != decoder->m_symbol_map.end()) return it->second.name.data(); return nullptr; } + std::vector> elf_segments{}; + private: - const uint64_t load_base; + const uint64_t loadbase; + uint64_t load_end = 0; std::unordered_map> decoded_map; - std::unique_ptr decoder{nullptr}; - - bool bNotElfFILE = false; + std::unique_ptr decoder{nullptr}; +}; + +/** + * @brief Maps ID and offsets into instructions +*/ +class CodeobjList +{ +public: + CodeobjList() = default; + + virtual void addDecoder(const char* filepath, uint32_t id, uint64_t loadbase, uint64_t memsize) + { + decoders[id] = std::make_shared(filepath, loadbase, memsize); + } + + virtual bool removeDecoder(uint32_t id) + { + return decoders.erase(id) != 0; + } + + instruction_info_t get(uint32_t id, uint64_t offset) + { + auto& decoder = decoders.at(id); + auto& inst = decoder->getDecoded(decoder->begin() + offset); + return {inst.first.instruction, inst.first.cpp_reference, inst.second}; + } + + const char* getSymbolName(uint32_t id, uint64_t offset) + { + auto& decoder = decoders.at(id); + uint64_t vaddr = decoder->begin() + offset; + if (decoder->inrange(vaddr)) + return decoder->getSymbolName(vaddr); + return nullptr; + } + +protected: + std::unordered_map> decoders{}; +}; + +/** + * @brief Translates virtual addresses to elf file offsets +*/ +class CodeobjTableTranslation : protected CodeobjList +{ + using Super = CodeobjList; +public: + CodeobjTableTranslation() = default; + + void addDecoder(const char* filepath, uint32_t id, uint64_t loadbase, uint64_t memsize) override + { + this->Super::addDecoder(filepath, id, loadbase, memsize); + auto ptr = decoders.at(id); + table.insert({ptr->begin(), static_cast(ptr->size()), id, 0}); + } + + virtual bool removeDecoder(uint32_t id, uint64_t loadbase) + { + return table.remove(loadbase) && this->Super::removeDecoder(id); + } + + instruction_info_t get(uint64_t vaddr) + { + auto& addr_range = table.find_codeobj_in_range(vaddr); + return get(addr_range.id, vaddr - addr_range.vbegin); + } + instruction_info_t get(uint32_t id, uint64_t offset) { return this->Super::get(id, offset); } + + const char* getSymbolName(uint64_t vaddr) + { + for (auto& [_, decoder] : decoders) + { + if (!decoder->inrange(vaddr)) continue; + return decoder->getSymbolName(vaddr); + } + return nullptr; + } + +private: + CodeobjTableTranslator table; }; diff --git a/projects/rocprofiler/plugin/att/disassembly.cpp b/projects/rocprofiler/plugin/att/disassembly.cpp index b7be35de96..659d068892 100644 --- a/projects/rocprofiler/plugin/att/disassembly.cpp +++ b/projects/rocprofiler/plugin/att/disassembly.cpp @@ -122,58 +122,65 @@ CodeObjectBinary::CodeObjectBinary(const std::string& uri) : m_uri(uri) { }); buffer = std::vector{}; - try { - size_t offset{0}, size{0}; + size_t offset{0}, size{0}; - if (auto offset_it = params.find("offset"); offset_it != params.end()) { - offset = std::stoul(offset_it->second, nullptr, 0); - } - - if (auto size_it = params.find("size"); size_it != params.end()) { - if (!(size = std::stoul(size_it->second, nullptr, 0))) return; - } - - if (protocol != "file") throw protocol + " protocol not supported!"; - - std::ifstream file(decoded_path, std::ios::in | std::ios::binary); - if (!file || !file.is_open()) throw "could not open " + decoded_path; - - if (!size) { - file.ignore(std::numeric_limits::max()); - size_t bytes = file.gcount(); - file.clear(); - - if (bytes < offset) - throw "invalid uri " + decoded_path + " (file size < offset)"; - - size = bytes - offset; - } - - file.seekg(offset, std::ios_base::beg); - buffer.resize(size); - file.read(&buffer[0], size); - } catch (...) { + if (auto offset_it = params.find("offset"); offset_it != params.end()) { + offset = std::stoul(offset_it->second, nullptr, 0); } + + if (auto size_it = params.find("size"); size_it != params.end()) { + if (!(size = std::stoul(size_it->second, nullptr, 0))) return; + } + + if (protocol != "file") throw protocol + " protocol not supported!"; + + std::ifstream file(decoded_path, std::ios::in | std::ios::binary); + if (!file || !file.is_open()) throw "could not open " + decoded_path; + + if (!size) { + file.ignore(std::numeric_limits::max()); + size_t bytes = file.gcount(); + file.clear(); + + if (bytes < offset) + throw "invalid uri " + decoded_path + " (file size < offset)"; + + size = bytes - offset; + } + + file.seekg(offset, std::ios_base::beg); + buffer.resize(size); + file.read(&buffer[0], size); } -DisassemblyInstance::DisassemblyInstance(code_object_decoder_t& decoder) - : buffer(reinterpret_cast(decoder.buffer.data())), - size(decoder.buffer.size()) +DisassemblyInstance::DisassemblyInstance( + const char* codeobj_data, + uint64_t codeobj_size, + std::optional input_isa +) { - THROW_COMGR(amd_comgr_create_data(AMD_COMGR_DATA_KIND_EXECUTABLE, &data)); - THROW_COMGR(amd_comgr_set_data(data, size, decoder.buffer.data())); + buffer = std::vector(codeobj_size, 0); + std::memcpy(buffer.data(), codeobj_data, codeobj_size); - /*std::cout << "checking isa" << std::endl; - char isa_name[128]; - size_t isa_size = sizeof(isa_name); - CHECK_COMGR(amd_comgr_get_data_isa_name(data, &isa_size, isa_name)); - std::cout << isa_name << std::endl; */ - const char* isa_name = "amdgcn-amd-amdhsa--gfx1100"; + THROW_COMGR(amd_comgr_create_data(AMD_COMGR_DATA_KIND_EXECUTABLE, &data)); + THROW_COMGR(amd_comgr_set_data(data, buffer.size(), buffer.data())); + if (!input_isa) + { + input_isa = "amdgcn-amd-amdhsa--gfx1100"; + } + if (!input_isa) + { + input_isa = std::string(); + input_isa->resize(128); + size_t isa_size = sizeof(input_isa->size()); + THROW_COMGR(amd_comgr_get_data_isa_name(data, &isa_size, input_isa->data())); + } THROW_COMGR(amd_comgr_create_disassembly_info( - isa_name, + input_isa->data(), &DisassemblyInstance::memory_callback, &DisassemblyInstance::inst_callback, [](uint64_t address, void* user_data) {}, &info)); + } static bool IsKernelType(amd_comgr_symbol_type_t type) @@ -208,7 +215,7 @@ amd_comgr_status_t DisassemblyInstance::symbol_callback(amd_comgr_symbol_t symbo RETURN_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME, name.data())); DisassemblyInstance& instance = *static_cast(user_data); - std::optional faddr = va2fo(instance.buffer, vaddr); + std::optional faddr = instance.va2fo(vaddr); if (faddr) instance.symbol_map[vaddr] = {name, *faddr, mem_size}; @@ -230,7 +237,7 @@ DisassemblyInstance::~DisassemblyInstance() { uint64_t DisassemblyInstance::ReadInstruction(uint64_t faddr, uint64_t vaddr, const char* cpp_line) { uint64_t size_read; - uint64_t addr_in_buffer = reinterpret_cast(buffer) + faddr; + uint64_t addr_in_buffer = reinterpret_cast(buffer.data()) + faddr; THROW_COMGR(amd_comgr_disassemble_instruction(info, addr_in_buffer, (void*)this, &size_read)); @@ -242,7 +249,8 @@ uint64_t DisassemblyInstance::ReadInstruction(uint64_t faddr, uint64_t vaddr, co uint64_t DisassemblyInstance::memory_callback(uint64_t from, char* to, uint64_t size, void* user_data) { DisassemblyInstance& instance = *static_cast(user_data); - int64_t copysize = reinterpret_cast(instance.buffer) + instance.size - (int64_t)from; + int64_t copysize = reinterpret_cast(instance.buffer.data()) + + instance.buffer.size() - static_cast(from); copysize = std::min(size, copysize); std::memcpy(to, (char*)from, copysize); return copysize; @@ -261,11 +269,11 @@ void DisassemblyInstance::inst_callback(const char* instruction, void* user_data // mem - input argument, start of the elf // va - input argument, virtual address // return file offset, if found -std::optional DisassemblyInstance::va2fo(void *mem, uint64_t va) +std::optional DisassemblyInstance::va2fo(uint64_t va) { - CHECK_VA2FO(mem, "mem is nullptr"); + /*CHECK_VA2FO(buffer.size(), "buffer is not large enough"); - uint8_t *e_ident = (uint8_t*)mem; + uint8_t *e_ident = (uint8_t*)buffer.data(); CHECK_VA2FO(e_ident, "e_ident is nullptr"); CHECK_VA2FO( @@ -277,20 +285,25 @@ std::optional DisassemblyInstance::va2fo(void *mem, uint64_t va) CHECK_VA2FO(e_ident[EI_CLASS] == ELFCLASS64, "unexpected ei_class"); CHECK_VA2FO(e_ident[EI_DATA] == ELFDATA2LSB, "unexpected ei_data"); CHECK_VA2FO(e_ident[EI_VERSION] == EV_CURRENT, "unexpected ei_version"); - CHECK_VA2FO(e_ident[EI_OSABI] == 64 /*ELFOSABI_AMDGPU_HSA*/, "unexpected ei_osabi"); + CHECK_VA2FO(e_ident[EI_OSABI] == 64, "unexpected ei_osabi"); // ELFOSABI_AMDGPU_HSA CHECK_VA2FO( - e_ident[EI_ABIVERSION] == 2 /*ELFABIVERSION_AMDGPU_HSA_V4*/ || - e_ident[EI_ABIVERSION] == 3 /*ELFABIVERSION_AMDGPU_HSA_V5*/ , "unexpected ei_abiversion"); + e_ident[EI_ABIVERSION] == 2 || // ELFABIVERSION_AMDGPU_HSA_V4 + e_ident[EI_ABIVERSION] == 3, "unexpected ei_abiversion"); // ELFABIVERSION_AMDGPU_HSA_V5 - Elf64_Ehdr *ehdr = (Elf64_Ehdr*)mem; + Elf64_Ehdr *ehdr = (Elf64_Ehdr*)buffer.data(); + CHECK_VA2FO(buffer.size() > ehdr->e_phoff + sizeof(Elf64_Ehdr), "buffer is not large enough"); CHECK_VA2FO(ehdr, "ehdr is nullptr"); CHECK_VA2FO(ehdr->e_type == ET_DYN, "unexpected e_type"); - CHECK_VA2FO(ehdr->e_machine == ELF::EM_AMDGPU, "unexpected e_machine"); + CHECK_VA2FO(ehdr->e_machine == ELF::EM_AMDGPU, "unexpected e_machine"); */ + CHECK_VA2FO(buffer.size() > sizeof(Elf64_Ehdr), "buffer is not large enough"); + Elf64_Ehdr *ehdr = (Elf64_Ehdr*)buffer.data(); CHECK_VA2FO(ehdr->e_phoff != 0, "unexpected e_phoff"); - Elf64_Phdr *phdr = (Elf64_Phdr*)((uint8_t*)mem + ehdr->e_phoff); + CHECK_VA2FO(buffer.size() > ehdr->e_phoff + sizeof(Elf64_Phdr), "buffer is not large enough"); + + Elf64_Phdr *phdr = (Elf64_Phdr*)((uint8_t*)buffer.data() + ehdr->e_phoff); CHECK_VA2FO(phdr, "phdr is nullptr"); for (uint16_t i = 0; i < ehdr->e_phnum; ++i) @@ -304,3 +317,58 @@ std::optional DisassemblyInstance::va2fo(void *mem, uint64_t va) } return std::nullopt; } + +#undef CHECK_VA2FO +#define CHECK_VA2FO(x, msg) if (!(x)) { \ + std::cerr << __FILE__ << ' ' << __LINE__ << ' ' << msg << std::endl; \ + return {}; \ +} + +std::vector> DisassemblyInstance::getSegments() +{ + /*CHECK_VA2FO(buffer.size(), "buffer is not large enough"); + + uint8_t *e_ident = (uint8_t*)buffer.data(); + CHECK_VA2FO(e_ident, "e_ident is nullptr"); + + CHECK_VA2FO( + e_ident[EI_MAG0] == ELFMAG0 || + e_ident[EI_MAG1] == ELFMAG1 || + e_ident[EI_MAG2] == ELFMAG2 || + e_ident[EI_MAG3] == ELFMAG3, "unexpected ei_mag"); + + CHECK_VA2FO(e_ident[EI_CLASS] == ELFCLASS64, "unexpected ei_class"); + CHECK_VA2FO(e_ident[EI_DATA] == ELFDATA2LSB, "unexpected ei_data"); + CHECK_VA2FO(e_ident[EI_VERSION] == EV_CURRENT, "unexpected ei_version"); + CHECK_VA2FO(e_ident[EI_OSABI] == 64, "unexpected ei_osabi"); // ELFOSABI_AMDGPU_HSA + + CHECK_VA2FO( + e_ident[EI_ABIVERSION] == 2 || // ELFABIVERSION_AMDGPU_HSA_V4 + e_ident[EI_ABIVERSION] == 3, "unexpected ei_abiversion"); // ELFABIVERSION_AMDGPU_HSA_V5 + + Elf64_Ehdr *ehdr = (Elf64_Ehdr*)buffer.data(); + CHECK_VA2FO(buffer.size() > ehdr->e_phoff + sizeof(Elf64_Ehdr), "buffer is not large enough"); + CHECK_VA2FO(ehdr, "ehdr is nullptr"); + CHECK_VA2FO(ehdr->e_type == ET_DYN, "unexpected e_type"); + CHECK_VA2FO(ehdr->e_machine == ELF::EM_AMDGPU, "unexpected e_machine"); */ + + CHECK_VA2FO(buffer.size() > sizeof(Elf64_Ehdr), "buffer is not large enough"); + Elf64_Ehdr *ehdr = (Elf64_Ehdr*)buffer.data(); + CHECK_VA2FO(ehdr->e_phoff != 0, "unexpected e_phoff"); + + CHECK_VA2FO(buffer.size() > ehdr->e_phoff + sizeof(Elf64_Phdr), "buffer is not large enough"); + + Elf64_Phdr *phdr = (Elf64_Phdr*)((uint8_t*)buffer.data() + ehdr->e_phoff); + CHECK_VA2FO(phdr, "phdr is nullptr"); + + std::vector> segments; + for (Elf64_Half i = 0; i < ehdr->e_phnum; ++i) + { + if (phdr[i].p_type != PT_LOAD) + continue; + + segments.push_back({phdr[i].p_vaddr - phdr[i].p_offset, phdr[i].p_memsz}); + } + + return segments; +} diff --git a/projects/rocprofiler/plugin/att/disassembly.hpp b/projects/rocprofiler/plugin/att/disassembly.hpp index 98019c0420..64bec5147a 100644 --- a/projects/rocprofiler/plugin/att/disassembly.hpp +++ b/projects/rocprofiler/plugin/att/disassembly.hpp @@ -20,9 +20,9 @@ #pragma once +#include #include #include -#include #include #include @@ -48,7 +48,11 @@ struct SymbolInfo class DisassemblyInstance { public: - DisassemblyInstance(class code_object_decoder_t& decoder); + DisassemblyInstance( + const char* codeobj_data, + uint64_t codeobj_size, + std::optional input_isa + ); ~DisassemblyInstance(); uint64_t ReadInstruction(uint64_t faddr, uint64_t vaddr, const char* cpp_line); @@ -57,10 +61,11 @@ class DisassemblyInstance { static uint64_t memory_callback(uint64_t from, char* to, uint64_t size, void* user_data); static void inst_callback(const char* instruction, void* user_data); static amd_comgr_status_t symbol_callback(amd_comgr_symbol_t symbol, void* user_data); - static std::optional va2fo(void *mem, uint64_t va); - void* buffer; - int64_t size; + std::optional va2fo(uint64_t va); + std::vector> getSegments(); + + std::vector buffer; instruction_instance_t last_instruction; amd_comgr_disassembly_info_t info; amd_comgr_data_t data; diff --git a/projects/rocprofiler/plugin/att/segment.hpp b/projects/rocprofiler/plugin/att/segment.hpp new file mode 100644 index 0000000000..58da1a1093 --- /dev/null +++ b/projects/rocprofiler/plugin/att/segment.hpp @@ -0,0 +1,139 @@ +/* Copyright (c) 2023 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#pragma once +#include +#include +#include +#include +#include +#include + +template +class ordered_vector : public std::vector +{ + using Super = std::vector; +public: + void insert(const Type& elem) + { + size_t loc = lower_bound(elem.begin()); + if (this->size() && get(loc).begin() < elem.begin()) + loc ++; + this->Super::insert(this->begin()+loc, elem); + } + bool remove(const Type& elem) + { + if (!this->size()) return false; + size_t loc = lower_bound(elem.begin()); + if (get(loc) != elem) return false; + + this->Super::erase(this->begin()+loc); + return true; + } + bool remove(uint64_t elem_begin) + { + if (!this->size()) return false; + size_t loc = lower_bound(elem_begin); + if (get(loc).begin() != elem_begin) return false; + + this->Super::erase(this->begin()+loc); + return true; + } + size_t lower_bound(size_t addr) const + { + if (!this->size()) return 0; + return binary_search(addr, 0, this->size()-1); + } + + size_t binary_search(size_t addr, size_t s, size_t e) const + { + if (s >= e) + return s; + else if (s+1 == e) + return (get(e).begin() <= addr) ? e : s; + + size_t mid = (s+e)/2; + if (get(mid).begin() <= addr) + return binary_search(addr, mid, e); + else + return binary_search(addr, s, mid); + } + const Type& get(size_t i) const { return this->operator[](i); } +}; + +struct address_range_t +{ + uint64_t vbegin; + uint32_t size; + uint32_t id; + uint32_t offset; + + bool operator<(const address_range_t& other) const { return vbegin < other.vbegin; } + bool inrange(uint64_t addr) const { return addr >= vbegin && addr < vbegin+size; }; + uint64_t begin() const { return vbegin; } +}; + + +/** + * @brief Finds a candidate codeobj for the given vaddr +*/ +class CodeobjTableTranslator : protected ordered_vector +{ + using Super = ordered_vector; +public: + CodeobjTableTranslator() { reset(); } + + const address_range_t& find_codeobj_in_range(uint64_t addr) + { + if (cached_segment < size() && get(cached_segment).inrange(addr)) + return get(cached_segment); + + size_t lb = lower_bound(addr); + if (lb >= size() || !get(lb).inrange(addr)) + throw std::string("segment addr out of range"); + + cached_segment = lb; + return get(cached_segment); + } + + uint64_t find_codeobj_addr_in_range(uint64_t addr) { + return find_codeobj_in_range(addr).vbegin; + } + + const address_range_t& get(size_t index) const { return data()[index]; } + + void insert(const address_range_t& elem) { this->Super::insert(elem); } + void insert_list(std::vector arange) + { + for (auto& elem : arange) push_back(elem); + std::sort( + this->begin(), + this->end(), + [](const address_range_t& a, const address_range_t& b) { return a < b; } + ); + }; + + void reset() { cached_segment = ~0; } + void clear() { reset(); this->Super::clear(); } + bool remove(uint64_t addr) { reset(); return this->Super::remove(addr); } + +private: + size_t cached_segment = ~0; +}; diff --git a/projects/rocprofiler/plugin/att/service.py b/projects/rocprofiler/plugin/att/service.py index bafc19b73b..4bb61913ac 100644 --- a/projects/rocprofiler/plugin/att/service.py +++ b/projects/rocprofiler/plugin/att/service.py @@ -1,95 +1,138 @@ +#!/usr/bin/env python3 +import sys + +if sys.version_info[0] < 3: + raise Exception("Must be using Python 3") + import ctypes from ctypes import * +import os -pluginpath = '/home/giovanni/Desktop/rocprofiler/build/lib/rocprofiler/libatt_plugin.so' +HEADER_OFFSET = 62 +HEADER_MASK = 0x3 +ID_OFFSET = 30 +ID_MASK = (1<<32)-1 +OFFSET_MASK = (1<<30)-1 -attplugin = ctypes.CDLL(pluginpath) +pluginpath = '../../../lib/rocprofiler/libatt_plugin.so' +filedir = os.path.dirname(os.path.realpath(__file__)) +attplugin = CDLL(os.path.join(filedir, pluginpath)) -attplugin.createService.restype = ctypes.c_uint64 -attplugin.createService.argtypes = [ctypes.c_char_p, ctypes.c_uint64] -attplugin.deleteService.restype = ctypes.c_int -attplugin.deleteService.argtypes = [ctypes.c_uint64] -attplugin.getInstruction.restype = ctypes.c_char_p -attplugin.getInstruction.argtypes = [ctypes.c_uint64, ctypes.c_uint64] -attplugin.getCppref.restype = ctypes.c_char_p -attplugin.getCppref.argtypes = [ctypes.c_uint64, ctypes.c_uint64] -attplugin.getInstSize.restype = ctypes.c_size_t -attplugin.getInstSize.argtypes = [ctypes.c_uint64, ctypes.c_uint64] -attplugin.getSymbolName.restype = ctypes.c_char_p -attplugin.getSymbolName.argtypes = [ctypes.c_uint64] +attplugin.getSymbolName.restype = c_char_p +attplugin.getSymbolName.argtypes = [c_uint64] +class instruction_info_t(ctypes.Structure): + _fields_ = [('inst', c_char_p), + ('cpp', c_char_p), + ('size', c_size_t)] + +attplugin.getInstructionFromAddr.restype = instruction_info_t +attplugin.getInstructionFromAddr.argtypes = [c_uint64] + +attplugin.getInstructionFromID.restype = instruction_info_t +attplugin.getInstructionFromID.argtypes = [c_uint32, c_uint64] + +attplugin.addDecoder.restype = c_int +attplugin.addDecoder.argtypes = [c_char_p, c_uint32, c_uint64, c_uint64] + +attplugin.removeDecoder.restype = c_int +attplugin.removeDecoder.argtypes = [c_uint32, c_uint64] + +def IsRawPC(addr): + return addr >> HEADER_OFFSET == 0 + +def getID(addr): + return (addr >> ID_OFFSET) & ID_MASK + +def getOffset(addr): + return addr & OFFSET_MASK class CodeobjInstance: - def __init__(self, line, classification_func): + def __init__(self, line): tokens = line.split(' ') self.load_base = int(tokens[0], 16) - self.load_end = self.load_base + int(tokens[1], 16) - self.att_marker = int(tokens[2]) + self.memsize = int(tokens[1], 16) + self.att_id = int(tokens[2]) self.fpath = tokens[3] - self.handle = attplugin.createService(self.fpath.encode('utf-8'), self.load_base) - self.classifier = classification_func - - if self.handle == 0: + encoded = self.fpath.encode('utf-8') + self.error = attplugin.addDecoder(encoded, self.att_id, self.load_base, self.memsize) + if self.error != 0: print('Warning: Could not open', line) raise def release(self): - attplugin.deleteService(self.handle) - - def inrange(self, addr): - return addr >= self.load_base and addr < self.load_end+0x1000 - - def GetInstruction(self, addr): - inst = attplugin.getInstruction(self.handle, addr) - if inst is None: - return None - inst = inst.decode() - while len(inst) and (inst[0] == '\t' or inst[0] == ' '): - inst = inst[1:] - while len(inst) and (inst[-1] == '\t' or inst[-1] == ' '): - inst = inst[:-1] - cpp = attplugin.getCppref(self.handle, addr) - if cpp: - cpp = cpp.decode() - size = attplugin.getInstSize(self.handle, addr) - if size and inst: - return (self.classifier(inst.encode('utf-8'), len(inst)), inst, cpp, size) - return None + attplugin.removeDecoder(self.att_id, self.load_base) class CodeobjService: - def __init__(self, att_kernel_txt, cfunc) -> None: + def __init__(self, att_kernel_txt, cfunc): cfunc.restype = ctypes.c_int cfunc.argtypes = [ctypes.c_char_p, ctypes.c_size_t] + self.classifier = cfunc self.last_instance = None - self.services = [] + self.services = {} for line in att_kernel_txt: try: if 'memory://' == line[0:len('memory://')]: continue - self.services.append(CodeobjInstance(line, cfunc)) + service = CodeobjInstance(line) + self.services[service.att_id] = service except: pass + def ToRawPC(self, addr): + if IsRawPC(addr): + return addr + return self.services[getID(addr)].load_base + getOffset(addr) + def release(self): - for _, _, instance in self.services: + for _, instance in self.services.items(): instance.release() def GetInstruction(self, addr): - if self.last_instance and self.last_instance.inrange(addr): - return self.last_instance.GetInstruction(addr) + if addr >> HEADER_OFFSET != 0: + return self.GetInstructionFromID(getID(addr), getOffset(addr)) + else: + return self.GetInstructionFromAddr(addr) - for instance in self.services: - if instance.inrange(addr): - self.last_instance = instance - return instance.GetInstruction(addr) + def GetInstructionFromAddr(self, addr): + info_inst = attplugin.getInstructionFromAddr(addr) + if info_inst.size == 0 or info_inst.inst is None: + return None + inst = info_inst.inst.decode() + cpp = info_inst.cpp + if cpp: + cpp = cpp.decode() - return None + while len(inst) and (inst[0] == '\t' or inst[0] == ' '): + inst = inst[1:] + while len(inst) and (inst[-1] == '\t' or inst[-1] == ' '): + inst = inst[:-1] + + return (self.classifier(info_inst.inst, len(inst)), inst, cpp, info_inst.size) + + def GetInstructionFromID(self, id, offset): + info_inst = attplugin.getInstructionFromID(id, offset) + if info_inst.size == 0 or info_inst.inst is None: + return None + inst = info_inst.inst.decode() + cpp = info_inst.cpp + if cpp: + cpp = cpp.decode() + else: + cpp = '' + + while len(inst) and (inst[0] == '\t' or inst[0] == ' '): + inst = inst[1:] + while len(inst) and (inst[-1] == '\t' or inst[-1] == ' '): + inst = inst[:-1] + + return (self.classifier(info_inst.inst, len(inst)), inst, cpp, info_inst.size) def getSymbolName(self, addr): - name = attplugin.getSymbolName(addr) + name = attplugin.getSymbolName(self.ToRawPC(addr)) if name: return name.decode() - return "Addr #"+hex(addr) + return "Addr #"+hex(self.ToRawPC(addr)) diff --git a/projects/rocprofiler/plugin/att/stitch.py b/projects/rocprofiler/plugin/att/stitch.py index c70337cbb4..acaa1e096f 100644 --- a/projects/rocprofiler/plugin/att/stitch.py +++ b/projects/rocprofiler/plugin/att/stitch.py @@ -186,6 +186,18 @@ class PCTranslator: def jump(self, as_line): return self.jump_map[as_line[-3]] + def addsymbol(self, addr): + if addr in self.addrmap: + return + + symbol = self.codeservice.getSymbolName(addr) + if symbol is None: + symbol = "Unkown symbol at 0x" + hex(addr) + + last_line = self.raw_code[-1] + newline = ['; ' + symbol, 100, last_line[2], 0, last_line[4], last_line[5], -1, 0, 0] + self.raw_code.append(newline) + def getcode(self, addr): try: return self.addrmap[addr] @@ -330,6 +342,7 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto, codeservice): return None watchlist = PCTranslator(insts, code, raw_code, reverse_map, codeservice) + watchlist.addsymbol(firstinst.cycles) line = firstinst.cycles lineincrement = watchlist.getincrement(line) pcskip.append(0) @@ -517,8 +530,8 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto, codeservice): insts[i] = insts[i + 1] insts[i + 1] = temp next = line - elif "s_waitcnt " in as_line[0] or "_load_" in as_line[0]: - if skipped_immed > 0 and "s_waitcnt " in as_line[0]: + elif "s_waitcnt" in as_line[0] or "_load_" in as_line[0]: + if skipped_immed > 0 and "s_waitcnt" in as_line[0]: matched = True skipped_immed -= 1 elif 'scratch_' not in as_line[0]: @@ -530,11 +543,12 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto, codeservice): result.append(inst) i += 1 num_failed_stitches = 0 - elif not bGFX9 and inst.type == IMMED and line != next: + elif inst.type == IMMED and line != next and (not bGFX9 or 's_barrier' in as_line[0]): skipped_immed += 1 inst.asmline = reverse_map[line] result.append(inst) - next = line + if 's_barrier' in as_line[0]: + next = line + lineincrement i += 1 else: num_failed_stitches += 1 diff --git a/projects/rocprofiler/plugin/att/ui/index.html b/projects/rocprofiler/plugin/att/ui/index.html index 185b3b5bc3..9f78232427 100644 --- a/projects/rocprofiler/plugin/att/ui/index.html +++ b/projects/rocprofiler/plugin/att/ui/index.html @@ -682,13 +682,7 @@ const token_id = "token" + scroll_to const token = d3.select('#'+token_id).node() // scrolling not consistent without the timer - setTimeout(() => { - token.scrollIntoView({behavior: "smooth", inline: "start"}) - }, 800) - const inView = isInViewport(src_line) - if (!inView) { - src_line.scrollIntoView({behavior: "smooth", block: "start"}) - } + src_line.scrollIntoView({behavior: "smooth", block: "start"}) } }) .text((d,i) => { @@ -952,28 +946,27 @@ .attr("fill", (d) => { return INST_TYPE[d[1]][1] }) .style("cursor", "pointer") .datum((d) => { return {data:d} }) - .on('mouseenter', (d) => { - if (d) { - try { - const token = d3.select(d.toElement) - last_color = token.attr('fill') - token.attr('fill', '#e0115f') - let token_data = token.datum().data - d3.select(".highlight") .classed("highlight", false) - d3.select('.line_' + token_data[4]) .classed("highlight", true) - src_line = d3.select('.line_' + token_data[4]).node() - const inView = isInViewport(src_line) - if (!inView) { - src_line.scrollIntoView({behavior: "smooth", block: "start"}) - } - } catch {} - } + .on('mouseenter', (e, d) => { + try { + d3.select('.line_' + d.data[4]).classed("highlight", true) + d3.select(".highlight") .classed("highlight", true) + } catch {} }) - .on("mouseleave", (d) => { - if (d) { - d3.select(d.fromElement) .attr("fill", last_color) + .on("mouseleave", (e, d) => { + try { + d3.select('.line_' + d.data[4]).classed("highlight", false) d3.select(".highlight") .classed("highlight", false) - } + } catch {} + }) + .on('click', (e, d) => { + try { + d3.select('.line_' + d.data[4]).classed("highlight", true) + src_line = d3.select('.line_' + d.data[4]).node() + const inView = isInViewport(src_line) + if (!inView) { + src_line.scrollIntoView({behavior: "smooth", block: "start"}) + } + } catch {} }) .append("svg:title") .text((d) => { diff --git a/projects/rocprofiler/plugin/exportmap b/projects/rocprofiler/plugin/exportmap index 1a8c2d0863..46d1942121 100644 --- a/projects/rocprofiler/plugin/exportmap +++ b/projects/rocprofiler/plugin/exportmap @@ -3,11 +3,10 @@ global: rocprofiler_plugin_initialize; rocprofiler_plugin_finalize; rocprofiler_plugin_write_buffer_records; rocprofiler_plugin_write_record; - createService; - deleteService; - getInstruction; - getCppref; - getInstSize; + addDecoder; + removeDecoder; + getInstructionFromAddr; + getInstructionFromID; getSymbolName; local: *; }; \ No newline at end of file diff --git a/projects/rocprofiler/src/core/hsa/hsa_support.cpp b/projects/rocprofiler/src/core/hsa/hsa_support.cpp index fd0a828bf6..ff8324d4d0 100644 --- a/projects/rocprofiler/src/core/hsa/hsa_support.cpp +++ b/projects/rocprofiler/src/core/hsa/hsa_support.cpp @@ -503,7 +503,7 @@ hsa_status_t CodeObjectCallback(hsa_executable_t executable, if (data.codeobj.unload) codeobj_capture_instance::Unload(data.codeobj.load_base); else - codeobj_capture_instance::Load(data.codeobj.load_base, + codeobj_capture_instance::Load(data.codeobj.load_base, data.codeobj.load_size, uri_str, data.codeobj.memory_base, data.codeobj.memory_size); hsa_executable_iterate_agent_symbols(executable, data.codeobj.agent, diff --git a/projects/rocprofiler/src/core/hsa/packets/packets_generator.cpp b/projects/rocprofiler/src/core/hsa/packets/packets_generator.cpp index c724eb3b79..0a2447bbb6 100644 --- a/projects/rocprofiler/src/core/hsa/packets/packets_generator.cpp +++ b/projects/rocprofiler/src/core/hsa/packets/packets_generator.cpp @@ -18,8 +18,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -//#define HSA_ATT_MARKER_ENABLE - #include "packets_generator.h" #include "src/api/rocprofiler_singleton.h" @@ -618,10 +616,13 @@ hsa_ven_amd_aqlprofile_profile_t* GenerateATTPackets( // Generate ATT tracer marker packets. Also generate and return // the descriptor object which has the PM4 buffer for inserting data -hsa_ven_amd_aqlprofile_descriptor_t -GenerateATTMarkerPackets(hsa_agent_t gpu_agent, packet_t& marker_packet, uint32_t data) +hsa_ven_amd_aqlprofile_descriptor_t GenerateATTMarkerPackets( + hsa_agent_t gpu_agent, + packet_t& marker_packet, + uint32_t data, + hsa_ven_amd_aqlprofile_att_marker_channel_t channel +) { -#ifdef HSA_ATT_MARKER_ENABLE // Preparing the profile structure to get the packets auto pool = rocprofiler::HSASupport_Singleton::GetInstance() .GetHSAAgentInfo(gpu_agent.handle) @@ -635,7 +636,6 @@ GenerateATTMarkerPackets(hsa_agent_t gpu_agent, packet_t& marker_packet, uint32_ {}, desc }; - hsa_ven_amd_aqlprofile_att_marker_channel_t channel = HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_2; hsa_status_t status = hsa_ven_amd_aqlprofile_att_marker(&profile, &marker_packet, data, channel); if (status != HSA_STATUS_SUCCESS) { @@ -645,9 +645,6 @@ GenerateATTMarkerPackets(hsa_agent_t gpu_agent, packet_t& marker_packet, uint32_ } return desc; -#else - return {nullptr,0}; -#endif } void AddVendorSpecificPacket(const packet_t* packet, diff --git a/projects/rocprofiler/src/core/hsa/packets/packets_generator.h b/projects/rocprofiler/src/core/hsa/packets/packets_generator.h index 205e1c525a..42914bdfd6 100644 --- a/projects/rocprofiler/src/core/hsa/packets/packets_generator.h +++ b/projects/rocprofiler/src/core/hsa/packets/packets_generator.h @@ -61,7 +61,12 @@ hsa_ven_amd_aqlprofile_profile_t* GenerateATTPackets( packet_t* stop_packet, size_t att_buffer_size); hsa_ven_amd_aqlprofile_descriptor_t -GenerateATTMarkerPackets(hsa_agent_t gpu_agent, packet_t& marker_packet, uint32_t data); +GenerateATTMarkerPackets( + hsa_agent_t gpu_agent, + packet_t& marker_packet, + uint32_t data, + hsa_ven_amd_aqlprofile_att_marker_channel_t channel +); uint8_t* AllocateSysMemory(hsa_agent_t gpu_agent, size_t size, hsa_amd_memory_pool_t* cpu_pool); diff --git a/projects/rocprofiler/src/core/isa_capture/code_object_track.cpp b/projects/rocprofiler/src/core/isa_capture/code_object_track.cpp index 1fa1e57b57..cf3b6c5d2e 100644 --- a/projects/rocprofiler/src/core/isa_capture/code_object_track.cpp +++ b/projects/rocprofiler/src/core/isa_capture/code_object_track.cpp @@ -48,7 +48,7 @@ std::mutex codeobj_record::mutex; std::unordered_map codeobj_record::codeobjs{}; std::unordered_map codeobj_record::record_id_map{}; std::unordered_set codeobj_record::listeners; -std::atomic codeobj_capture_instance::loadcount{0}; +std::atomic codeobj_capture_instance::eventcount{0}; // Codeobj Record codeobj_record::codeobj_record(rocprofiler_codeobj_capture_mode_t mode) : capture_mode(mode){}; @@ -71,16 +71,17 @@ void codeobj_record::stop_capture() { // Codeobj Capture void codeobj_capture_instance::Load( uint64_t addr, + uint64_t load_size, const std::string& URI, uint64_t mem_addr, - uint64_t size + uint64_t mem_size ) { - uint32_t id = loadcount.fetch_add(1, std::memory_order_relaxed)+1; + uint32_t id = eventcount.fetch_add(1, std::memory_order_relaxed)+1; auto time = rocprofiler::ROCProfiler_Singleton::GetInstance().timestamp_ns().value; std::lock_guard lock(codeobj_record::mutex); - auto inst = std::make_shared(addr, URI, mem_addr, size, time, id); + auto inst = std::make_shared(addr, load_size, URI, mem_addr, mem_size, time, id); codeobj_record::codeobjs[addr] = inst; for (auto* listen : codeobj_record::listeners) listen->addcapture(inst); } @@ -90,6 +91,7 @@ void codeobj_capture_instance::Unload(uint64_t addr) { if (codeobj_record::codeobjs.find(addr) == codeobj_record::codeobjs.end()) return; + eventcount.fetch_add(1, std::memory_order_relaxed)+1; auto time = rocprofiler::ROCProfiler_Singleton::GetInstance().timestamp_ns().value; codeobj_record::codeobjs.at(addr)->end_time = time; codeobj_record::codeobjs.erase(addr); diff --git a/projects/rocprofiler/src/core/isa_capture/code_object_track.hpp b/projects/rocprofiler/src/core/isa_capture/code_object_track.hpp index 6e5de1ed3c..66b1ef3d16 100644 --- a/projects/rocprofiler/src/core/isa_capture/code_object_track.hpp +++ b/projects/rocprofiler/src/core/isa_capture/code_object_track.hpp @@ -34,30 +34,44 @@ */ class codeobj_capture_instance { public: - codeobj_capture_instance(uint64_t _addr, const std::string& _uri, uint64_t mem_addr, - uint64_t mem_size, uint64_t start_time, uint32_t id) - : addr(_addr), start_time(start_time), URI(_uri), + codeobj_capture_instance( + uint64_t _addr, + uint64_t _load_size, + const std::string& _uri, + uint64_t mem_addr, + uint64_t mem_size, + uint64_t start_time, + uint32_t id + ) + : addr(_addr), load_size(_load_size), start_time(start_time), URI(_uri), mem_addr(mem_addr), mem_size(mem_size), load_id(id) {}; void setmode(rocprofiler_codeobj_capture_mode_t mode); rocprofiler_intercepted_codeobj_t get() const { const char* buf_ptr = buffer.size() ? buffer.data() : nullptr; - return {URI.c_str(), addr, mem_size, buf_ptr, buffer.size(), start_time, end_time, load_id}; + return {URI.c_str(), addr, load_size, buf_ptr, buffer.size(), start_time, end_time, load_id}; }; const uint64_t addr; + const uint64_t load_size; const uint64_t start_time; const uint32_t load_id; - static void Load(uint64_t addr, const std::string& URI, uint64_t mem_addr, uint64_t mem_size); + static void Load( + uint64_t addr, + uint64_t load_size, + const std::string& URI, + uint64_t mem_addr, + uint64_t mem_size + ); static void Unload(uint64_t addr); - static uint32_t GetLoadCount() { return loadcount.load(std::memory_order_relaxed); } + static uint32_t GetEventCount() { return eventcount.load(std::memory_order_relaxed); } private: //! 32 bits ID because this is the natural channel width for ATT Markers. //! There is no world in which 4 billions markers can be sent anyway. - static std::atomic loadcount; + static std::atomic eventcount; void reset(rocprofiler_codeobj_capture_mode_t mode); std::pair parse_uri(); diff --git a/projects/rocprofiler/src/core/session/att/att.h b/projects/rocprofiler/src/core/session/att/att.h index 0a2397beba..05edd98051 100644 --- a/projects/rocprofiler/src/core/session/att/att.h +++ b/projects/rocprofiler/src/core/session/att/att.h @@ -91,10 +91,21 @@ public: ); void InsertMarker( + std::vector& transformed_packets, + hsa_agent_t agent, + uint32_t data, + hsa_ven_amd_aqlprofile_att_marker_channel_t channel + ); + void InsertUnloadMarker( std::vector& transformed_packets, hsa_agent_t agent, uint32_t data ); + void InsertLoadMarker( + std::vector& transformed_packets, + hsa_agent_t agent, + rocprofiler_intercepted_codeobj_t codeobj + ); void SetParameters(const std::vector& params) { att_parameters_data = params; @@ -125,7 +136,7 @@ protected: static std::mutex att_enable_disable_mutex; private: - uint32_t codeobj_load_cnt = 0; + uint32_t codeobj_event_cnt = 0; static void AddAttRecord( rocprofiler_record_att_tracer_t* record, @@ -176,6 +187,9 @@ private: std::mutex sessions_pending_signals_lock_; std::map> sessions_pending_signals_; + + rocprofiler_record_id_t capture_id; + std::unordered_set active_capture_event_ids; }; } // namespace att diff --git a/projects/rocprofiler/src/core/session/att/continuous.cpp b/projects/rocprofiler/src/core/session/att/continuous.cpp index 8a4c3dac29..f99e3890c8 100644 --- a/projects/rocprofiler/src/core/session/att/continuous.cpp +++ b/projects/rocprofiler/src/core/session/att/continuous.cpp @@ -27,19 +27,71 @@ #define __NR_gettid 186 +#define ATT_MARKER_HEADER_CHANNEL HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_0 +#define ATT_MARKER_SIZE_CHANNEL HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_1 +#define ATT_MARKER_LO_CHANNEL HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_2 +#define ATT_MARKER_HI_CHANNEL HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_3 + +enum rocprofiler_att_marker_type_t { + ROCPROFILER_ATT_MARKER_LOAD = 0, + ROCPROFILER_ATT_MARKER_UNLOAD = 1 +}; + +union att_header_marker_t +{ + uint32_t raw; + struct { + uint32_t type : 2; + uint32_t id : 30; + }; +}; + namespace rocprofiler { namespace att { -void AttTracer::InsertMarker( +void AttTracer::InsertUnloadMarker( std::vector& transformed_packets, hsa_agent_t agent, uint32_t data +) { + att_header_marker_t header{.raw = 0}; + header.type = ROCPROFILER_ATT_MARKER_UNLOAD; + header.id = data; + hsa_ven_amd_aqlprofile_att_marker_channel_t channel = ATT_MARKER_HEADER_CHANNEL; + + this->InsertMarker(transformed_packets, agent, header.raw, channel); +} + +void AttTracer::InsertLoadMarker( + std::vector& transformed_packets, + hsa_agent_t agent, + rocprofiler_intercepted_codeobj_t codeobj +) { + this->InsertMarker(transformed_packets, agent, codeobj.mem_size, ATT_MARKER_SIZE_CHANNEL); + + uint64_t addr = codeobj.base_address; + this->InsertMarker(transformed_packets, agent, addr & ((1ul << 32)-1), ATT_MARKER_LO_CHANNEL); + this->InsertMarker(transformed_packets, agent, addr >> 32, ATT_MARKER_HI_CHANNEL); + + att_header_marker_t header{.raw = 0}; + header.type = ROCPROFILER_ATT_MARKER_LOAD; + header.id = codeobj.att_marker_id; + this->InsertMarker(transformed_packets, agent, header.raw, ATT_MARKER_HEADER_CHANNEL); +} + +void AttTracer::InsertMarker( + std::vector& transformed_packets, + hsa_agent_t agent, + uint32_t data, + hsa_ven_amd_aqlprofile_att_marker_channel_t channel ) { packet_t marker_packet{}; - auto desc = Packet::GenerateATTMarkerPackets(agent, marker_packet, data); + auto desc = Packet::GenerateATTMarkerPackets(agent, marker_packet, data, channel); if (desc.ptr && desc.size) Packet::AddVendorSpecificPacket(&marker_packet, &transformed_packets, hsa_signal_t{.handle = 0}); + else + rocprofiler::warning("Could not add ATT Marker"); } @@ -63,25 +115,26 @@ bool AttTracer::ATTContiguousWriteInterceptor( // att start // Getting Queue Data and Information + auto agent_handle = queue_info.GetGPUAgent().handle; rocprofiler::HSAAgentInfo& agentInfo = rocprofiler::HSASupport_Singleton::GetInstance() - .GetHSAAgentInfo(queue_info.GetGPUAgent().handle); + .GetHSAAgentInfo(agent_handle); auto dispatchPackets = Packet::ExtractDispatchPackets(packets, pkt_count); if (dispatchPackets.size() == 0) return false; size_t writer_id = WRITER_ID.fetch_add(dispatchPackets.size(), std::memory_order_relaxed); - uint32_t new_load_cnt = codeobj_capture_instance::GetLoadCount(); + uint32_t new_load_cnt = codeobj_capture_instance::GetEventCount(); auto bInsertStart = RequiresStartPacket(writer_id, dispatchPackets.size()); { std::lock_guard lk(att_enable_disable_mutex); // If att_start already exists, don't start again - auto agent_pending_packets = pending_stop_packets.find(queue_info.GetGPUAgent().handle); + auto agent_pending_packets = pending_stop_packets.find(agent_handle); if (agent_pending_packets != pending_stop_packets.end()) bInsertStart = {}; // If nothing will be added or removed, return - if (!bInsertStart && codeobj_load_cnt == new_load_cnt) + if (!bInsertStart && codeobj_event_cnt == new_load_cnt) { if ( agent_pending_packets == pending_stop_packets.end() || @@ -107,7 +160,7 @@ bool AttTracer::ATTContiguousWriteInterceptor( } uint64_t IsGFX9 = HSASupport_Singleton::GetInstance() - .GetHSAAgentInfo(queue_info.GetGPUAgent().handle) + .GetHSAAgentInfo(agent_handle) .GetDeviceInfo() .getName() .find("gfx9") != std::string::npos; @@ -134,33 +187,58 @@ bool AttTracer::ATTContiguousWriteInterceptor( 0 ); - codeobj_record::make_capture(rocprofiler_record_id_t{record_id}, capturem, IsGFX9); - codeobj_record::start_capture(rocprofiler_record_id_t{record_id}); + this->capture_id = rocprofiler_record_id_t{record_id}; + codeobj_record::make_capture(this->capture_id, capturem, IsGFX9); + codeobj_record::start_capture(this->capture_id); stop_packet.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; std::lock_guard lk(att_enable_disable_mutex); - pending_stop_packets[queue_info.GetGPUAgent().handle] + pending_stop_packets[agent_handle] = {record_id, writer_id, bInsertStart->second, session_id_, stop_packet}; } - if (codeobj_load_cnt != new_load_cnt) + bool bHasPending = false; { - codeobj_load_cnt = new_load_cnt; - InsertMarker(transformed_packets, queue_info.GetGPUAgent(), new_load_cnt); + std::lock_guard lk(att_enable_disable_mutex); + bHasPending = pending_stop_packets.find(agent_handle) != pending_stop_packets.end(); + } + + if (bHasPending && (bInsertStart || codeobj_event_cnt != new_load_cnt)) + { + codeobj_event_cnt = new_load_cnt; + + auto symbols = codeobj_record::get_capture(this->capture_id); + std::unordered_set current_ids; + + for (size_t s=0; s lk(att_enable_disable_mutex); - auto agent_pending_packets = pending_stop_packets.find(queue_info.GetGPUAgent().handle); + auto agent_pending_packets = pending_stop_packets.at(agent_handle); - if (agent_pending_packets != pending_stop_packets.end() && - agent_pending_packets->second.last_kernel_exec <= writer_id + dispatchPackets.size() - ) { - const ATTRecordSignal& rsignal = agent_pending_packets->second; + if (agent_pending_packets.last_kernel_exec <= writer_id + dispatchPackets.size()) + { + const ATTRecordSignal& rsignal = agent_pending_packets; // Adding a barrier packet with the original packet's completion signal. hsa_signal_t interrupt_signal; CreateSignal(0, &interrupt_signal); @@ -180,8 +258,10 @@ bool AttTracer::ATTContiguousWriteInterceptor( interrupt_signal }); - codeobj_record::stop_capture(rocprofiler_record_id_t{rsignal.record_id}); - pending_stop_packets.erase(queue_info.GetGPUAgent().handle); + //codeobj_record::stop_capture(rocprofiler_record_id_t{rsignal.record_id}); + codeobj_record::stop_capture(this->capture_id); + active_capture_event_ids.clear(); + pending_stop_packets.erase(agent_handle); } } diff --git a/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp b/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp index 4fba4bb439..a419f49a47 100644 --- a/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp +++ b/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp @@ -316,10 +316,8 @@ hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { api->hsa_ven_amd_aqlprofile_iterate_data = (decltype(::hsa_ven_amd_aqlprofile_iterate_data)*)dlsym( handle, "hsa_ven_amd_aqlprofile_iterate_data"); -#ifdef HSA_ATT_MARKER_ENABLE api->hsa_ven_amd_aqlprofile_att_marker = (decltype(::hsa_ven_amd_aqlprofile_att_marker)*) dlsym(handle, "hsa_ven_amd_aqlprofile_att_marker"); -#endif return HSA_STATUS_SUCCESS; }