920b33c0b9
- Temporarily, masking out the trap handler latency, by detecting untagged error samples. - Disabling checks for the number of invalid samples.
207 строки
7.1 KiB
C++
207 строки
7.1 KiB
C++
// MIT License
|
|
//
|
|
// Copyright (c) 2024-2025 ROCm Developer Tools
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in all
|
|
// copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
// SOFTWARE.
|
|
|
|
// undefine NDEBUG so asserts are implemented
|
|
#ifdef NDEBUG
|
|
# undef NDEBUG
|
|
#endif
|
|
|
|
#include "address_translation.hpp"
|
|
#include "pcs.hpp"
|
|
#include "utils.hpp"
|
|
|
|
#include <cassert>
|
|
#include <cstdio>
|
|
#include <iostream>
|
|
#include <memory>
|
|
#include <sstream>
|
|
#include <unordered_set>
|
|
|
|
namespace client
|
|
{
|
|
namespace address_translation
|
|
{
|
|
namespace
|
|
{
|
|
struct FlatProfiler
|
|
{
|
|
FlatProfiler() = default;
|
|
~FlatProfiler() = default;
|
|
|
|
CodeobjAddressTranslate translator = {};
|
|
KernelObjectMap kernel_object_map = {};
|
|
FlatProfile flat_profile = {};
|
|
std::mutex global_mut = {};
|
|
};
|
|
} // namespace
|
|
|
|
// Raw pointer to prevent early destruction of static objects
|
|
FlatProfiler* flat_profiler = nullptr;
|
|
|
|
void
|
|
init()
|
|
{
|
|
flat_profiler = new FlatProfiler();
|
|
}
|
|
|
|
void
|
|
fini()
|
|
{
|
|
delete flat_profiler;
|
|
flat_profiler = nullptr;
|
|
}
|
|
|
|
CodeobjAddressTranslate&
|
|
get_address_translator()
|
|
{
|
|
return flat_profiler->translator;
|
|
}
|
|
|
|
KernelObjectMap&
|
|
get_kernel_object_map()
|
|
{
|
|
return flat_profiler->kernel_object_map;
|
|
}
|
|
|
|
FlatProfile&
|
|
get_flat_profile()
|
|
{
|
|
return flat_profiler->flat_profile;
|
|
}
|
|
|
|
std::mutex&
|
|
get_global_mutex()
|
|
{
|
|
return flat_profiler->global_mut;
|
|
}
|
|
|
|
KernelObject::KernelObject(uint64_t code_object_id,
|
|
std::string kernel_name,
|
|
uint64_t begin_address,
|
|
uint64_t end_address)
|
|
: code_object_id_(code_object_id)
|
|
, kernel_name_(kernel_name)
|
|
, begin_address_(begin_address)
|
|
, end_address_(end_address)
|
|
{
|
|
auto& translator = get_address_translator();
|
|
uint64_t vaddr = begin_address;
|
|
while(vaddr < end_address)
|
|
{
|
|
auto inst = translator.get(code_object_id, vaddr);
|
|
vaddr += inst->size;
|
|
this->add_instruction(std::move(inst));
|
|
}
|
|
}
|
|
|
|
void
|
|
dump_flat_profile()
|
|
{
|
|
// It seems that an instruction can be part of multiple
|
|
// instances of the same kernel loaded on two different devices.
|
|
// We need to prevent counting the same instruction multiple times.
|
|
std::unordered_set<Instruction*> visited_instructions;
|
|
|
|
const auto& kernel_object_map = get_kernel_object_map();
|
|
const auto& flat_profile = get_flat_profile();
|
|
|
|
std::stringstream ss;
|
|
uint64_t samples_num = 0;
|
|
kernel_object_map.iterate_kernel_objects([&](const KernelObject* kernel_obj) {
|
|
ss << "\n====================================";
|
|
ss << "The kernel: " << kernel_obj->kernel_name()
|
|
<< " with the begin address: " << kernel_obj->begin_address()
|
|
<< " from code object with id: " << kernel_obj->code_object_id() << std::endl;
|
|
kernel_obj->iterate_instrunctions([&](const Instruction& inst) {
|
|
ss << "\t";
|
|
ss << inst.inst << "\t";
|
|
ss << inst.comment << "\t";
|
|
ss << "samples: ";
|
|
const auto* _sample_instruction = flat_profile.get_sample_instruction(inst);
|
|
if(_sample_instruction == nullptr)
|
|
ss << "0";
|
|
else
|
|
{
|
|
_sample_instruction->process([&](const SampleInstruction& sample_instruction) {
|
|
ss << sample_instruction.sample_count();
|
|
// Each instruction should be visited exactly once.
|
|
// Otherwise, code object loading/unloading and relocations
|
|
// are not handled properly.
|
|
assert(visited_instructions.count(sample_instruction.inst()) == 0);
|
|
// Assure that each instruction is counted once.
|
|
if(visited_instructions.count(sample_instruction.inst()) == 0)
|
|
{
|
|
samples_num += sample_instruction.sample_count();
|
|
visited_instructions.insert(sample_instruction.inst());
|
|
}
|
|
|
|
if(sample_instruction.exec_mask_counts().size() <= 1)
|
|
{
|
|
ss << ", exec_mask: " << std::hex;
|
|
ss << sample_instruction.exec_mask_counts().begin()->first;
|
|
ss << std::dec;
|
|
assert(sample_instruction.sample_count() ==
|
|
sample_instruction.exec_mask_counts().begin()->second);
|
|
}
|
|
else
|
|
{
|
|
uint64_t num_samples_sum = 0;
|
|
// More than one exec_mask
|
|
for(auto& [exec_mask, samples_per_exec] :
|
|
sample_instruction.exec_mask_counts())
|
|
{
|
|
ss << std::endl;
|
|
ss << "\t\t"
|
|
<< "exec_mask: " << std::hex << exec_mask;
|
|
ss << "\t"
|
|
<< "samples: " << std::dec << samples_per_exec;
|
|
num_samples_sum += samples_per_exec;
|
|
ss << std::endl;
|
|
}
|
|
assert(sample_instruction.sample_count() == num_samples_sum);
|
|
}
|
|
});
|
|
}
|
|
ss << std::endl;
|
|
});
|
|
ss << "====================================\n" << std::endl;
|
|
});
|
|
|
|
ss << "The total number of valid decoded samples: "
|
|
<< flat_profile.get_valid_decoded_samples_num() << std::endl;
|
|
ss << "The total number of invalid samples : " << flat_profile.get_invalid_samples_num()
|
|
<< std::endl;
|
|
|
|
*utils::get_output_stream() << ss.str() << std::endl;
|
|
|
|
utils::pcs_assert(
|
|
samples_num == flat_profile.get_valid_decoded_samples_num(),
|
|
"Number of collected valid samples different than the number of decoded samples.");
|
|
utils::pcs_assert(samples_num > 0, "No valid samples collected/decoded.");
|
|
// Temporarily disabling the check, until the trap handler latency is fixed.
|
|
// utils::pcs_assert(flat_profile.more_valid_decoded_samples_expected(),
|
|
// "More invalid samples observed.");
|
|
}
|
|
|
|
} // namespace address_translation
|
|
} // namespace client
|