2
0
Ficheiros
Indic, Vladimir 0ca07105a3 [SDK][rocprofv3] MI300 Stochastic PC sampling (#92)
* MI300 Stochastic PC sampling SDK API implementation

* ROCProfV3: Stochastic PC sampling Support (#94)

* ROCProfV3: MI300 Stochastic PC sampling initial draft

* ROCProfV3: Initial Stochastic PC sampling Tests (#95)

ROCProfV3: Initial Stochastic PC sampling tests

* Update rocprofiler_pc_sampling_record_stochastic_v0_t

- update doxygen docs for members
- replace rocprofiler_correlation_id_t with rocprofiler_async_correlation_id_t

* Relax the check in JSON tests

* drain PC sampling buffer during finalize_rocprofv3

* Increase timeout for "Test Install Build" step

- 10 minutes -> 20 minutes
- "Test Installed Packages" has 20 minutes so "Test Install Build" should also

---------

Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>

[ROCm/rocprofiler-sdk commit: 49ce79a5b5]
2025-03-21 14:40:45 -05:00

310 linhas
9.4 KiB
C++

// MIT License
//
// Copyright (c) 2024-2025 ROCm Developer Tools
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <rocprofiler-sdk/cxx/codeobj/code_printing.hpp>

#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
namespace client
{
namespace address_translation
{
// Short local aliases for types from the SDK's code-object disassembly namespace.
// Decoded instruction type; this header reads its `codeobj_id` and `ld_addr` members.
using Instruction = rocprofiler::sdk::codeobj::disassembly::Instruction;
// Translator type returned by `get_address_translator()`.
using CodeobjAddressTranslate = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate;
// Identifier type used for the code-object half of `inst_id_t`.
using marker_id_t = rocprofiler::sdk::codeobj::disassembly::marker_id_t;
/**
* @brief Pair (code_object_id, pc_addr) uniquely identifies an instruction.
*/
struct inst_id_t
{
marker_id_t code_object_id = 0;
uint64_t pc_addr = 0;
bool operator==(const inst_id_t& b) const
{
return this->pc_addr == b.pc_addr && this->code_object_id == b.code_object_id;
};
bool operator<(const inst_id_t& b) const
{
if(this->code_object_id == b.code_object_id) return this->pc_addr < b.pc_addr;
return this->code_object_id < b.code_object_id;
};
};
class KernelObject
{
private:
using process_inst_fn = std::function<void(const Instruction&)>;
public:
KernelObject() = default;
KernelObject(uint64_t code_object_id,
std::string kernel_name,
uint64_t begin_address,
uint64_t end_address);
// write lock required
void add_instruction(std::unique_ptr<Instruction> instruction)
{
auto lock = std::unique_lock{mut};
instructions_.push_back(std::move(instruction));
}
// read lock required
void iterate_instrunctions(process_inst_fn fn) const
{
auto lock = std::shared_lock{mut};
for(const auto& inst : this->instructions_)
fn(*inst);
}
uint64_t code_object_id() const { return code_object_id_; };
std::string kernel_name() const { return kernel_name_; };
uint64_t begin_address() const { return begin_address_; };
uint64_t end_address() const { return end_address_; };
private:
mutable std::shared_mutex mut = {};
uint64_t code_object_id_ = 0;
std::string kernel_name_ = {};
uint64_t begin_address_ = 0;
uint64_t end_address_ = 0;
std::vector<std::unique_ptr<Instruction>> instructions_ = {};
};
class KernelObjectMap
{
private:
using process_kernel_fn = std::function<void(const KernelObject*)>;
public:
KernelObjectMap() = default;
// write lock required
void add_kernel(uint64_t code_object_id,
std::string name,
uint64_t begin_address,
uint64_t end_address)
{
auto lock = std::unique_lock{mut};
auto key = form_key(code_object_id, name, begin_address);
auto it = kernel_object_map.find(key);
assert(it == kernel_object_map.end());
kernel_object_map.insert(
{key,
std::make_unique<KernelObject>(code_object_id, name, begin_address, end_address)});
}
#if 0
// read lock required
KernelObject* get_kernel(uint64_t code_object_id, std::string name)
{
auto lock = std::shared_lock{mut};
auto key = form_key(code_object_id, name);
auto it = kernel_object_map.find(key);
if(it == kernel_object_map.end())
{
return nullptr;
}
return it->second.get();
}
#endif
// read lock required
void iterate_kernel_objects(process_kernel_fn fn) const
{
auto lock = std::shared_lock{mut};
for(auto& [_, kernel_obj] : kernel_object_map)
fn(kernel_obj.get());
}
private:
std::unordered_map<std::string, std::unique_ptr<KernelObject>> kernel_object_map = {};
mutable std::shared_mutex mut = {};
std::string form_key(uint64_t code_object_id, std::string kernel_name, uint64_t begin_address)
{
return std::to_string(code_object_id) + "_" + kernel_name + "_" +
std::to_string(begin_address);
}
};
/**
 * @brief A decoded instruction together with the statistics of the PC samples
 * that hit it: the total sample count and a per-exec-mask histogram.
 *
 * add_sample() and process() acquire the internal shared mutex themselves.
 * The plain accessors do NOT lock; per the FIXME below they are meant to be
 * used from within the callback passed to process().
 */
class SampleInstruction
{
private:
    using proces_sample_inst_fn = std::function<void(const SampleInstruction&)>;

public:
    SampleInstruction() = default;

    /// Takes ownership of the decoded instruction; counters start at zero.
    SampleInstruction(std::unique_ptr<Instruction> inst)
    : inst_(std::move(inst))
    {}

    /// Records one sample observed with @p exec_mask. Holds the exclusive (write) lock.
    void add_sample(uint64_t exec_mask)
    {
        auto lock = std::unique_lock{mut};
        // operator[] value-initializes a missing entry to 0, so a single lookup suffices.
        ++exec_mask_counts_[exec_mask];
        ++sample_count_;
    }

    /// Invokes @p fn on this object while holding the shared (read) lock.
    void process(proces_sample_inst_fn fn) const
    {
        auto lock = std::shared_lock{mut};
        fn(*this);
    }

    /// The decoded instruction (may be null for a default-constructed object).
    Instruction* inst() const { return inst_.get(); }

    // In case an instruction is sampled with different exec masks,
    // keep track of how many times each exec_mask was observed.
    const std::map<uint64_t, uint64_t>& exec_mask_counts() const { return exec_mask_counts_; }

    // How many times this instruction was sampled.
    uint64_t sample_count() const { return sample_count_; }

private:
    mutable std::shared_mutex mut = {};
    // FIXME: prevent direct access of the following fields.
    // The following fields should be accessible only from within `process` function.
    std::unique_ptr<Instruction> inst_ = {};
    // In case an instruction is sampled with different exec masks,
    // keep track of how many times each exec_mask was observed.
    std::map<uint64_t, uint64_t> exec_mask_counts_ = {};
    // How many times this instruction was sampled.
    uint64_t sample_count_ = 0;
};
class FlatProfile
{
public:
FlatProfile() = default;
// write lock required
void add_sample(std::unique_ptr<Instruction> instruction, uint64_t exec_mask)
{
// counting valid decoded samples
valid_decoded_samples_num++;
auto lock = std::unique_lock{mut};
inst_id_t inst_id = {.code_object_id = instruction->codeobj_id,
.pc_addr = instruction->ld_addr};
auto itr = samples.find(inst_id);
if(itr == samples.end())
{
// Add new instruction
samples.insert({inst_id, std::make_unique<SampleInstruction>(std::move(instruction))});
itr = samples.find(inst_id);
}
auto* sample_instruction = itr->second.get();
sample_instruction->add_sample(exec_mask);
}
// read lock required
const SampleInstruction* get_sample_instruction(const Instruction& inst) const
{
auto lock = std::shared_lock{mut};
// TODO: Avoid creating a new instance of `inst_id_t` whenever querying
// sampled instructions.
inst_id_t inst_id = {.code_object_id = inst.codeobj_id, .pc_addr = inst.ld_addr};
auto itr = samples.find(inst_id);
if(itr == samples.end()) return nullptr;
return itr->second.get();
return nullptr;
}
void add_invalid_sample()
{
// counting invalid samples
invalid_decoded_samples_num++;
}
/**
* @brief Verify that more valid decoded samples is generated.
*/
bool more_valid_decoded_samples_expected() const
{
return valid_decoded_samples_num > invalid_decoded_samples_num;
}
uint64_t get_valid_decoded_samples_num() const { return valid_decoded_samples_num; }
uint64_t get_invalid_samples_num() const { return invalid_decoded_samples_num; }
private:
// TODO: optimize to use unordered_map
std::map<inst_id_t, std::unique_ptr<SampleInstruction>> samples = {};
std::atomic<uint64_t> valid_decoded_samples_num = {};
std::atomic<uint64_t> invalid_decoded_samples_num = {};
mutable std::shared_mutex mut = {};
};
// Free functions of the address-translation client. Only the declarations are
// visible in this header; the notes below are inferred from names and return
// types and should be confirmed against the definitions.

// Returns a reference to a std::mutex -- presumably a single process-wide
// instance used to serialize the client; TODO confirm.
std::mutex&
get_global_mutex();
// Returns a reference to the CodeobjAddressTranslate instance -- presumably shared; TODO confirm.
CodeobjAddressTranslate&
get_address_translator();
// Returns a reference to the KernelObjectMap registry -- presumably shared; TODO confirm.
KernelObjectMap&
get_kernel_object_map();
// Returns a reference to the FlatProfile accumulator -- presumably shared; TODO confirm.
FlatProfile&
get_flat_profile();
// Presumably prints/writes the accumulated flat profile; output format and
// destination are not visible here.
void
dump_flat_profile();
// Presumably sets up the state returned by the accessors above; TODO confirm.
void
init();
// Presumably the teardown counterpart of init(); TODO confirm.
void
fini();
} // namespace address_translation
} // namespace client