Host trap PC sampling uses new record type (#1207)

* Host trap PC sampling uses new record type

* removing redundant field

* formatting

* simplifying templates in the parser - no need for HostTrap boolean

* reviving some parser tests

* hw_id decoding on GFX9

* HW id parser test

* parser CID test

* Parser multigpu test

* removing rocprofiler_pc_sampling_record_t and some fields from hw_id

* simplifying parser context

* keep bench test internally

* initializing gfx9_hw_id_t differently

* anonymous struct first

* avoiding inlining initialization of struct

[ROCm/rocprofiler-sdk commit: bc52c17e64]
Cette révision appartient à :
Vladimir Indic
2024-11-20 21:02:47 +01:00
révisé par GitHub
Parent 3291f05e2e
révision 42c6ffc0eb
21 fichiers modifiés avec 1319 ajouts et 738 suppressions
+12 -10
Voir le fichier
@@ -245,7 +245,8 @@ configure_pc_sampling_prefer_stochastic(tool_agent_info* agent_info,
picked_cfg->method,
picked_cfg->unit,
interval,
buffer_id);
buffer_id,
0);
if(status == ROCPROFILER_STATUS_SUCCESS)
{
*utils::get_output_stream()
@@ -305,24 +306,25 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/,
}
else if(cur_header->category == ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING)
{
if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_SAMPLE)
if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE)
{
auto* pc_sample =
static_cast<rocprofiler_pc_sampling_record_t*>(cur_header->payload);
auto* pc_sample = static_cast<rocprofiler_pc_sampling_record_host_trap_v0_t*>(
cur_header->payload);
ss << "(code_obj_id, offset): (" << pc_sample->pc.loaded_code_object_id << ", 0x"
<< std::hex << pc_sample->pc.loaded_code_object_offset << "), "
ss << "(code_obj_id, offset): (" << pc_sample->pc.code_object_id << ", 0x"
<< std::hex << pc_sample->pc.code_object_offset << "), "
<< "timestamp: " << std::dec << pc_sample->timestamp << ", "
<< "exec: " << std::hex << std::setw(16) << pc_sample->exec_mask << ", "
<< "workgroup_id_(x=" << std::dec << std::setw(5) << pc_sample->workgroup_id.x
<< ", "
<< "y=" << std::setw(5) << pc_sample->workgroup_id.y << ", "
<< "z=" << std::setw(5) << pc_sample->workgroup_id.z << "), "
<< "wave_id: " << std::setw(2) << static_cast<unsigned int>(pc_sample->wave_id)
<< "wave_in_group: " << std::setw(2)
<< static_cast<unsigned int>(pc_sample->wave_in_group) << ", "
<< "chiplet: " << std::setw(2)
<< static_cast<unsigned int>(pc_sample->hw_id.chiplet)
<< ", "
<< "chiplet: " << std::setw(2) << static_cast<unsigned int>(pc_sample->chiplet)
<< ", "
<< "cu_id: " << pc_sample->hw_id << ", "
// << "cu_id: " << pc_sample->hw_id << ", "
<< "correlation: {internal=" << std::setw(7)
<< pc_sample->correlation_id.internal << ", "
<< "external=" << std::setw(5) << pc_sample->correlation_id.external.value << "}"
+2 -1
Voir le fichier
@@ -439,7 +439,8 @@ typedef enum
typedef enum
{
ROCPROFILER_PC_SAMPLING_RECORD_NONE = 0,
ROCPROFILER_PC_SAMPLING_RECORD_SAMPLE, ///< ::rocprofiler_pc_sampling_record_t
ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE, ///< ::rocprofiler_pc_sampling_record_host_trap_v0_t
ROCPROFILER_PC_SAMPLING_RECORD_STOCHASTIC_V0_SAMPLE, ///< for the future use
ROCPROFILER_PC_SAMPLING_RECORD_LAST,
} rocprofiler_pc_sampling_record_kind_t;
+42 -93
Voir le fichier
@@ -99,6 +99,7 @@ ROCPROFILER_EXTERN_C_INIT
* @param [in] unit - The unit appropriate to the PC sampling type/method.
* @param [in] interval - frequency at which PC samples are generated
* @param [in] buffer_id - id of the buffer used for delivering PC samples
* @param [in] flags - for future use
* @return ::rocprofiler_status_t
* @retval ::ROCPROFILER_STATUS_SUCCESS PC sampling service configured successfully
* @retval ::ROCPROFILER_STATUS_ERROR_NOT_AVAILABLE One of the scenarios is present:
@@ -117,7 +118,8 @@ rocprofiler_configure_pc_sampling_service(rocprofiler_context_id_t conte
rocprofiler_pc_sampling_method_t method,
rocprofiler_pc_sampling_unit_t unit,
uint64_t interval,
rocprofiler_buffer_id_t buffer_id) ROCPROFILER_API;
rocprofiler_buffer_id_t buffer_id,
int flags) ROCPROFILER_API;
/**
* @brief PC sampling configuration supported by a GPU agent.
@@ -195,122 +197,69 @@ rocprofiler_query_pc_sampling_agent_configurations(
void* user_data) ROCPROFILER_API ROCPROFILER_NONNULL(2, 3);
/**
* @brief The header of the @ref rocprofiler_pc_sampling_record_t, indicating
* what fields of the @ref rocprofiler_pc_sampling_record_t instance are meaningful
* for the sample.
* @brief Information about the GPU part where wave was executing
* at the moment of sampling.
*/
typedef struct
typedef struct rocprofiler_pc_sampling_hw_id_v0_t
{
uint8_t valid : 1; /// ::rocprofiler_pc_sampling_snapshot_v1_t field is valid
uint8_t type : 4;
uint8_t has_stall_reason : 1;
uint8_t has_wave_cnt : 1;
uint8_t reserved : 1; /// for future use
/// @var type
/// @brief The following values are possible:
/// - 0 - reserved
/// - 1 - host trap pc sample
/// - 2 - stochastic pc sample
/// - 3 - perfcounter (unsupported at the moment)
/// - other values does not mean anything at the moment
/// @var has_stall_reason
/// @brief whether the sample contains information about the stall reason.
/// If so, please @see rocprofiler_pc_sampling_snapshot_v1_t.
/// @var has_wave_cnt
/// @brief whether the @ref rocprofiler_pc_sampling_record_t::wave_count
/// contains meaningful value
} rocprofiler_pc_sampling_header_v1_t;
/**
* @brief For future use.
*
* @todo: Provide the description
* @todo: Should we use bitfields because of C ABI portability?
* @todo: Should we abstract this to be architecture agnostic?
* @todo: Consider having a query to determine organization of this information.
*/
typedef struct
{
uint32_t dual_issue_valu : 1;
uint32_t inst_type : 4;
uint32_t reason_not_issued : 7;
uint32_t arb_state_issue : 10;
uint32_t arb_state_stall : 10;
} rocprofiler_pc_sampling_snapshot_v1_t;
uint64_t chiplet : 6; ///< chiplet index (3 bits allocated by the ROCr runtime)
uint64_t wave_id : 7; ///< wave slot index
uint64_t simd_id : 2; ///< SIMD index
uint64_t pipe_id : 4; ///< pipe index
uint64_t cu_or_wgp_id : 4; ///< Index of compute unit on GFX9 or workgroup processor on other
///< architectures
uint64_t shader_array_id : 1; ///< Shader array index
uint64_t shader_engine_id : 5; ///< Shader engine index
uint64_t workgroup_id : 7; ///< thread_group index on GFX9, and workgroup index on GFX10+
uint64_t vm_id : 6; ///< virtual memory ID
uint64_t queue_id : 4; ///< queue id
uint64_t microengine_id : 2; ///< ACE (microengine) index
uint64_t reserved0 : 16; ///< Reserved for the future use
} rocprofiler_pc_sampling_hw_id_v0_t;
/**
* @brief Sampled program counter.
*/
typedef struct
{
uint64_t loaded_code_object_id;
uint64_t loaded_code_object_offset;
uint64_t code_object_id;
uint64_t code_object_offset;
/// @var loaded_code_object_id
/// @var code_object_id
/// @brief id of the loaded code object instance that contains sampled PC.
/// This field holds the value ::ROCPROFILER_CODE_OBJECT_ID_NONE
/// This fields holds the value ::ROCPROFILER_CODE_OBJECT_ID_NONE
/// if the code object cannot be determined
/// (e.g., sampled PC belongs to code generated by self modifying code).
/// @var loaded_code_object_offset
/// @brief If @ref loaded_code_object_id is different than ::ROCPROFILER_CODE_OBJECT_ID_NONE,
/// @var code_object_offset
/// @brief If @ref code_object_id is different than ::ROCPROFILER_CODE_OBJECT_ID_NONE,
/// then this field contains the offset of the sampled PC relative to the
/// ::rocprofiler_callback_tracing_code_object_load_data_t::load_base
/// of the code object instance with @ref loaded_code_object_id.
/// of the code object instance with @ref code_object_id.
/// To calculate the original virtual address of the sampled PC, one can add the value
/// of this field to the ::rocprofiler_callback_tracing_code_object_load_data_t::load_base.
/// The value of @ref loaded_code_object_offset matches
/// The value of @ref code_object_offset matches
/// the virtual address of the sampled instruction (PC), only if the
/// @ref loaded_code_object_id is equal to the ::ROCPROFILER_CODE_OBJECT_ID_NONE.
/// @ref code_object_id is equal to the ::ROCPROFILER_CODE_OBJECT_ID_NONE.
} rocprofiler_pc_t;
// TODO: The definition of this structure might change over time
// to reduce the space needed to represent a single sample.
// TODO: The definition of this struct might change over time.
/**
* @brief ROCProfiler PC Sampling Record corresponding to the interrupted wave.
* @brief ROCProfiler Host-Trap PC Sampling Record.
*/
typedef struct
typedef struct rocprofiler_pc_sampling_record_host_trap_v0_t
{
uint64_t size; ///< Size of this struct
rocprofiler_pc_sampling_header_v1_t flags;
uint8_t chiplet; ///< chiplet index
uint8_t wave_id; ///< wave identifier within the workgroup
uint8_t wave_issued : 1;
uint8_t reserved : 7; ///< reserved 7 bits, must be zero
uint32_t hw_id; ///< compute unit identifier
rocprofiler_pc_t pc; ///< information about sampled program counter
uint64_t exec_mask;
rocprofiler_dim3_t workgroup_id; ///< wave coordinates within the workgroup
uint32_t wave_count;
uint64_t timestamp; ///< timestamp when sample is generated
rocprofiler_correlation_id_t correlation_id;
rocprofiler_pc_sampling_snapshot_v1_t
snapshot; ///< @see ::rocprofiler_pc_sampling_snapshot_v1_t
uint32_t reserved2; ///< for future use
/// @var flags
/// @brief indicates what fields of this struct are meaningful for the represented sample.
/// The values depend on what the underlying GPU agent architecture supports.
/// @var wave_issued
/// @brief indicates whether the wave is issueing the instruction represented by the @ref pc
/// @var exec_mask
/// @brief shows how many SIMD lanes of the wave were executing the instruction
/// represented by the @ref pc. Useful to understand thread-divergance within the wave
/// @var wave_count
/// @brief number of active waves on the CU at the moment of sample generation
/// @var correlation_id
/// @brief correlation id of the API call that initiated a dispatch of the kernel
/// during whose execution the wave was interrupted at @ref pc.
} rocprofiler_pc_sampling_record_t;
uint64_t size; ///< Size of this struct
rocprofiler_pc_sampling_hw_id_v0_t hw_id; ///< @see ::rocprofiler_pc_sampling_hw_id_v0_t
rocprofiler_pc_t pc; ///< information about sampled program counter
uint64_t exec_mask; ///< active SIMD lanes when sampled
uint64_t timestamp; ///< timestamp when sample is generated
uint64_t dispatch_id; ///< originating kernel dispatch ID
rocprofiler_correlation_id_t correlation_id; ///< API launch call id that matches dispatch ID
rocprofiler_dim3_t workgroup_id; ///< wave coordinates within the workgroup
uint32_t wave_in_group : 8; ///< wave position within the workgroup (0-31)
uint32_t reserved0 : 24; ///< reserved for future use
} rocprofiler_pc_sampling_record_host_trap_v0_t;
/** @} */
ROCPROFILER_EXTERN_C_FINI
ROCPROFILER_CXX_CODE(
static_assert(sizeof(rocprofiler_pc_sampling_record_t) == 88,
"Increasing the size of the pc sampling record is not permitted."));
ROCPROFILER_CXX_CODE(static_assert(offsetof(rocprofiler_pc_sampling_record_t, chiplet) == 9 &&
offsetof(rocprofiler_pc_sampling_record_t, reserved2) == 84,
"PC sampling record layout changed."));
+2 -1
Voir le fichier
@@ -63,7 +63,8 @@ rocprofiler_configure_pc_sampling_service(rocprofiler_context_id_t conte
rocprofiler_pc_sampling_method_t method,
rocprofiler_pc_sampling_unit_t unit,
uint64_t interval,
rocprofiler_buffer_id_t buffer_id)
rocprofiler_buffer_id_t buffer_id,
int /*flags*/)
{
if(!is_pc_sampling_explicitly_enabled()) return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED;
@@ -1,7 +1,7 @@
set(ROCPROFILER_LIB_PC_SAMPLING_PARSER_SOURCES pc_record_interface.cpp)
set(ROCPROFILER_LIB_PC_SAMPLING_PARSER_HEADERS
correlation.hpp gfx9.hpp gfx11.hpp parser_types.h pc_record_interface.hpp rocr.h
translation.hpp)
correlation.hpp gfx9.hpp gfx11.hpp parser_types.hpp pc_record_interface.hpp rocr.h
stochastic_records.h translation.hpp)
target_sources(
rocprofiler-sdk-object-library PRIVATE ${ROCPROFILER_LIB_PC_SAMPLING_PARSER_SOURCES}
@@ -205,13 +205,13 @@ private:
using address_range_t = rocprofiler::sdk::codeobj::segment::address_range_t;
template <bool bHostTrap, typename GFXIP>
template <typename GFXIP, typename PcSamplingRecordT>
inline pcsample_status_t
add_upcoming_samples(const device_handle device,
const generic_sample_t* buffer,
const size_t available_samples,
Parser::CorrelationMap* corr_map,
rocprofiler_pc_sampling_record_t* samples)
add_upcoming_samples(const device_handle device,
const generic_sample_t* buffer,
const size_t available_samples,
Parser::CorrelationMap* corr_map,
PcSamplingRecordT* samples)
{
pcsample_status_t status = PCSAMPLE_STATUS_SUCCESS;
auto cache_addr_range = address_range_t{0, 0, ROCPROFILER_CODE_OBJECT_ID_NONE};
@@ -226,15 +226,14 @@ add_upcoming_samples(const device_handle device,
const auto* snap = reinterpret_cast<const perf_sample_snapshot_v1*>(buffer + p);
auto& pc_sample = samples[p];
pc_sample = copySample<bHostTrap, GFXIP>((const void*) (buffer + p));
pc_sample.size = sizeof(rocprofiler_pc_sampling_record_t);
pc_sample = copySample<GFXIP, PcSamplingRecordT>((const void*) (buffer + p));
// Convert PC -> (loaded code object id containing PC, offset within code object)
if(!cache_addr_range.inrange(snap->pc))
cache_addr_range = table->find_codeobj_in_range(snap->pc);
pc_sample.pc.loaded_code_object_id = cache_addr_range.id;
pc_sample.pc.loaded_code_object_offset = snap->pc - cache_addr_range.addr;
pc_sample.pc.code_object_id = cache_addr_range.id;
pc_sample.pc.code_object_offset = snap->pc - cache_addr_range.addr;
try
{
@@ -251,13 +250,13 @@ add_upcoming_samples(const device_handle device,
return status;
}
template <typename GFXIP>
template <typename GFXIP, typename PcSamplingRecordT>
inline pcsample_status_t
_parse_buffer(generic_sample_t* buffer,
uint64_t buffer_size,
user_callback_t callback,
void* userdata,
Parser::CorrelationMap* corr_map)
_parse_buffer(generic_sample_t* buffer,
uint64_t buffer_size,
user_callback_t<PcSamplingRecordT> callback,
void* userdata,
Parser::CorrelationMap* corr_map)
{
// Maximum size
uint64_t index = 0;
@@ -283,26 +282,31 @@ _parse_buffer(generic_sample_t* buffer,
uint64_t pkt_counter = pkt.num_samples;
if(index + pkt_counter > buffer_size) return PCSAMPLE_STATUS_OUT_OF_BOUNDS_ERROR;
bool bIsHostTrap = pkt.which_sample_type == AMD_HOST_TRAP_V1;
// I don't think we need this.
// bool bIsHostTrap = pkt.which_sample_type == AMD_HOST_TRAP_V1;
while(pkt_counter > 0)
{
rocprofiler_pc_sampling_record_t* samples = nullptr;
PcSamplingRecordT* samples = nullptr;
uint64_t available_samples = callback(&samples, pkt_counter, userdata);
if(available_samples == 0 || available_samples > pkt_counter)
return PCSAMPLE_STATUS_CALLBACK_ERROR;
if(bIsHostTrap)
{
status |= add_upcoming_samples<true, GFXIP>(
pkt.device, buffer + index, available_samples, corr_map, samples);
}
else
{
status |= add_upcoming_samples<false, GFXIP>(
pkt.device, buffer + index, available_samples, corr_map, samples);
}
// I don't think we need if-else here
// if(bIsHostTrap)
// {
// status |= add_upcoming_samples<GFXIP>(
// pkt.device, buffer + index, available_samples, corr_map, samples);
// }
// else
// {
// status |= add_upcoming_samples<GFXIP>(
// pkt.device, buffer + index, available_samples, corr_map, samples);
// }
status |= add_upcoming_samples<GFXIP>(
pkt.device, buffer + index, available_samples, corr_map, samples);
index += available_samples;
pkt_counter -= available_samples;
@@ -329,19 +333,20 @@ _parse_buffer(generic_sample_t* buffer,
* a size smaller than requested, then it may be called again requesting more memory.
* @param[in] userdata parameter forwarded to the user callback.
*/
pcsample_status_t inline parse_buffer(generic_sample_t* buffer,
uint64_t buffer_size,
int gfxip_major,
user_callback_t callback,
void* userdata)
template <typename PcSamplingRecordT>
pcsample_status_t inline parse_buffer(generic_sample_t* buffer,
uint64_t buffer_size,
int gfxip_major,
user_callback_t<PcSamplingRecordT> callback,
void* userdata)
{
static auto corr_map = std::make_unique<Parser::CorrelationMap>();
auto parseSample_func = _parse_buffer<GFX9>;
auto parseSample_func = _parse_buffer<GFX9, PcSamplingRecordT>;
if(gfxip_major == 9)
parseSample_func = _parse_buffer<GFX9>;
parseSample_func = _parse_buffer<GFX9, PcSamplingRecordT>;
else if(gfxip_major == 11)
parseSample_func = _parse_buffer<GFX11>;
parseSample_func = _parse_buffer<GFX11, PcSamplingRecordT>;
else
return PCSAMPLE_STATUS_INVALID_GFXIP;
@@ -81,13 +81,8 @@ enum pcsample_arb_issue_state
};
} // namespace PCSAMPLE
union pcsample_header_v1_t
{
rocprofiler_pc_sampling_header_v1_t flags;
uint8_t raw;
};
typedef uint64_t (*user_callback_t)(rocprofiler_pc_sampling_record_t**, uint64_t, void*);
template <typename PcSamplingRecordT>
using user_callback_t = uint64_t (*)(PcSamplingRecordT**, uint64_t, void*);
/**
* The types of errors to be returned by parse_buffer.
@@ -22,13 +22,31 @@
#include "lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.hpp"
template <>
uint64_t
PCSamplingParserContext::alloc(rocprofiler_pc_sampling_record_t** buffer, uint64_t size)
PCSamplingParserContext::alloc<rocprofiler_pc_sampling_record_host_trap_v0_t>(
rocprofiler_pc_sampling_record_host_trap_v0_t** buffer,
uint64_t size)
{
std::unique_lock<std::shared_mutex> lock(mut);
assert(buffer != nullptr);
data.emplace_back(std::make_unique<PCSamplingData>(size));
*buffer = data.back()->samples.data();
host_trap_data.emplace_back(
std::make_unique<PCSamplingData<rocprofiler_pc_sampling_record_host_trap_v0_t>>(size));
*buffer = host_trap_data.back()->samples.data();
return size;
}
template <>
uint64_t
PCSamplingParserContext::alloc<rocprofiler_pc_sampling_record_stochastic_v0_t>(
rocprofiler_pc_sampling_record_stochastic_v0_t** buffer,
uint64_t size)
{
std::unique_lock<std::shared_mutex> lock(mut);
assert(buffer != nullptr);
stochastic_data.emplace_back(
std::make_unique<PCSamplingData<rocprofiler_pc_sampling_record_stochastic_v0_t>>(size));
*buffer = stochastic_data.back()->samples.data();
return size;
}
@@ -39,10 +57,21 @@ PCSamplingParserContext::parse(const upcoming_samples_t& upcoming,
std::condition_variable& midway_signal,
bool bRocrBufferFlip)
{
bool bIsHostTrap = upcoming.which_sample_type == AMD_HOST_TRAP_V1;
// Template instantiation is faster!
auto parseSample_func = &PCSamplingParserContext::_parse<GFX9>;
auto parseSample_func =
bIsHostTrap
? &PCSamplingParserContext::_parse<GFX9, rocprofiler_pc_sampling_record_host_trap_v0_t>
: &PCSamplingParserContext::_parse<GFX9,
rocprofiler_pc_sampling_record_stochastic_v0_t>;
if(gfxip_major == 11)
parseSample_func = &PCSamplingParserContext::_parse<GFX11>;
parseSample_func =
bIsHostTrap
? &PCSamplingParserContext::_parse<GFX11,
rocprofiler_pc_sampling_record_host_trap_v0_t>
: &PCSamplingParserContext::_parse<GFX11,
rocprofiler_pc_sampling_record_stochastic_v0_t>;
else if(gfxip_major != 9)
return PCSAMPLE_STATUS_INVALID_GFXIP;
@@ -98,11 +127,13 @@ PCSamplingParserContext::shouldFlipRocrBuffer(const dispatch_pkt_id_t& pkt) cons
return corr_map->checkDispatch(pkt);
}
template <typename PcSamplingRecordKindT>
void
PCSamplingParserContext::generate_upcoming_pc_record(
uint64_t agent_id_handle,
const rocprofiler_pc_sampling_record_t* samples,
size_t num_samples)
uint64_t agent_id_handle,
const PcSamplingRecordKindT* samples,
size_t num_samples,
rocprofiler_pc_sampling_record_kind_t record_kind)
{
auto buff_id = _agent_buffers.at(rocprofiler_agent_id_t{agent_id_handle});
rocprofiler::buffer::instance* buff = rocprofiler::buffer::get_buffer(buff_id);
@@ -111,7 +142,28 @@ PCSamplingParserContext::generate_upcoming_pc_record(
throw std::runtime_error(fmt::format("Buffer with id: {} does not exists", buff_id.handle));
for(size_t i = 0; i < num_samples; i++)
buff->emplace(ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING,
ROCPROFILER_PC_SAMPLING_RECORD_SAMPLE,
samples[i]);
buff->emplace(ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING, record_kind, samples[i]);
}
template <>
void
PCSamplingParserContext::generate_upcoming_pc_record<rocprofiler_pc_sampling_record_host_trap_v0_t>(
uint64_t agent_id_handle,
const rocprofiler_pc_sampling_record_host_trap_v0_t* samples,
size_t num_samples)
{
this->generate_upcoming_pc_record(
agent_id_handle, samples, num_samples, ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE);
}
template <>
void
PCSamplingParserContext::generate_upcoming_pc_record<
rocprofiler_pc_sampling_record_stochastic_v0_t>(
uint64_t agent_id_handle,
const rocprofiler_pc_sampling_record_stochastic_v0_t* samples,
size_t num_samples)
{
this->generate_upcoming_pc_record(
agent_id_handle, samples, num_samples, ROCPROFILER_PC_SAMPLING_RECORD_STOCHASTIC_V0_SAMPLE);
}
@@ -24,7 +24,8 @@
#include "lib/rocprofiler-sdk/buffer.hpp"
#include "lib/rocprofiler-sdk/pc_sampling/parser/correlation.hpp"
#include "lib/rocprofiler-sdk/pc_sampling/parser/parser_types.h"
#include "lib/rocprofiler-sdk/pc_sampling/parser/parser_types.hpp"
#include "lib/rocprofiler-sdk/pc_sampling/parser/stochastic_records.h"
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/cxx/hash.hpp>
@@ -41,13 +42,14 @@
#include <thread>
#include <unordered_set>
template <typename PcSamplingRecordT>
struct PCSamplingData
{
PCSamplingData(size_t size)
: samples(size){};
PCSamplingData& operator=(PCSamplingData&) = delete;
std::vector<rocprofiler_pc_sampling_record_t> samples;
std::vector<PcSamplingRecordT> samples;
};
class PCSamplingParserContext
@@ -55,13 +57,16 @@ class PCSamplingParserContext
public:
PCSamplingParserContext()
: corr_map(std::make_unique<Parser::CorrelationMap>()){};
/**
* @brief Allocates some memory. TODO: Translate to Jonathan's buffer implementation.
* @brief Allocates some memory for samples.
* TODO: Translate to Jonathan's buffer implementation.
* @param[out] buffer Pointer where samples are to be written to.
* @param[in] size Number of samples requested.
* @returns Number of samples actually allocated on *buffer.
*/
uint64_t alloc(rocprofiler_pc_sampling_record_t** buffer, uint64_t size);
template <typename PcSamplingRecordT>
uint64_t alloc(PcSamplingRecordT** buffer, uint64_t size);
/**
* @brief Parses a chunk of samples.
@@ -127,7 +132,7 @@ protected:
* @brief Parses the given input data and generates pc sampling records.
* Calls generate_upcoming_pc_record().
*/
template <typename GFX>
template <typename GFX, typename PcSamplingRecordT>
pcsample_status_t _parse(const upcoming_samples_t& upcoming, const generic_sample_t* data_)
{
// std::shared_lock<std::shared_mutex> lock(mut);
@@ -139,16 +144,16 @@ protected:
while(pkt_counter > 0)
{
rocprofiler_pc_sampling_record_t* samples = nullptr;
uint64_t memsize = alloc(&samples, pkt_counter);
PcSamplingRecordT* samples = nullptr;
uint64_t memsize = alloc(&samples, pkt_counter);
if(memsize == 0 || memsize > pkt_counter) return PCSAMPLE_STATUS_CALLBACK_ERROR;
auto* map = corr_map.get();
if(bIsHostTrap)
status |= add_upcoming_samples<true, GFX>(dev, data_, memsize, map, samples);
status |= add_upcoming_samples<GFX>(dev, data_, memsize, map, samples);
else
status |= add_upcoming_samples<false, GFX>(dev, data_, memsize, map, samples);
status |= add_upcoming_samples<GFX>(dev, data_, memsize, map, samples);
data_ += memsize;
pkt_counter -= memsize;
@@ -164,14 +169,26 @@ protected:
*/
pcsample_status_t flushForgetList();
static void generate_id_completion_record(const dispatch_pkt_id_t& pkt) { (void) pkt; };
void generate_upcoming_pc_record(uint64_t agent_id_handle,
const rocprofiler_pc_sampling_record_t* samples,
size_t num_samples);
template <typename PcSamplingRecordT>
void generate_upcoming_pc_record(uint64_t agent_id_handle,
const PcSamplingRecordT* samples,
size_t num_samples);
template <typename PcSamplingRecordT>
void generate_upcoming_pc_record(uint64_t agent_id_handle,
const PcSamplingRecordT* samples,
size_t num_samples,
rocprofiler_pc_sampling_record_kind_t record_kind);
//! Maps doorbells and dispatch_index to correlation_id
std::unique_ptr<Parser::CorrelationMap> corr_map;
//! Data allocated to store samples. Temporary.
std::vector<std::unique_ptr<PCSamplingData>> data;
//! Data allocated to store host trap and stochastic samples, respectively.
//! Temporary solution until we figure out a smooth way to copy data directly to SDK's buffers.
std::vector<std::unique_ptr<PCSamplingData<rocprofiler_pc_sampling_record_host_trap_v0_t>>>
host_trap_data;
std::vector<std::unique_ptr<PCSamplingData<rocprofiler_pc_sampling_record_stochastic_v0_t>>>
stochastic_data;
//! Dispatches not yet completed.
// Uses only the internal correlation_id.
std::unordered_map<uint64_t, dispatch_pkt_id_t> active_dispatches;
@@ -0,0 +1,180 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#pragma once
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/pc_sampling.h>
#include <stdint.h>
/**
* @brief The header of the @ref rocprofiler_pc_sampling_record_stochastic_v0_t, indicating
* which fields of the @ref rocprofiler_pc_sampling_record_stochastic_v0_t instance are meaningful
* for the sample.
*
* The three bit-fields together occupy 8 bits (1 + 1 + 6).
*/
typedef struct rocprofiler_pc_sampling_record_stochastic_header_t
{
uint8_t valid : 1; ///< pc sample is valid
uint8_t has_memory_counter : 1; ///< pc sample provides memory counters information
///< via ::rocprofiler_pc_sampling_memory_counters_t
uint8_t reserved_type : 6; ///< remaining 6 bits of the header; reserved
} rocprofiler_pc_sampling_record_stochastic_header_t;
/**
* @brief Enumeration describing sampled instruction type.
*
* Enumerators are numbered consecutively starting at 0 (VALU), so
* ::ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_LAST equals the number of instruction types.
*/
typedef enum rocprofiler_pc_sampling_instruction_type_t
{
// Open question: do we need ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_NONE = 0? (Some other
// enums define *_NONE.) If so, the parser needs to add an offset of +1 after determining the
// type.
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_VALU = 0,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_MATRIX,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_SCALAR,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_TEX,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_LDS,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_LDS_DIRECT,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_FLAT,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_EXPORT,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_MESSAGE,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BARRIER,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BRANCH_NOT_TAKEN,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BRANCH_TAKEN,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_JUMP,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_OTHER,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_NO_INST,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_DUAL_VALU,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_LAST  // sentinel: one past the last valid type
} rocprofiler_pc_sampling_instruction_type_t;
/**
* @brief Enumeration describing reason for not issuing an instruction.
*
* Enumerators are numbered consecutively starting at 0, so
* ::ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_LAST equals the number of reasons.
*/
// NOTE(review): the enum tag (pcsample_reason_not_issued) differs from the typedef name
// (rocprofiler_pc_sampling_instruction_not_issued_reason_t), unlike the other types declared
// in this header. Confirm whether that is intentional.
typedef enum pcsample_reason_not_issued
{
// Open question: do we need ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_NONE = 0? (Some
// other enums define *_NONE.) If so, the parser needs to add an offset of +1 after determining
// the reason.
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NOT_AVAILABLE = 0,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ALU,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_WAITCNT,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_INTERNAL,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_BARRIER,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_EX_STALL,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_OTHER_WAIT,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_SLEEP,
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_LAST  // sentinel: one past the last reason
} rocprofiler_pc_sampling_instruction_not_issued_reason_t;
/**
* @brief Data provided by stochastic sampling hardware.
*
* The bit-fields below add up to exactly 32 bits:
* 4 (reason) + 1 (reserved0) + 11 (arb_state_issue_*) + 11 (arb_state_stall_*, including
* arb_state_state_reserved) + 1 (dual_issue_valu) + 1 (reserved1) + 3 (reserved2).
*/
typedef struct rocprofiler_pc_sampling_snapshot_v0_t
{
uint32_t
reason_not_issued : 4; ///< The reason for not issuing an instruction.
///< (9 different issue reasons fit in 4 bits)
///< The field takes one of the values defined in
///< @ref ::rocprofiler_pc_sampling_instruction_not_issued_reason_t
uint32_t reserved0 : 1; ///< reserved for future use
uint32_t arb_state_issue_valu : 1; ///< arbiter issued a VALU instruction
uint32_t arb_state_issue_matrix : 1; ///< arbiter issued a matrix instruction
uint32_t arb_state_issue_lds : 1; ///< arbiter issued a LDS instruction
uint32_t arb_state_issue_lds_direct : 1; ///< arbiter issued a LDS direct instruction
uint32_t arb_state_issue_scalar : 1; ///< arbiter issued a scalar (SALU/SMEM) instruction
uint32_t arb_state_issue_vmem_tex : 1; ///< arbiter issued a texture instruction
uint32_t arb_state_issue_flat : 1; ///< arbiter issued a FLAT instruction
uint32_t arb_state_issue_exp : 1; ///< arbiter issued an export instruction
uint32_t arb_state_issue_misc : 1; ///< arbiter issued a miscellaneous instruction
uint32_t arb_state_issue_brmsg : 1; ///< arbiter issued a branch/message instruction
uint32_t arb_state_issue_reserved : 1; ///< reserved for future use
// Replacing `uint32_t arb_state_stall : 10;`
uint32_t arb_state_stall_valu : 1; ///< VALU instruction was stalled when the sample was generated
uint32_t
arb_state_stall_matrix : 1; ///< matrix instruction was stalled when the sample was generated
uint32_t arb_state_stall_lds : 1; ///< LDS instruction was stalled when the sample was generated
uint32_t arb_state_stall_lds_direct : 1; ///< LDS direct instruction was stalled when the sample
///< was generated
uint32_t arb_state_stall_scalar : 1; ///< Scalar (SALU/SMEM) instruction was stalled when the
///< sample was generated
uint32_t arb_state_stall_vmem_tex : 1; ///< texture instruction was stalled when the sample was
///< generated
uint32_t arb_state_stall_flat : 1; ///< flat instruction was stalled when the sample was generated
uint32_t arb_state_stall_exp : 1; ///< export instruction was stalled when the sample was generated
uint32_t arb_state_stall_misc : 1; ///< miscellaneous instruction was stalled when the sample was
///< generated
uint32_t arb_state_stall_brmsg : 1; ///< branch/message instruction was stalled when the sample was
///< generated
// NOTE(review): the name below looks like a typo for `arb_state_stall_reserved` (it closes the
// arb_state_stall_* group, mirroring arb_state_issue_reserved) - confirm before renaming.
uint32_t arb_state_state_reserved : 1; ///< reserved for future use
// We have two reserved bits
uint32_t
dual_issue_valu : 1; ///< two VALU instructions issued for coexecution (MI3xx specific)
uint32_t reserved1 : 1; ///< reserved for future use
uint32_t reserved2 : 3; ///< reserved for future use
} rocprofiler_pc_sampling_snapshot_v0_t;
/**
* @brief Counters of issued instructions.
*
* The bit-fields add up to exactly 32 bits (6 + 6 + 3 + 6 + 6 + 5).
*/
typedef struct rocprofiler_pc_sampling_memory_counters_t
{
uint32_t load_cnt : 6; ///< Counts the number of VMEM load instructions issued but not yet
///< completed.
uint32_t store_cnt : 6; ///< Counts the number of VMEM store instructions issued but not yet
///< completed.
uint32_t
bvh_cnt : 3; ///< Counts the number of VMEM BVH instructions issued but not yet completed.
uint32_t sample_cnt : 6; ///< Counts the number of VMEM sample instructions issued but not yet
///< completed.
uint32_t ds_cnt : 6; ///< Counts the number of LDS instructions issued but not yet completed.
uint32_t km_cnt : 5; ///< Counts the number of scalar memory reads and memory instructions
///< issued but not yet completed.
} rocprofiler_pc_sampling_memory_counters_t;
/**
* @brief ROCProfiler Stochastic PC Sampling Record.
*
* Describes one PC sample. The @c wave_issued, @c inst_type and @c reserved
* bit-fields are declared with widths 1 + 5 + 2 = 8 bits.
*/
typedef struct rocprofiler_pc_sampling_record_stochastic_v0_t
{
// TODO: use size to know whether memory counters exist or not
uint64_t size; ///< Size of this struct
rocprofiler_pc_sampling_record_stochastic_header_t
flags; ///< defines what fields are relevant for the sample
uint8_t wave_in_group; ///< wave position within the workgroup (0-15)
uint8_t wave_issued : 1; ///< wave issued the instruction represented with the PC
uint8_t inst_type : 5; ///< instruction type, takes a value defined in @ref
///< ::rocprofiler_pc_sampling_instruction_type_t
uint8_t reserved : 2; ///< reserved 2 bits must be zero
rocprofiler_pc_sampling_hw_id_v0_t hw_id; ///< @see ::rocprofiler_pc_sampling_hw_id_v0_t
rocprofiler_pc_t pc; ///< information about sampled program counter
uint64_t exec_mask; ///< active SIMD lanes at the moment of sampling
rocprofiler_dim3_t workgroup_id; ///< wave coordinates within the workgroup
uint32_t wave_count; ///< active waves on the CU at the moment of sampling
uint64_t timestamp; ///< timestamp when sample is generated
uint64_t dispatch_id; ///< originating kernel dispatch ID
rocprofiler_correlation_id_t correlation_id; ///< @see ::rocprofiler_correlation_id_t
rocprofiler_pc_sampling_snapshot_v0_t
snapshot; ///< @see ::rocprofiler_pc_sampling_snapshot_v0_t
rocprofiler_pc_sampling_memory_counters_t
memory_counters; ///< @see ::rocprofiler_pc_sampling_memory_counters_t
} rocprofiler_pc_sampling_record_stochastic_v0_t;
@@ -31,6 +31,7 @@
* Benchmarks how fast the parser can process samples on a single threaded case
* Current: 5600X with -Ofast, up to >140 million samples/s or ~9GB/s R/W (18GB/s bidirectional)
*/
template <typename PcSamplingRecordT>
static bool
Benchmark(bool bWarmup)
{
@@ -38,14 +39,16 @@ Benchmark(bool bWarmup)
constexpr size_t DISP_PER_QUEUE = 8;
constexpr size_t NUM_QUEUES = 4;
std::shared_ptr<MockRuntimeBuffer> buffer = std::make_shared<MockRuntimeBuffer>();
std::array<std::vector<std::shared_ptr<MockDispatch>>, NUM_QUEUES> active_dispatches;
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
std::array<std::vector<std::shared_ptr<MockDispatch<PcSamplingRecordT>>>, NUM_QUEUES>
active_dispatches;
for(size_t q = 0; q < NUM_QUEUES; q++)
{
std::shared_ptr<MockQueue> queue = std::make_shared<MockQueue>(DISP_PER_QUEUE * 2, buffer);
auto queue = std::make_shared<MockQueue<PcSamplingRecordT>>(DISP_PER_QUEUE * 2, buffer);
for(size_t d = 0; d < DISP_PER_QUEUE; d++)
active_dispatches[q].push_back(std::make_shared<MockDispatch>(queue));
active_dispatches[q].push_back(
std::make_shared<MockDispatch<PcSamplingRecordT>>(queue));
}
constexpr size_t TOTAL_NUM_SAMPLES = NUM_QUEUES * DISP_PER_QUEUE * SAMPLE_PER_DISPATCH;
@@ -56,23 +59,24 @@ Benchmark(bool bWarmup)
for(size_t i = 0; i < SAMPLE_PER_DISPATCH; i++)
MockWave(dispatch).genPCSample();
std::pair<rocprofiler_pc_sampling_record_t*, size_t> userdata;
userdata.first = new rocprofiler_pc_sampling_record_t[TOTAL_NUM_SAMPLES];
std::pair<PcSamplingRecordT*, size_t> userdata;
userdata.first = new PcSamplingRecordT[TOTAL_NUM_SAMPLES];
userdata.second = TOTAL_NUM_SAMPLES;
auto t0 = std::chrono::system_clock::now();
CHECK_PARSER(parse_buffer(
(generic_sample_t*) buffer->packets.data(),
buffer->packets.size(),
GFXIP_MAJOR,
[](rocprofiler_pc_sampling_record_t** sample, uint64_t size, void* userdata_) {
auto* pair =
reinterpret_cast<std::pair<rocprofiler_pc_sampling_record_t*, size_t>*>(userdata_);
user_callback_t<PcSamplingRecordT> user_cb =
[](PcSamplingRecordT** sample, uint64_t size, void* userdata_) {
auto* pair = reinterpret_cast<std::pair<PcSamplingRecordT*, size_t>*>(userdata_);
assert(TOTAL_NUM_SAMPLES == pair->second);
*sample = pair->first;
return size;
},
&userdata));
};
auto t0 = std::chrono::system_clock::now();
CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(),
buffer->packets.size(),
GFXIP_MAJOR,
user_cb,
&userdata));
auto t1 = std::chrono::system_clock::now();
float samples_per_us = float(TOTAL_NUM_SAMPLES) / (t1 - t0).count() * 1E3f;
@@ -80,8 +84,7 @@ Benchmark(bool bWarmup)
{
std::cout << "Benchmark: Parsed " << int(samples_per_us * 1E3f + 0.5f) * 1E-3f
<< " Msample/s (";
std::cout << int(sizeof(rocprofiler_pc_sampling_record_t) * samples_per_us) << " MB/s)"
<< std::endl;
std::cout << int(sizeof(PcSamplingRecordT) * samples_per_us) << " MB/s)" << std::endl;
}
delete[] userdata.first;
@@ -90,7 +93,12 @@ Benchmark(bool bWarmup)
TEST(pcs_parser, benchmark_test)
{
EXPECT_EQ(Benchmark(true), true);
EXPECT_EQ(Benchmark(false), true);
EXPECT_EQ(Benchmark(false), true);
// Tests for host trap v0 records
EXPECT_EQ(Benchmark<rocprofiler_pc_sampling_record_host_trap_v0_t>(true), true);
EXPECT_EQ(Benchmark<rocprofiler_pc_sampling_record_host_trap_v0_t>(false), true);
EXPECT_EQ(Benchmark<rocprofiler_pc_sampling_record_host_trap_v0_t>(false), true);
// tests for stochastic v0 records
EXPECT_EQ(Benchmark<rocprofiler_pc_sampling_record_stochastic_v0_t>(true), true);
EXPECT_EQ(Benchmark<rocprofiler_pc_sampling_record_stochastic_v0_t>(false), true);
EXPECT_EQ(Benchmark<rocprofiler_pc_sampling_record_stochastic_v0_t>(false), true);
}
@@ -33,15 +33,15 @@ std::mt19937 rdgen(1);
/**
* Sample user memory allocation callback.
* It expects userdata to be cast-able to a pointer to
* std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>>
* std::vector<std::pair<PcSamplingRecordT*, uint64_t>>
*/
template <typename PcSamplingRecordT>
static uint64_t
alloc_callback(rocprofiler_pc_sampling_record_t** buffer, uint64_t size, void* userdata)
alloc_callback(PcSamplingRecordT** buffer, uint64_t size, void* userdata)
{
*buffer = new rocprofiler_pc_sampling_record_t[size];
*buffer = new PcSamplingRecordT[size];
auto& vector =
*reinterpret_cast<std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>>*>(
userdata);
*reinterpret_cast<std::vector<std::pair<PcSamplingRecordT*, uint64_t>>*>(userdata);
vector.push_back({*buffer, size});
return size;
}
@@ -50,35 +50,34 @@ alloc_callback(rocprofiler_pc_sampling_record_t** buffer, uint64_t size, void* u
* Uses the MockWave dispatch's unique_id stored in the pc field to verify
* the reconstructed correlation_id.
*/
template <typename PcSamplingRecordT>
static bool
check_samples(rocprofiler_pc_sampling_record_t* samples, uint64_t size)
check_samples(PcSamplingRecordT* samples, uint64_t size)
{
// TODO: replace with (code_obj_id, pc)
for(size_t i = 0; i < size; i++)
if(samples[i].correlation_id.internal != samples[i].pc.loaded_code_object_offset)
return false;
if(samples[i].correlation_id.internal != samples[i].pc.code_object_offset) return false;
return true;
}
/**
* Simplest mock classes use, generates a single queue+dispatch with 2 PC samples.
*/
TEST(pcs_parser, hello_world)
template <typename PcSamplingRecordT>
void
pcs_parser_hello_world()
{
std::shared_ptr<MockRuntimeBuffer> buffer = std::make_shared<MockRuntimeBuffer>();
std::shared_ptr<MockQueue> queue = std::make_shared<MockQueue>(16, buffer);
std::shared_ptr<MockDispatch> dispatch = std::make_shared<MockDispatch>(queue);
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
auto queue = std::make_shared<MockQueue<PcSamplingRecordT>>(16, buffer);
auto dispatch = std::make_shared<MockDispatch<PcSamplingRecordT>>(queue);
buffer->genUpcomingSamples(2);
MockWave(dispatch).genPCSample();
MockWave(dispatch).genPCSample();
std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>> all_allocations;
std::vector<std::pair<PcSamplingRecordT*, uint64_t>> all_allocations;
CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(),
buffer->packets.size(),
GFXIP_MAJOR,
alloc_callback,
alloc_callback<PcSamplingRecordT>,
(void*) &all_allocations));
EXPECT_EQ(all_allocations.size(), 1); // HelloWorld: Incorrect number of callbacks
@@ -91,23 +90,34 @@ TEST(pcs_parser, hello_world)
}
}
/**
* Simplest mock classes use, generates a single queue+dispatch with 2 PC samples.
*/
TEST(pcs_parser, hello_world)
{
pcs_parser_hello_world<rocprofiler_pc_sampling_record_host_trap_v0_t>();
pcs_parser_hello_world<rocprofiler_pc_sampling_record_stochastic_v0_t>();
}
/**
* A little more complicated.
* Generates a few dispatches for 2 different queues and samples in forward and reverse order.
* Checks if the reconstructed correlation_id is correct.
*/
TEST(pcs_parser, reverse_wave_order)
template <typename PcSamplingRecordT>
void
pcs_parser_reverse_wave_order()
{
std::shared_ptr<MockRuntimeBuffer> buffer = std::make_shared<MockRuntimeBuffer>();
std::shared_ptr<MockQueue> queue1 = std::make_shared<MockQueue>(16, buffer);
std::shared_ptr<MockQueue> queue2 = std::make_shared<MockQueue>(16, buffer);
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
auto queue1 = std::make_shared<MockQueue<PcSamplingRecordT>>(16, buffer);
auto queue2 = std::make_shared<MockQueue<PcSamplingRecordT>>(16, buffer);
std::vector<std::shared_ptr<MockDispatch>> dispatches;
dispatches.push_back(std::make_shared<MockDispatch>(queue1));
dispatches.push_back(std::make_shared<MockDispatch>(queue1));
dispatches.push_back(std::make_shared<MockDispatch>(queue2));
dispatches.push_back(std::make_shared<MockDispatch>(queue2));
dispatches.push_back(std::make_shared<MockDispatch>(queue1));
std::vector<std::shared_ptr<MockDispatch<PcSamplingRecordT>>> dispatches;
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue1));
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue1));
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue2));
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue2));
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue1));
buffer->genUpcomingSamples(dispatches.size());
for(auto it = dispatches.rbegin(); it != dispatches.rend(); it++)
@@ -116,12 +126,12 @@ TEST(pcs_parser, reverse_wave_order)
for(auto it = dispatches.begin(); it != dispatches.end(); it++)
MockWave(*it).genPCSample();
std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>> all_allocations;
std::vector<std::pair<PcSamplingRecordT*, uint64_t>> all_allocations;
CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(),
buffer->packets.size(),
GFXIP_MAJOR,
alloc_callback,
alloc_callback<PcSamplingRecordT>,
(void*) &all_allocations));
EXPECT_EQ(all_allocations.size(), 2); // ReverseWaveOrder test: Incorrect number of callbacks
@@ -135,29 +145,33 @@ TEST(pcs_parser, reverse_wave_order)
}
}
/**
* Creates a small queue and causes the dispatch_ids to wrap around a few times, and generates
* a single sample per dispatch. Checks the parser is properly handling the wrapping of queues.
*/
TEST(pcs_parser, dispatch_wrapping)
TEST(pcs_parser, reverse_wave_order)
{
const int num_samples = 32;
std::shared_ptr<MockRuntimeBuffer> buffer = std::make_shared<MockRuntimeBuffer>();
std::shared_ptr<MockQueue> queue = std::make_shared<MockQueue>(5, buffer);
pcs_parser_reverse_wave_order<rocprofiler_pc_sampling_record_host_trap_v0_t>();
pcs_parser_reverse_wave_order<rocprofiler_pc_sampling_record_stochastic_v0_t>();
}
template <typename PcSamplingRecordT>
void
pcs_parser_dispatch_wrapping()
{
const int num_samples = 32;
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
auto queue = std::make_shared<MockQueue<PcSamplingRecordT>>(5, buffer);
for(int i = 0; i < num_samples; i++)
{
auto dispatch = std::make_shared<MockDispatch>(queue);
auto dispatch = std::make_shared<MockDispatch<PcSamplingRecordT>>(queue);
buffer->genUpcomingSamples(1);
MockWave(dispatch).genPCSample();
}
std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>> all_allocations;
std::vector<std::pair<PcSamplingRecordT*, uint64_t>> all_allocations;
CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(),
buffer->packets.size(),
GFXIP_MAJOR,
alloc_callback,
alloc_callback<PcSamplingRecordT>,
(void*) &all_allocations));
EXPECT_EQ(all_allocations.size(),
@@ -172,39 +186,47 @@ TEST(pcs_parser, dispatch_wrapping)
}
/**
* Creates a few queues with a few dispatches per queue.
* Adds random samples per dispatch, and checks the result.
* Creates a small queue and causes the dispatch_ids to wrap around a few times, and generates
* a single sample per dispatch. Checks the parser is properly handling the wrapping of queues.
*/
TEST(pcs_parser, random_samples)
TEST(pcs_parser, dispatch_wrapping)
{
const int num_samples = 1024;
std::shared_ptr<MockRuntimeBuffer> buffer = std::make_shared<MockRuntimeBuffer>();
std::shared_ptr<MockQueue> queue1 = std::make_shared<MockQueue>(16, buffer);
std::shared_ptr<MockQueue> queue2 = std::make_shared<MockQueue>(16, buffer);
std::shared_ptr<MockQueue> queue3 = std::make_shared<MockQueue>(16, buffer);
std::shared_ptr<MockQueue> queue4 = std::make_shared<MockQueue>(16, buffer);
pcs_parser_dispatch_wrapping<rocprofiler_pc_sampling_record_host_trap_v0_t>();
pcs_parser_dispatch_wrapping<rocprofiler_pc_sampling_record_stochastic_v0_t>();
}
std::vector<std::shared_ptr<MockDispatch>> dispatches;
dispatches.push_back(std::make_shared<MockDispatch>(queue1));
dispatches.push_back(std::make_shared<MockDispatch>(queue1));
dispatches.push_back(std::make_shared<MockDispatch>(queue2));
dispatches.push_back(std::make_shared<MockDispatch>(queue3));
dispatches.push_back(std::make_shared<MockDispatch>(queue1));
dispatches.push_back(std::make_shared<MockDispatch>(queue3));
dispatches.push_back(std::make_shared<MockDispatch>(queue3));
dispatches.push_back(std::make_shared<MockDispatch>(queue2));
dispatches.push_back(std::make_shared<MockDispatch>(queue1));
template <typename PcSamplingRecordT>
void
pcs_parser_random_samples()
{
const int num_samples = 1024;
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
auto queue1 = std::make_shared<MockQueue<PcSamplingRecordT>>(16, buffer);
auto queue2 = std::make_shared<MockQueue<PcSamplingRecordT>>(16, buffer);
auto queue3 = std::make_shared<MockQueue<PcSamplingRecordT>>(16, buffer);
auto queue4 = std::make_shared<MockQueue<PcSamplingRecordT>>(16, buffer);
std::vector<std::shared_ptr<MockDispatch<PcSamplingRecordT>>> dispatches;
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue1));
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue1));
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue2));
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue3));
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue1));
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue3));
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue3));
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue2));
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue1));
buffer->genUpcomingSamples(num_samples);
for(int i = 0; i < num_samples; i++)
MockWave(dispatches[rdgen() % dispatches.size()]).genPCSample();
std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>> all_allocations;
std::vector<std::pair<PcSamplingRecordT*, uint64_t>> all_allocations;
CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(),
buffer->packets.size(),
GFXIP_MAJOR,
alloc_callback,
alloc_callback<PcSamplingRecordT>,
(void*) &all_allocations));
EXPECT_EQ(all_allocations.size(), 1); // RandomSamples test: Incorrect number of callbacks
@@ -218,21 +240,29 @@ TEST(pcs_parser, random_samples)
}
/**
* Hammers the parser by creating and destroying queues at random, adding dispatches at random
* and generating PC samples at random. By default we use all 4 unique doorbells,
* queue size is 16 and we generate 10k samples dispatch.
* Creates a few queues with a few dispatches per queue.
* Adds random samples per dispatch, and checks the result.
*/
TEST(pcs_parser, queue_hammer)
TEST(pcs_parser, random_samples)
{
pcs_parser_random_samples<rocprofiler_pc_sampling_record_host_trap_v0_t>();
pcs_parser_random_samples<rocprofiler_pc_sampling_record_stochastic_v0_t>();
}
template <typename PcSamplingRecordT>
void
pcs_parser_queue_hammer()
{
constexpr int NUM_ACTIONS = 10000;
constexpr int QSIZE = 16;
constexpr int NUM_QUEUES = MockDoorBell::num_unique_bells;
constexpr int ACTION_MAX = QSIZE * NUM_QUEUES / 2;
std::shared_ptr<MockRuntimeBuffer> buffer = std::make_shared<MockRuntimeBuffer>();
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
std::array<std::shared_ptr<MockQueue>, NUM_QUEUES> queues;
std::array<std::vector<std::shared_ptr<MockDispatch>>, NUM_QUEUES> active_dispatches;
std::array<std::shared_ptr<MockQueue<PcSamplingRecordT>>, NUM_QUEUES> queues;
std::array<std::vector<std::shared_ptr<MockDispatch<PcSamplingRecordT>>>, NUM_QUEUES>
active_dispatches;
int num_reset_queues = 0;
int num_samples_generated = 0;
@@ -241,9 +271,10 @@ TEST(pcs_parser, queue_hammer)
size_t max_q_occupancy = 0;
for(int i = 0; i < NUM_QUEUES; i++)
queues[i] = std::make_shared<MockQueue>(QSIZE, buffer);
queues[i] = std::make_shared<MockQueue<PcSamplingRecordT>>(QSIZE, buffer);
for(int i = 0; i < NUM_QUEUES; i++)
active_dispatches[i].push_back(std::make_shared<MockDispatch>(queues[i]));
active_dispatches[i].push_back(
std::make_shared<MockDispatch<PcSamplingRecordT>>(queues[i]));
for(int i = 0; i < NUM_ACTIONS; i++)
{
@@ -254,7 +285,7 @@ TEST(pcs_parser, queue_hammer)
// Delete queue and create new one
active_dispatches[q] = {};
queues[q].reset();
queues[q] = std::make_shared<MockQueue>(QSIZE, buffer);
queues[q] = std::make_shared<MockQueue<PcSamplingRecordT>>(QSIZE, buffer);
num_reset_queues++;
}
else if(action > ACTION_MAX / 2 && active_dispatches[q].size() > 1)
@@ -267,7 +298,8 @@ TEST(pcs_parser, queue_hammer)
// Add new dispatch
if(active_dispatches[q].size() < QSIZE)
{
active_dispatches[q].push_back(std::make_shared<MockDispatch>(queues[q]));
active_dispatches[q].push_back(
std::make_shared<MockDispatch<PcSamplingRecordT>>(queues[q]));
num_dispatches_generated += 1;
}
@@ -276,7 +308,8 @@ TEST(pcs_parser, queue_hammer)
for(auto& queue : active_dispatches)
{
EXPECT_NE(queue.size(), 0);
std::shared_ptr<MockDispatch> rand_dispatch = queue[rdgen() % queue.size()];
std::shared_ptr<MockDispatch<PcSamplingRecordT>> rand_dispatch =
queue[rdgen() % queue.size()];
MockWave(rand_dispatch).genPCSample();
num_samples_generated += 1;
avg_q_occupancy += queue.size();
@@ -292,20 +325,20 @@ TEST(pcs_parser, queue_hammer)
<< std::endl;
std::cout << "Max queue occupancy: " << max_q_occupancy << "\n\n" << std::endl;
std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>> all_allocations;
std::vector<std::pair<PcSamplingRecordT*, uint64_t>> all_allocations;
CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(),
buffer->packets.size(),
GFXIP_MAJOR,
alloc_callback,
alloc_callback<PcSamplingRecordT>,
(void*) &all_allocations));
EXPECT_EQ(all_allocations.size(),
NUM_ACTIONS); // QueueHammer test: Incorrect number of callbacks
for(auto sb = 0ul; sb < all_allocations.size(); sb++)
{
rocprofiler_pc_sampling_record_t* samples = all_allocations[sb].first;
size_t num_samples = all_allocations[sb].second;
PcSamplingRecordT* samples = all_allocations[sb].first;
size_t num_samples = all_allocations[sb].second;
EXPECT_EQ(num_samples, NUM_QUEUES); // QueueHammer: Incorrect number of samples
EXPECT_EQ(check_samples(samples, num_samples),
@@ -314,12 +347,25 @@ TEST(pcs_parser, queue_hammer)
}
}
TEST(pcs_parser, multi_buffer)
/**
* Hammers the parser by creating and destroying queues at random, adding dispatches at random
* and generating PC samples at random. By default we use all 4 unique doorbells,
* queue size is 16 and we generate 10k samples dispatch.
*/
TEST(pcs_parser, queue_hammer)
{
std::shared_ptr<MockRuntimeBuffer> firstBuffer = std::make_shared<MockRuntimeBuffer>();
std::shared_ptr<MockQueue> queue = std::make_shared<MockQueue>(16, firstBuffer);
std::shared_ptr<MockDispatch> dispatch1 = std::make_shared<MockDispatch>(queue);
std::shared_ptr<MockDispatch> dispatch2 = std::make_shared<MockDispatch>(queue);
pcs_parser_queue_hammer<rocprofiler_pc_sampling_record_host_trap_v0_t>();
pcs_parser_queue_hammer<rocprofiler_pc_sampling_record_stochastic_v0_t>();
}
template <typename PcSamplingRecordT>
void
pcs_parser_multi_buffer()
{
auto firstBuffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
auto queue = std::make_shared<MockQueue<PcSamplingRecordT>>(16, firstBuffer);
auto dispatch1 = std::make_shared<MockDispatch<PcSamplingRecordT>>(queue);
auto dispatch2 = std::make_shared<MockDispatch<PcSamplingRecordT>>(queue);
firstBuffer->genUpcomingSamples(4);
MockWave(dispatch1).genPCSample();
@@ -327,21 +373,21 @@ TEST(pcs_parser, multi_buffer)
MockWave(dispatch1).genPCSample();
MockWave(dispatch2).genPCSample();
std::shared_ptr<MockRuntimeBuffer> secondBuffer = std::make_shared<MockRuntimeBuffer>();
const auto& packets = firstBuffer->packets;
secondBuffer->packets = std::vector<packet_union_t>(packets.begin() + 2, packets.end());
auto secondBuffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
const auto& packets = firstBuffer->packets;
secondBuffer->packets = std::vector<packet_union_t>(packets.begin() + 2, packets.end());
std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>> all_allocations;
std::vector<std::pair<PcSamplingRecordT*, uint64_t>> all_allocations;
CHECK_PARSER(parse_buffer((generic_sample_t*) firstBuffer->packets.data(),
firstBuffer->packets.size(),
GFXIP_MAJOR,
alloc_callback,
alloc_callback<PcSamplingRecordT>,
(void*) &all_allocations));
CHECK_PARSER(parse_buffer((generic_sample_t*) secondBuffer->packets.data(),
secondBuffer->packets.size(),
GFXIP_MAJOR,
alloc_callback,
alloc_callback<PcSamplingRecordT>,
(void*) &all_allocations));
EXPECT_EQ(all_allocations.size(), 2); // MultiBuffer: Incorrect number of callbacks
@@ -352,4 +398,10 @@ TEST(pcs_parser, multi_buffer)
delete[] all_allocations[0].first;
delete[] all_allocations[1].first;
};
}
TEST(pcs_parser, multi_buffer)
{
pcs_parser_multi_buffer<rocprofiler_pc_sampling_record_host_trap_v0_t>();
pcs_parser_multi_buffer<rocprofiler_pc_sampling_record_stochastic_v0_t>();
}
@@ -100,14 +100,15 @@
ARBCHECK2(ISSUE_EXP); \
ARBCHECK2(ISSUE_MISC);
template <typename PcSamplingRecordT>
class WaveSnapTest
{
public:
WaveSnapTest()
{
buffer = std::make_shared<MockRuntimeBuffer>();
queue = std::make_shared<MockQueue>(16, buffer);
dispatch = std::make_shared<MockDispatch>(queue);
buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
queue = std::make_shared<MockQueue<PcSamplingRecordT>>(16, buffer);
dispatch = std::make_shared<MockDispatch<PcSamplingRecordT>>(queue);
}
void Test()
@@ -140,25 +141,27 @@ public:
dispatch->submit(packet_union_t{.snap = snap});
};
std::shared_ptr<MockRuntimeBuffer> buffer;
std::shared_ptr<MockQueue> queue;
std::shared_ptr<MockDispatch> dispatch;
std::shared_ptr<MockRuntimeBuffer<PcSamplingRecordT>> buffer;
std::shared_ptr<MockQueue<PcSamplingRecordT>> queue;
std::shared_ptr<MockDispatch<PcSamplingRecordT>> dispatch;
};
class WaveCntTest : public WaveSnapTest
template <typename PcSamplingRecordT>
class WaveCntTest : public WaveSnapTest<PcSamplingRecordT>
{
public:
void FillBuffers() override
{
// Loop over all possible wave_cnt
buffer->genUpcomingSamples(max_wave_number);
this->buffer->genUpcomingSamples(max_wave_number);
for(size_t i = 0; i < max_wave_number; i++)
genPCSample(i, GFX9::TYPE_LDS, GFX9::REASON_ALU, GFX9::ISSUE_VALU, GFX9::ISSUE_VALU);
this->genPCSample(
i, GFX9::TYPE_LDS, GFX9::REASON_ALU, GFX9::ISSUE_VALU, GFX9::ISSUE_VALU);
}
void CheckBuffers() override
{
auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
auto parsed = this->buffer->get_parsed_buffer(9); // GFXIP==9
assert(parsed.size() == 1);
assert(parsed[0].size() == max_wave_number);
@@ -166,204 +169,336 @@ public:
assert(parsed[0][i].wave_count == i);
}
const size_t max_wave_number = 64;
std::vector<rocprofiler_pc_sampling_snapshot_v1_t> snapshots;
const size_t max_wave_number = 64;
std::vector<PcSamplingRecordT> snapshots;
};
class InstTypeTest : public WaveSnapTest
// class InstTypeTest : public WaveSnapTest
// {
// public:
// void FillBuffers() override
// {
// // Loop over inst_type_issued
// UNROLL_TYPECHECK();
// buffer->genUpcomingSamples(GFX9::TYPE_LAST);
// for(int i = 0; i < GFX9::TYPE_LAST; i++)
// genPCSample(i, i, GFX9::REASON_ALU, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX);
// }
// void CheckBuffers() override
// {
// auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
// assert(parsed.size() == 1);
// assert(parsed[0].size() == GFX9::TYPE_LAST);
// assert(snapshots.size() == GFX9::TYPE_LAST);
// for(size_t i = 0; i < GFX9::TYPE_LAST; i++)
// assert(snapshots[i].inst_type == parsed[0][i].snapshot.inst_type);
// }
// std::vector<rocprofiler_pc_sampling_snapshot_v1_t> snapshots;
// };
// class StallReasonTest : public WaveSnapTest
// {
// public:
// void FillBuffers() override
// {
// // Loop over reason_not_issued
// UNROLL_REASONCHECK();
// buffer->genUpcomingSamples(GFX9::REASON_LAST);
// for(int i = 0; i < GFX9::REASON_LAST; i++)
// genPCSample(i, GFX9::TYPE_MATRIX, i, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX);
// }
// void CheckBuffers() override
// {
// auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
// assert(parsed.size() == 1);
// assert(parsed[0].size() == GFX9::REASON_LAST);
// assert(snapshots.size() == GFX9::REASON_LAST);
// for(size_t i = 0; i < GFX9::REASON_LAST; i++)
// assert(snapshots[i].reason_not_issued == parsed[0][i].snapshot.reason_not_issued);
// }
// std::vector<rocprofiler_pc_sampling_snapshot_v1_t> snapshots;
// };
// class ArbStateTest : public WaveSnapTest
// {
// public:
// void FillBuffers() override
// {
// // Loop over arb_state_issue
// UNROLL_ARBCHECK();
// buffer->genUpcomingSamples(GFX9::ISSUE_LAST * GFX9::ISSUE_LAST);
// for(int i = 0; i < GFX9::ISSUE_LAST; i++)
// for(int j = 0; j < GFX9::ISSUE_LAST; j++)
// genPCSample(i, GFX9::TYPE_MATRIX, GFX9::REASON_ALU, 1 << i, 1 << j);
// }
// void CheckBuffers() override
// {
// auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
// assert(parsed.size() == 1);
// assert(parsed[0].size() == GFX9::ISSUE_LAST * GFX9::ISSUE_LAST);
// assert(snapshots.size() == GFX9::ISSUE_LAST * GFX9::ISSUE_LAST);
// for(size_t i = 0; i < GFX9::ISSUE_LAST * GFX9::ISSUE_LAST; i++)
// {
// auto& snap = snapshots[i];
// assert(snap.arb_state_issue == parsed[0][i].snapshot.arb_state_issue);
// assert(snap.arb_state_stall == parsed[0][i].snapshot.arb_state_stall);
// }
// }
// std::vector<rocprofiler_pc_sampling_snapshot_v1_t> snapshots;
// };
// class WaveIssueAndErrorTest : public WaveSnapTest
// {
// void FillBuffers() override
// {
// buffer->genUpcomingSamples(16);
// for(int valid = 0; valid <= 1; valid++)
// for(int issued = 0; issued <= 1; issued++)
// for(int dual = 0; dual <= 1; dual++)
// for(int error = 0; error <= 1; error++)
// genPCSample(valid, issued, dual, error);
// }
// void CheckBuffers() override
// {
// const int num_combinations = 16;
// auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
// assert(parsed.size() == 1);
// assert(parsed[0].size() == num_combinations);
// assert(compare.size() == num_combinations);
// for(size_t i = 0; i < num_combinations; i++)
// {
// assert(compare[i].flags.valid == parsed[0][i].flags.valid);
// assert(compare[i].wave_issued == parsed[0][i].wave_issued);
// assert(compare[i].snapshot.dual_issue_valu == parsed[0][i].snapshot.dual_issue_valu);
// }
// }
// union trap_snapshot_v1
// {
// struct
// {
// uint32_t valid : 1;
// uint32_t issued : 1;
// uint32_t dual : 1;
// uint32_t reserved : 23;
// uint32_t error : 1;
// uint32_t reserved2 : 5;
// };
// uint32_t raw;
// };
// void genPCSample(bool valid, bool issued, bool dual, bool error)
// {
// rocprofiler_pc_sampling_record_t sample;
// ::memset(&sample, 0, sizeof(sample));
// // TODO: Since code objects are not mocked, use pc.code_object_offset
// // as the absolute physical address of the mocked PC.
// sample.pc.code_object_offset = dispatch->unique_id;
// sample.correlation_id.internal = dispatch->getMockId().raw;
// sample.flags.valid = valid && !error;
// sample.wave_issued = issued;
// sample.snapshot.dual_issue_valu = dual;
// assert(dispatch.get());
// compare.push_back(sample);
// trap_snapshot_v1 snap;
// snap.valid = valid;
// snap.issued = issued;
// snap.dual = dual;
// snap.error = error;
// perf_sample_snapshot_v1 pss;
// pss.perf_snapshot_data = snap.raw;
// pss.correlation_id = dispatch->getMockId().raw;
// dispatch->submit(std::move(pss));
// };
// std::vector<rocprofiler_pc_sampling_record_t> compare;
// };
template <typename PcSamplingRecordT>
class HwIdTest : public WaveSnapTest<PcSamplingRecordT>
{
public:
void FillBuffers() override
{
// Loop over inst_type_issued
UNROLL_TYPECHECK();
buffer->genUpcomingSamples(GFX9::TYPE_LAST);
for(int i = 0; i < GFX9::TYPE_LAST; i++)
genPCSample(i, i, GFX9::REASON_ALU, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX);
}
void CheckBuffers() override
{
auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
assert(parsed.size() == 1);
assert(parsed[0].size() == GFX9::TYPE_LAST);
assert(snapshots.size() == GFX9::TYPE_LAST);
for(size_t i = 0; i < GFX9::TYPE_LAST; i++)
assert(snapshots[i].inst_type == parsed[0][i].snapshot.inst_type);
}
std::vector<rocprofiler_pc_sampling_snapshot_v1_t> snapshots;
};
class StallReasonTest : public WaveSnapTest
{
public:
void FillBuffers() override
{
// Loop over reason_not_issued
UNROLL_REASONCHECK();
buffer->genUpcomingSamples(GFX9::REASON_LAST);
for(int i = 0; i < GFX9::REASON_LAST; i++)
genPCSample(i, GFX9::TYPE_MATRIX, i, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX);
}
void CheckBuffers() override
{
auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
assert(parsed.size() == 1);
assert(parsed[0].size() == GFX9::REASON_LAST);
assert(snapshots.size() == GFX9::REASON_LAST);
for(size_t i = 0; i < GFX9::REASON_LAST; i++)
assert(snapshots[i].reason_not_issued == parsed[0][i].snapshot.reason_not_issued);
}
std::vector<rocprofiler_pc_sampling_snapshot_v1_t> snapshots;
};
class ArbStateTest : public WaveSnapTest
{
public:
void FillBuffers() override
{
// Loop over arb_state_issue
UNROLL_ARBCHECK();
buffer->genUpcomingSamples(GFX9::ISSUE_LAST * GFX9::ISSUE_LAST);
for(int i = 0; i < GFX9::ISSUE_LAST; i++)
for(int j = 0; j < GFX9::ISSUE_LAST; j++)
genPCSample(i, GFX9::TYPE_MATRIX, GFX9::REASON_ALU, 1 << i, 1 << j);
}
void CheckBuffers() override
{
auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
assert(parsed.size() == 1);
assert(parsed[0].size() == GFX9::ISSUE_LAST * GFX9::ISSUE_LAST);
assert(snapshots.size() == GFX9::ISSUE_LAST * GFX9::ISSUE_LAST);
for(size_t i = 0; i < GFX9::ISSUE_LAST * GFX9::ISSUE_LAST; i++)
{
auto& snap = snapshots[i];
assert(snap.arb_state_issue == parsed[0][i].snapshot.arb_state_issue);
assert(snap.arb_state_stall == parsed[0][i].snapshot.arb_state_stall);
}
}
std::vector<rocprofiler_pc_sampling_snapshot_v1_t> snapshots;
};
class WaveIssueAndErrorTest : public WaveSnapTest
{
void FillBuffers() override
{
buffer->genUpcomingSamples(16);
for(int valid = 0; valid <= 1; valid++)
for(int issued = 0; issued <= 1; issued++)
for(int dual = 0; dual <= 1; dual++)
for(int error = 0; error <= 1; error++)
genPCSample(valid, issued, dual, error);
}
void CheckBuffers() override
{
const int num_combinations = 16;
auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
assert(parsed.size() == 1);
assert(parsed[0].size() == num_combinations);
assert(compare.size() == num_combinations);
for(size_t i = 0; i < num_combinations; i++)
{
assert(compare[i].flags.valid == parsed[0][i].flags.valid);
assert(compare[i].wave_issued == parsed[0][i].wave_issued);
assert(compare[i].snapshot.dual_issue_valu == parsed[0][i].snapshot.dual_issue_valu);
}
}
union trap_snapshot_v1
union gfx9_hw_id_t
{
uint32_t raw;
struct
{
uint32_t valid : 1;
uint32_t issued : 1;
uint32_t dual : 1;
uint32_t reserved : 23;
uint32_t error : 1;
uint32_t reserved2 : 5;
uint32_t wave_id : 4; ///< wave slot index
uint32_t simd_id : 2; ///< SIMD index
uint32_t pipe_id : 2; ///< pipe index
uint32_t cu_id : 4; ///< Index of compute unit on GFX9 or workgroup processer on other
///< architectures
uint32_t shader_array_id : 1; ///< Shared array index
uint32_t shader_engine_id : 3; ///< shared engine index
uint32_t
threadgroup_id : 4; ///< thread_group index on GFX9, and workgroup index on GFX10+
uint32_t vm_id : 4; ///< virtual memory ID
uint32_t queue_id : 3; ///< queue id
uint32_t gfx_context_state_id : 3; ///< GFX context (state) id (only on GFX9) - ignored
uint32_t microengine_id : 2; ///< ACE (microengine) index
};
uint32_t raw;
};
void genPCSample(bool valid, bool issued, bool dual, bool error)
{
rocprofiler_pc_sampling_record_t sample;
::memset(&sample, 0, sizeof(sample));
// TODO: Since code objects are not mocked, use pc.loaded_code_object_offset
// as the absolute physical address of the mocked PC.
sample.pc.loaded_code_object_offset = dispatch->unique_id;
sample.correlation_id.internal = dispatch->getMockId().raw;
sample.flags.valid = valid && !error;
sample.wave_issued = issued;
sample.snapshot.dual_issue_valu = dual;
assert(dispatch.get());
compare.push_back(sample);
trap_snapshot_v1 snap;
snap.valid = valid;
snap.issued = issued;
snap.dual = dual;
snap.error = error;
perf_sample_snapshot_v1 pss;
pss.perf_snapshot_data = snap.raw;
pss.correlation_id = dispatch->getMockId().raw;
dispatch->submit(std::move(pss));
};
std::vector<rocprofiler_pc_sampling_record_t> compare;
};
class WaveOtherFieldsTest : public WaveSnapTest
{
void FillBuffers() override
{
buffer->genUpcomingSamples(3);
genPCSample(1, 2, 3, 4, 5, 6, 7, 8); // Counting
genPCSample(3, 5, 7, 11, 13, 17, 19, 23); // Some prime numbers
genPCSample(23, 19, 17, 13, 11, 7, 5, 3); // Some reversed primes
gfx9_hw_id_t hw_id_val0;
hw_id_val0.wave_id = 0;
hw_id_val0.simd_id = 0;
hw_id_val0.pipe_id = 0;
hw_id_val0.cu_id = 0;
hw_id_val0.shader_array_id = 0;
hw_id_val0.shader_engine_id = 0;
hw_id_val0.threadgroup_id = 0;
hw_id_val0.vm_id = 0;
hw_id_val0.queue_id = 0;
hw_id_val0.gfx_context_state_id = 0;
hw_id_val0.microengine_id = 0;
gfx9_hw_id_t hw_id_val1;
hw_id_val0.wave_id = 15;
hw_id_val0.simd_id = 3;
hw_id_val0.pipe_id = 3;
hw_id_val0.cu_id = 15;
hw_id_val0.shader_array_id = 1;
hw_id_val0.shader_engine_id = 7;
hw_id_val0.threadgroup_id = 15;
hw_id_val0.vm_id = 15;
hw_id_val0.queue_id = 7;
hw_id_val0.gfx_context_state_id = 7;
hw_id_val0.microengine_id = 3;
gfx9_hw_id_t hw_id_val2;
hw_id_val2.wave_id = 7;
hw_id_val2.simd_id = 2;
hw_id_val2.pipe_id = 2;
hw_id_val2.cu_id = 6;
hw_id_val2.shader_array_id = 0;
hw_id_val2.shader_engine_id = 3;
hw_id_val2.threadgroup_id = 8;
hw_id_val2.vm_id = 9;
hw_id_val2.queue_id = 3;
hw_id_val2.gfx_context_state_id = 2;
hw_id_val2.microengine_id = 1;
this->buffer->genUpcomingSamples(3);
genPCSample(hw_id_val0);
genPCSample(hw_id_val1);
genPCSample(hw_id_val2);
}
void CheckBuffers() override
{
auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
auto parsed = this->buffer->get_parsed_buffer(9); // GFXIP==9
assert(parsed.size() == 1);
assert(parsed[0].size() == 3);
assert(compare.size() == 3);
for(size_t i = 0; i < 3; i++)
{
assert(parsed[0][i].flags.has_stall_reason == true);
assert(parsed[0][i].flags.has_wave_cnt == true);
assert(parsed[0][i].flags.reserved == false);
// Comparing individual fields
assert(compare[i].hw_id.wave_id == parsed[0][i].hw_id.wave_id);
assert(compare[i].hw_id.simd_id == parsed[0][i].hw_id.simd_id);
assert(compare[i].hw_id.pipe_id == parsed[0][i].hw_id.pipe_id);
assert(compare[i].hw_id.cu_or_wgp_id == parsed[0][i].hw_id.cu_or_wgp_id);
assert(compare[i].hw_id.shader_array_id == parsed[0][i].hw_id.shader_array_id);
assert(compare[i].hw_id.shader_engine_id == parsed[0][i].hw_id.shader_engine_id);
assert(compare[i].hw_id.workgroup_id == parsed[0][i].hw_id.workgroup_id);
assert(compare[i].hw_id.vm_id == parsed[0][i].hw_id.vm_id);
assert(compare[i].hw_id.queue_id == parsed[0][i].hw_id.queue_id);
assert(compare[i].hw_id.microengine_id == parsed[0][i].hw_id.microengine_id);
}
}
/**
 * Generates one PC sample whose HW_ID register carries the fields of @p hw_id.
 *
 * The expected, already-decoded record is appended to `compare`, and the raw snapshot
 * holding the packed register value is submitted through the mock dispatch.
 *
 * @param hw_id Packed GFX9 HW_ID value; its individual bit-fields are copied into the
 *              expected record and its `raw` word is placed in the submitted snapshot.
 */
void genPCSample(gfx9_hw_id_t hw_id)
{
    // Validate the dispatch before it is dereferenced further down.
    assert(this->dispatch.get());

    PcSamplingRecordT sample;
    ::memset(&sample, 0, sizeof(sample));
    // Unpacking individual fields
    // NOTE: chiplet is tested in a WaveOtherFieldsTest test, because it's not
    // transferred via hw_id, but via the chiplet_and_wave_id field.
    sample.hw_id.wave_id          = hw_id.wave_id;
    sample.hw_id.simd_id          = hw_id.simd_id;
    sample.hw_id.pipe_id          = hw_id.pipe_id;
    sample.hw_id.cu_or_wgp_id     = hw_id.cu_id;
    sample.hw_id.shader_array_id  = hw_id.shader_array_id;
    sample.hw_id.shader_engine_id = hw_id.shader_engine_id;
    sample.hw_id.workgroup_id     = hw_id.threadgroup_id;
    sample.hw_id.vm_id            = hw_id.vm_id;
    sample.hw_id.queue_id         = hw_id.queue_id;
    sample.hw_id.microengine_id   = hw_id.microengine_id;
    // gfx_context_state_id is intentionally not copied: the record has no field for it.
    compare.push_back(sample);

    perf_sample_snapshot_v1 snap;
    ::memset(&snap, 0, sizeof(snap));
    // raw register value
    snap.hw_id          = hw_id.raw;
    snap.correlation_id = this->dispatch->getMockId().raw;
    this->dispatch->submit(snap);
}
std::vector<PcSamplingRecordT> compare;
};
template <typename PcSamplingRecordT>
class WaveOtherFieldsTest : public WaveSnapTest<PcSamplingRecordT>
{
void FillBuffers() override
{
this->buffer->genUpcomingSamples(3);
genPCSample(1, 2, 3, 4, 5, 6, 7); // Counting
genPCSample(3, 5, 7, 11, 13, 17, 19); // Some prime numbers
genPCSample(23, 19, 17, 13, 11, 7, 5); // Some reversed primes
}
void CheckBuffers() override
{
auto parsed = this->buffer->get_parsed_buffer(9); // GFXIP==9
assert(parsed.size() == 1);
assert(parsed[0].size() == 3);
assert(compare.size() == 3);
for(size_t i = 0; i < 3; i++)
{
// TODO: if we decide to test flags, make specialization for
// rocprofiler_pc_sampling_record_stochastic_v0_t
// assert(parsed[0][i].flags.has_stall_reason == true);
// assert(parsed[0][i].flags.has_wave_cnt == true);
// assert(parsed[0][i].flags.reserved == false);
assert(compare[i].exec_mask == parsed[0][i].exec_mask);
assert(compare[i].workgroup_id == parsed[0][i].workgroup_id);
assert(compare[i].chiplet == parsed[0][i].chiplet);
assert(compare[i].wave_id == parsed[0][i].wave_id);
assert(compare[i].hw_id == parsed[0][i].hw_id);
assert(compare[i].hw_id.chiplet == parsed[0][i].hw_id.chiplet);
assert(compare[i].wave_in_group == parsed[0][i].wave_in_group);
// TODO: handle HW_ID as well.
// assert(compare[i].hw_id == parsed[0][i].hw_id);
assert(compare[i].correlation_id.internal == parsed[0][i].correlation_id.internal);
}
}
void genPCSample(int pc, int exec, int blkx, int blky, int blkz, int chip, int wave, int hwid)
void genPCSample(int pc, int exec, int blkx, int blky, int blkz, int chip, int wave)
{
rocprofiler_pc_sampling_record_t sample;
PcSamplingRecordT sample;
::memset(&sample, 0, sizeof(sample));
sample.exec_mask = exec;
@@ -371,13 +506,15 @@ class WaveOtherFieldsTest : public WaveSnapTest
sample.workgroup_id.y = blky;
sample.workgroup_id.z = blkz;
sample.chiplet = chip;
sample.wave_id = wave;
sample.hw_id = hwid;
sample.correlation_id.internal = dispatch->unique_id;
sample.hw_id.chiplet = chip;
sample.wave_in_group = wave;
sample.correlation_id.internal = this->dispatch->unique_id;
compare.push_back(sample);
// We're testing fields common to both perf_sample_host_trap_v1 and
// perf_sample_snapshot_v1, so either struct is suitable here. No need to make a
// specialization.
perf_sample_snapshot_v1 snap;
::memset(&snap, 0, sizeof(snap));
snap.exec_mask = exec;
@@ -386,26 +523,31 @@ class WaveOtherFieldsTest : public WaveSnapTest
snap.workgroup_id_y = blky;
snap.workgroup_id_z = blkz;
snap.chiplet_and_wave_id = (chip << 8) | (wave & 0x3F);
snap.hw_id = hwid;
snap.correlation_id = dispatch->getMockId().raw;
snap.correlation_id = this->dispatch->getMockId().raw;
assert(dispatch.get());
dispatch->submit(snap);
assert(this->dispatch.get());
this->dispatch->submit(snap);
(void) pc;
};
std::vector<rocprofiler_pc_sampling_record_t> compare;
std::vector<PcSamplingRecordT> compare;
};
TEST(pcs_parser, gfx9_test)
{
WaveCntTest{}.Test();
InstTypeTest{}.Test();
StallReasonTest{}.Test();
ArbStateTest{}.Test();
WaveIssueAndErrorTest{}.Test();
WaveOtherFieldsTest{}.Test();
// Tests specific to stochastic sampling only
WaveCntTest<rocprofiler_pc_sampling_record_stochastic_v0_t>{}.Test();
// InstTypeTest{}.Test();
// StallReasonTest{}.Test();
// ArbStateTest{}.Test();
// WaveIssueAndErrorTest{}.Test();
// Tests common to both host trap and stochastic sampling.
HwIdTest<rocprofiler_pc_sampling_record_host_trap_v0_t>{}.Test();
HwIdTest<rocprofiler_pc_sampling_record_stochastic_v0_t>{}.Test();
WaveOtherFieldsTest<rocprofiler_pc_sampling_record_host_trap_v0_t>{}.Test();
WaveOtherFieldsTest<rocprofiler_pc_sampling_record_stochastic_v0_t>{}.Test();
std::cout << "GFX9 Test Done." << std::endl;
}
@@ -46,6 +46,7 @@
/**
* Mimics the rocprofiler buffer sent to the parser.
*/
template <typename PcSamplingRecordT>
class MockRuntimeBuffer
{
public:
@@ -59,18 +60,21 @@ public:
void submit(const packet_union_t& packet) { packets.push_back(packet); };
//! Submits a "upcoming_samples_t" packet signaling the next num_samples packets are PC samples
void genUpcomingSamples(int num_samples)
void genUpcomingSamples(int num_samples, upcoming_sample_t sample_type)
{
packet_union_t uni;
::memset(&uni, 0, sizeof(uni));
uni.upcoming.type = AMD_UPCOMING_SAMPLES;
uni.upcoming.which_sample_type = AMD_SNAPSHOT_V1;
uni.upcoming.which_sample_type = sample_type;
uni.upcoming.num_samples = num_samples;
uni.upcoming.device.handle = device;
submit(uni);
}
std::vector<std::vector<rocprofiler_pc_sampling_record_t>> get_parsed_buffer(int GFXIP_MAJOR)
//! Submits a "upcoming_samples_t" packet signaling the next num_samples packets are PC samples
void genUpcomingSamples(int num_samples);
std::vector<std::vector<PcSamplingRecordT>> get_parsed_buffer(int GFXIP_MAJOR)
{
parsed_data = {};
@@ -83,22 +87,38 @@ public:
return parsed_data;
}
static uint64_t alloc_parse_memory(rocprofiler_pc_sampling_record_t** sample,
uint64_t req_size,
void* userdata)
static uint64_t alloc_parse_memory(PcSamplingRecordT** sample,
uint64_t req_size,
void* userdata)
{
auto* buffer = reinterpret_cast<MockRuntimeBuffer*>(userdata);
buffer->parsed_data.push_back(std::vector<rocprofiler_pc_sampling_record_t>(req_size));
buffer->parsed_data.push_back(std::vector<PcSamplingRecordT>(req_size));
*sample = buffer->parsed_data.back().data();
return req_size;
}
std::vector<packet_union_t> packets;
std::vector<std::vector<rocprofiler_pc_sampling_record_t>> parsed_data;
std::vector<packet_union_t> packets;
std::vector<std::vector<PcSamplingRecordT>> parsed_data;
const uint32_t device;
};
/**
 * Host-trap specialization: announces @p num_samples upcoming samples tagged as
 * AMD_HOST_TRAP_V1 by forwarding to the two-argument overload.
 *
 * Declared `inline` because an explicit specialization of a member function is an
 * ordinary definition; without it, including this file from more than one translation
 * unit of the same binary would produce multiple definitions.
 */
template <>
inline void
MockRuntimeBuffer<rocprofiler_pc_sampling_record_host_trap_v0_t>::genUpcomingSamples(
    int num_samples)
{
    this->genUpcomingSamples(num_samples, AMD_HOST_TRAP_V1);
}
/**
 * Stochastic specialization: announces @p num_samples upcoming samples tagged as
 * AMD_SNAPSHOT_V1 by forwarding to the two-argument overload.
 *
 * Declared `inline` because an explicit specialization of a member function is an
 * ordinary definition; without it, including this file from more than one translation
 * unit of the same binary would produce multiple definitions.
 */
template <>
inline void
MockRuntimeBuffer<rocprofiler_pc_sampling_record_stochastic_v0_t>::genUpcomingSamples(
    int num_samples)
{
    this->genUpcomingSamples(num_samples, AMD_SNAPSHOT_V1);
}
/**
* Mimics a HSA doorbell. Every live instance of this class has an unique ID (handler).
* The handler itself may be not unique considering dead instances.
@@ -149,10 +169,11 @@ private:
* read and write pointers.
* Creating an instance of this class automatically adds a queue creation packet to the buffer.
*/
template <typename PcSamplingRecordT>
class MockQueue
{
public:
MockQueue(int size_, std::shared_ptr<MockRuntimeBuffer>& buffer_)
MockQueue(int size_, std::shared_ptr<MockRuntimeBuffer<PcSamplingRecordT>>& buffer_)
: id(getUniqueId())
, size(size_)
, doorbell()
@@ -184,7 +205,7 @@ public:
const MockDoorBell doorbell;
const uint32_t device;
std::shared_ptr<MockRuntimeBuffer> const buffer;
std::shared_ptr<MockRuntimeBuffer<PcSamplingRecordT>> const buffer;
private:
static size_t getUniqueId()
@@ -198,10 +219,11 @@ private:
* Mimics a kernel dispatch.
* Creating an instance of this class automatically adds a dispatch creation packet to the buffer.
*/
template <typename PcSamplingRecordT>
class MockDispatch
{
public:
MockDispatch(std::shared_ptr<MockQueue>& queue_)
MockDispatch(std::shared_ptr<MockQueue<PcSamplingRecordT>>& queue_)
: queue(queue_)
, dispatch_id(queue->write_index)
, doorbell_id(queue->doorbell.handler)
@@ -251,7 +273,7 @@ public:
<< " ds_id:" << dispatch_id << std::endl;
}
std::shared_ptr<MockQueue> const queue;
std::shared_ptr<MockQueue<PcSamplingRecordT>> const queue;
const size_t dispatch_id;
const size_t doorbell_id;
@@ -273,10 +295,11 @@ private:
* Instead of generating a valid program counter, this class uses the snapshot.pc field to
* store the original dispatch's unique_id for later correctness verification.
*/
template <typename PcSamplingRecordT>
class MockWave
{
public:
MockWave(const std::shared_ptr<MockDispatch>& dispatch_)
MockWave(const std::shared_ptr<MockDispatch<PcSamplingRecordT>>& dispatch_)
: dispatch(dispatch_)
{}
@@ -295,5 +318,5 @@ public:
<< dispatch->unique_id << std::endl;
}
std::shared_ptr<MockDispatch> const dispatch;
std::shared_ptr<MockDispatch<PcSamplingRecordT>> const dispatch;
};
@@ -46,19 +46,20 @@ public:
/**
* Sample user memory allocation callback.
* It expects userdata to be cast-able to a pointer to
* std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>>
* std::vector<std::pair<PcSamplingRecordT*, uint64_t>>
*/
template <typename PcSamplingRecordT>
static uint64_t
alloc_callback(rocprofiler_pc_sampling_record_t** buffer, uint64_t size, void* userdata)
alloc_callback(PcSamplingRecordT** buffer, uint64_t size, void* userdata)
{
*buffer = new rocprofiler_pc_sampling_record_t[size];
*buffer = new PcSamplingRecordT[size];
auto& vector =
*reinterpret_cast<std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>>*>(
userdata);
*reinterpret_cast<std::vector<std::pair<PcSamplingRecordT*, uint64_t>>*>(userdata);
vector.push_back({*buffer, size});
return size;
}
template <typename PcSamplingRecordT>
void
multithread_queue_hammer(size_t tid, Latch* latch)
{
@@ -70,10 +71,11 @@ multithread_queue_hammer(size_t tid, Latch* latch)
constexpr int NUM_QUEUES = MockDoorBell::num_unique_bells / NUM_THREADS;
constexpr int ACTION_MAX = QSIZE * NUM_QUEUES / 2;
std::shared_ptr<MockRuntimeBuffer> buffer = std::make_shared<MockRuntimeBuffer>(tid);
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>(tid);
std::array<std::shared_ptr<MockQueue>, NUM_QUEUES> queues;
std::array<std::vector<std::shared_ptr<MockDispatch>>, NUM_QUEUES> active_dispatches;
std::array<std::shared_ptr<MockQueue<PcSamplingRecordT>>, NUM_QUEUES> queues;
std::array<std::vector<std::shared_ptr<MockDispatch<PcSamplingRecordT>>>, NUM_QUEUES>
active_dispatches;
int num_reset_queues = 0;
int num_samples_generated = 0;
@@ -82,9 +84,10 @@ multithread_queue_hammer(size_t tid, Latch* latch)
size_t max_q_occupancy = 0;
for(int i = 0; i < NUM_QUEUES; i++)
queues[i] = std::make_shared<MockQueue>(QSIZE, buffer);
queues[i] = std::make_shared<MockQueue<PcSamplingRecordT>>(QSIZE, buffer);
for(int i = 0; i < NUM_QUEUES; i++)
active_dispatches[i].push_back(std::make_shared<MockDispatch>(queues[i]));
active_dispatches[i].push_back(
std::make_shared<MockDispatch<PcSamplingRecordT>>(queues[i]));
for(int i = 0; i < NUM_ACTIONS; i++)
{
@@ -95,7 +98,7 @@ multithread_queue_hammer(size_t tid, Latch* latch)
// Delete queue and create new one
active_dispatches[q] = {};
queues[q].reset();
queues[q] = std::make_shared<MockQueue>(QSIZE, buffer);
queues[q] = std::make_shared<MockQueue<PcSamplingRecordT>>(QSIZE, buffer);
num_reset_queues++;
}
else if(action > ACTION_MAX / 2 && active_dispatches[q].size() > 1)
@@ -108,7 +111,8 @@ multithread_queue_hammer(size_t tid, Latch* latch)
// Add new dispatch
if(active_dispatches[q].size() < QSIZE)
{
active_dispatches[q].push_back(std::make_shared<MockDispatch>(queues[q]));
active_dispatches[q].push_back(
std::make_shared<MockDispatch<PcSamplingRecordT>>(queues[q]));
num_dispatches_generated += 1;
}
@@ -117,7 +121,8 @@ multithread_queue_hammer(size_t tid, Latch* latch)
for(auto& queue : active_dispatches)
{
EXPECT_NE(queue.size(), 0);
std::shared_ptr<MockDispatch> rand_dispatch = queue[rdgen() % queue.size()];
std::shared_ptr<MockDispatch<PcSamplingRecordT>> rand_dispatch =
queue[rdgen() % queue.size()];
MockWave(rand_dispatch).genPCSample();
num_samples_generated += 1;
avg_q_occupancy += queue.size();
@@ -127,23 +132,23 @@ multithread_queue_hammer(size_t tid, Latch* latch)
latch->sync();
std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>> all_allocations;
std::vector<std::pair<PcSamplingRecordT*, uint64_t>> all_allocations;
CHECK_PARSER(_parse_buffer<GFX9>((generic_sample_t*) buffer->packets.data(),
buffer->packets.size(),
alloc_callback,
alloc_callback<PcSamplingRecordT>,
(void*) &all_allocations,
&corr_map));
EXPECT_EQ(all_allocations.size(), NUM_ACTIONS); // Incorrect number of callbacks
for(auto sb = 0ul; sb < all_allocations.size(); sb++)
{
rocprofiler_pc_sampling_record_t* samples = all_allocations[sb].first;
size_t num_samples = all_allocations[sb].second;
PcSamplingRecordT* samples = all_allocations[sb].first;
size_t num_samples = all_allocations[sb].second;
EXPECT_EQ(num_samples, NUM_QUEUES);
for(size_t i = 0; i < num_samples; i++)
EXPECT_EQ(samples[i].correlation_id.internal, samples[i].pc.loaded_code_object_offset);
EXPECT_EQ(samples[i].correlation_id.internal, samples[i].pc.code_object_offset);
delete[] samples;
}
}
@@ -152,6 +157,7 @@ multithread_queue_hammer(size_t tid, Latch* latch)
* Benchmarks how fast the parser can process samples on a single threaded case
* Current: 5600X with -Ofast, up to >140 million samples/s or ~9GB/s R/W (18GB/s bidirectional)
*/
template <typename PcSamplingRecordT>
static std::pair<size_t, size_t>
MultiThread_BenchMark(size_t tid, Latch* latch)
{
@@ -161,14 +167,16 @@ MultiThread_BenchMark(size_t tid, Latch* latch)
constexpr size_t DISP_PER_QUEUE = 16;
constexpr size_t NUM_QUEUES = 1;
std::shared_ptr<MockRuntimeBuffer> buffer = std::make_shared<MockRuntimeBuffer>(tid);
std::array<std::vector<std::shared_ptr<MockDispatch>>, NUM_QUEUES> active_dispatches;
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>(tid);
std::array<std::vector<std::shared_ptr<MockDispatch<PcSamplingRecordT>>>, NUM_QUEUES>
active_dispatches;
for(size_t q = 0; q < NUM_QUEUES; q++)
{
std::shared_ptr<MockQueue> queue = std::make_shared<MockQueue>(DISP_PER_QUEUE * 2, buffer);
auto queue = std::make_shared<MockQueue<PcSamplingRecordT>>(DISP_PER_QUEUE * 2, buffer);
for(size_t d = 0; d < DISP_PER_QUEUE; d++)
active_dispatches[q].push_back(std::make_shared<MockDispatch>(queue));
active_dispatches[q].push_back(
std::make_shared<MockDispatch<PcSamplingRecordT>>(queue));
}
constexpr size_t TOTAL_NUM_SAMPLES = NUM_QUEUES * DISP_PER_QUEUE * SAMPLE_PER_DISPATCH;
@@ -179,29 +187,31 @@ MultiThread_BenchMark(size_t tid, Latch* latch)
for(size_t i = 0; i < SAMPLE_PER_DISPATCH; i++)
MockWave(dispatch).genPCSample();
std::pair<rocprofiler_pc_sampling_record_t*, size_t> userdata;
userdata.first = new rocprofiler_pc_sampling_record_t[TOTAL_NUM_SAMPLES];
std::pair<PcSamplingRecordT*, size_t> userdata;
userdata.first = new PcSamplingRecordT[TOTAL_NUM_SAMPLES];
userdata.second = TOTAL_NUM_SAMPLES;
latch->sync();
auto t0 = std::chrono::system_clock::now();
CHECK_PARSER(_parse_buffer<GFX9>(
(generic_sample_t*) buffer->packets.data(),
buffer->packets.size(),
[](rocprofiler_pc_sampling_record_t** sample, uint64_t size, void* userdata_) {
auto* pair =
reinterpret_cast<std::pair<rocprofiler_pc_sampling_record_t*, size_t>*>(userdata_);
*sample = pair->first;
user_callback_t<PcSamplingRecordT> user_cb =
[](PcSamplingRecordT** sample, uint64_t size, void* userdata_) {
auto* pair = reinterpret_cast<std::pair<PcSamplingRecordT*, size_t>*>(userdata_);
*sample = pair->first;
return size;
},
&userdata,
&corr_map));
};
auto t0 = std::chrono::system_clock::now();
CHECK_PARSER(_parse_buffer<GFX9>((generic_sample_t*) buffer->packets.data(),
buffer->packets.size(),
user_cb,
&userdata,
&corr_map));
auto t1 = std::chrono::system_clock::now();
delete[] userdata.first;
return {TOTAL_NUM_SAMPLES, (t1 - t0).count()};
}
template <typename PcSamplingRecordT>
void
multithread_codeobj(size_t tid, Latch* latch)
{
@@ -215,11 +225,11 @@ multithread_codeobj(size_t tid, Latch* latch)
constexpr int NUM_SAMPLES = 50;
constexpr int QSIZE = 16;
auto buffer = std::make_shared<MockRuntimeBuffer>(tid);
auto queue = std::make_shared<MockQueue>(QSIZE, buffer);
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>(tid);
auto queue = std::make_shared<MockQueue<PcSamplingRecordT>>(QSIZE, buffer);
std::pair<rocprofiler_pc_sampling_record_t*, size_t> userdata;
userdata.first = new rocprofiler_pc_sampling_record_t[NUM_SAMPLES];
std::pair<PcSamplingRecordT*, size_t> userdata;
userdata.first = new PcSamplingRecordT[NUM_SAMPLES];
userdata.second = NUM_SAMPLES;
latch->sync();
@@ -227,7 +237,7 @@ multithread_codeobj(size_t tid, Latch* latch)
for(int d = 0; d < NUM_DISPATCH; d++)
{
buffer->packets.clear();
auto dispatch = std::make_shared<MockDispatch>(queue);
auto dispatch = std::make_shared<MockDispatch<PcSamplingRecordT>>(queue);
const size_t pc_base_addr = NUM_SAMPLES * dispatch->unique_id;
table->insert(addr_range_t{pc_base_addr, NUM_SAMPLES, dispatch->unique_id});
@@ -242,25 +252,25 @@ multithread_codeobj(size_t tid, Latch* latch)
dispatch->submit(uni);
}
CHECK_PARSER(_parse_buffer<GFX9>(
(generic_sample_t*) buffer->packets.data(),
buffer->packets.size(),
[](rocprofiler_pc_sampling_record_t** sample, uint64_t size, void* userdata_) {
auto* pair =
reinterpret_cast<std::pair<rocprofiler_pc_sampling_record_t*, size_t>*>(
userdata_);
*sample = pair->first;
user_callback_t<PcSamplingRecordT> user_cb =
[](PcSamplingRecordT** sample, uint64_t size, void* userdata_) {
auto* pair = reinterpret_cast<std::pair<PcSamplingRecordT*, size_t>*>(userdata_);
*sample = pair->first;
assert(size <= NUM_SAMPLES);
return size;
},
&userdata,
&corr_map));
};
CHECK_PARSER(_parse_buffer<GFX9>((generic_sample_t*) buffer->packets.data(),
buffer->packets.size(),
user_cb,
&userdata,
&corr_map));
for(int s = 0; s < NUM_SAMPLES; s++)
{
const auto& pc = userdata.first[s].pc;
EXPECT_EQ(pc.loaded_code_object_id, dispatch->unique_id);
EXPECT_EQ(pc.loaded_code_object_offset, s);
EXPECT_EQ(pc.code_object_id, dispatch->unique_id);
EXPECT_EQ(pc.code_object_offset, s);
}
table->remove(addr_range_t{pc_base_addr, NUM_SAMPLES, dispatch->unique_id});
@@ -269,7 +279,9 @@ multithread_codeobj(size_t tid, Latch* latch)
delete[] userdata.first;
}
TEST(pcs_parser, bench_test)
template <typename PcSamplingRecordT>
void
pcs_parser_bench_test()
{
size_t time = 0;
size_t samples = 0;
@@ -280,7 +292,8 @@ TEST(pcs_parser, bench_test)
std::vector<std::future<std::pair<size_t, size_t>>> threads{};
for(size_t t = 0; t < NUM_THREADS; t++)
threads.push_back(std::async(std::launch::async, MultiThread_BenchMark, t, &latch));
threads.push_back(std::async(
std::launch::async, MultiThread_BenchMark<PcSamplingRecordT>, t, &latch));
if(it == 0) continue; // Skip warmup
@@ -295,23 +308,47 @@ TEST(pcs_parser, bench_test)
double mean = 1E3 * NUM_THREADS * samples / time;
std::cout << "Benchmark: Parsed " << int(mean * 1E3 + 0.5) * 1E-3f << " Msample/s (";
std::cout << int(sizeof(rocprofiler_pc_sampling_record_t) * mean) << " MB/s)" << std::endl;
std::cout << int(sizeof(PcSamplingRecordT) * mean) << " MB/s)" << std::endl;
};
// Runs the parser benchmark once for each record type: host-trap first, then stochastic.
TEST(pcs_parser, bench_test)
{
pcs_parser_bench_test<rocprofiler_pc_sampling_record_host_trap_v0_t>();
pcs_parser_bench_test<rocprofiler_pc_sampling_record_stochastic_v0_t>();
}
/**
 * Starts NUM_THREADS asynchronous multithread_queue_hammer<PcSamplingRecordT> workers that
 * share a single latch.
 *
 * The futures are only stored, never queried explicitly. Futures obtained from std::async
 * wait for their task when destroyed, so the function returns once every worker is done.
 * The latch is declared first so that it outlives the futures.
 */
template <typename PcSamplingRecordT>
void
pcs_parser_hammer_test()
{
    Latch latch(NUM_THREADS);

    std::vector<std::future<void>> workers{};
    for(size_t tid = 0; tid < NUM_THREADS; tid++)
    {
        workers.push_back(std::async(
            std::launch::async, multithread_queue_hammer<PcSamplingRecordT>, tid, &latch));
    }
}
TEST(pcs_parser, hammer_test)
{
Latch latch(NUM_THREADS);
pcs_parser_hammer_test<rocprofiler_pc_sampling_record_host_trap_v0_t>();
pcs_parser_hammer_test<rocprofiler_pc_sampling_record_stochastic_v0_t>();
}
std::vector<std::future<void>> threads{};
for(size_t i = 0; i < NUM_THREADS; i++)
threads.push_back(std::async(std::launch::async, multithread_queue_hammer, i, &latch));
};
TEST(pcs_parser, codeobj_test)
template <typename PcSamplingRecordT>
void
pcs_parser_codeobj_test()
{
Latch latch(NUM_THREADS);
std::vector<std::future<void>> threads{};
for(size_t i = 0; i < NUM_THREADS; i++)
threads.push_back(std::async(std::launch::async, multithread_codeobj, i, &latch));
};
threads.push_back(
std::async(std::launch::async, multithread_codeobj<PcSamplingRecordT>, i, &latch));
}
// Runs the code-object test once for each record type: host-trap first, then stochastic.
TEST(pcs_parser, codeobj_test)
{
pcs_parser_codeobj_test<rocprofiler_pc_sampling_record_host_trap_v0_t>();
pcs_parser_codeobj_test<rocprofiler_pc_sampling_record_stochastic_v0_t>();
}
+253 -153
Voir le fichier
@@ -28,184 +28,284 @@
#include "lib/rocprofiler-sdk/pc_sampling/parser/gfx11.hpp"
#include "lib/rocprofiler-sdk/pc_sampling/parser/gfx9.hpp"
#include "lib/rocprofiler-sdk/pc_sampling/parser/parser_types.h"
#include "lib/rocprofiler-sdk/pc_sampling/parser/parser_types.hpp"
#include "lib/rocprofiler-sdk/pc_sampling/parser/rocr.h"
#include "lib/rocprofiler-sdk/pc_sampling/parser/stochastic_records.h"
template <typename SType>
inline rocprofiler_pc_sampling_record_t
// TODO: refactor the commented code for stochastic sampling
// template <typename gfx>
// inline rocprofiler_pc_sampling_record_t
// copyStochasticSample(const perf_sample_snapshot_v1& sample);
// template <>
// inline rocprofiler_pc_sampling_record_t
// copyStochasticSample<GFX9>(const perf_sample_snapshot_v1& sample)
// {
// rocprofiler_pc_sampling_record_t ret = copySampleHeader<perf_sample_snapshot_v1>(sample);
// ret.flags.valid = sample.perf_snapshot_data & (~sample.perf_snapshot_data >> 26) & 0x1;
// // Check wave_id matches snapshot_wave_id
// ret.flags.has_wave_cnt = true;
// ret.flags.has_stall_reason = true;
// ret.wave_count = sample.perf_snapshot_data1 & 0x3F;
// ret.wave_issued = sample.perf_snapshot_data >> 1;
// ret.snapshot.dual_issue_valu = sample.perf_snapshot_data >> 2;
// ret.snapshot.inst_type = sample.perf_snapshot_data >> 3;
// ret.snapshot.reason_not_issued = (sample.perf_snapshot_data >> 7) & 0x7;
// ret.snapshot.arb_state_issue = (sample.perf_snapshot_data >> 10) & 0xFF;
// ret.snapshot.arb_state_stall = (sample.perf_snapshot_data >> 18) & 0xFF;
// ret.reserved = 0;
// return ret;
// }
// template <>
// inline rocprofiler_pc_sampling_record_t
// copyStochasticSample<GFX11>(const perf_sample_snapshot_v1& sample)
// {
// rocprofiler_pc_sampling_record_t ret = copySampleHeader<perf_sample_snapshot_v1>(sample);
// ret.flags.valid = sample.perf_snapshot_data & (~sample.perf_snapshot_data >> 23) & 0x1;
// // Check wave_id matches snapshot_wave_id
// ret.flags.has_stall_reason = true;
// ret.wave_issued = sample.perf_snapshot_data >> 1;
// ret.snapshot.inst_type = sample.perf_snapshot_data >> 2;
// ret.snapshot.reason_not_issued = (sample.perf_snapshot_data >> 6) & 0x7;
// ret.snapshot.arb_state_issue = (sample.perf_snapshot_data >> 9) & 0x7F;
// ret.snapshot.arb_state_stall = (sample.perf_snapshot_data >> 16) & 0x7F;
// ret.snapshot.dual_issue_valu = false;
// ret.reserved = 0;
// return ret;
// }
// #define BITSHIFT(sname) out |= ((in >> GFX::sname) & 1) << PCSAMPLE::sname
// template <typename GFX>
// inline int
// translate_arb(int in)
// {
// size_t out = 0;
// BITSHIFT(ISSUE_VALU);
// BITSHIFT(ISSUE_MATRIX);
// BITSHIFT(ISSUE_LDS);
// BITSHIFT(ISSUE_LDS_DIRECT);
// BITSHIFT(ISSUE_SCALAR);
// BITSHIFT(ISSUE_VMEM_TEX);
// BITSHIFT(ISSUE_FLAT);
// BITSHIFT(ISSUE_EXP);
// BITSHIFT(ISSUE_MISC);
// BITSHIFT(ISSUE_BRMSG);
// return out & 0x3FF;
// }
// #undef BITSHIFT
// #define LUTOVERLOAD(sname) this->operator[](GFX::sname) = PCSAMPLE::sname
// template <typename GFX>
// class GFX_REASON_LUT : public std::array<int, 32>
// {
// public:
// GFX_REASON_LUT()
// {
// std::memset(data(), 0, size() * sizeof(int));
// LUTOVERLOAD(REASON_NOT_AVAILABLE);
// LUTOVERLOAD(REASON_ALU);
// LUTOVERLOAD(REASON_WAITCNT);
// LUTOVERLOAD(REASON_INTERNAL);
// LUTOVERLOAD(REASON_BARRIER);
// LUTOVERLOAD(REASON_ARBITER);
// LUTOVERLOAD(REASON_EX_STALL);
// LUTOVERLOAD(REASON_OTHER_WAIT);
// LUTOVERLOAD(REASON_SLEEP);
// }
// };
// template <typename GFX>
// class GFX_INST_LUT : public std::array<int, 32>
// {
// public:
// GFX_INST_LUT()
// {
// std::memset(data(), 0, size() * sizeof(int));
// LUTOVERLOAD(TYPE_VALU);
// LUTOVERLOAD(TYPE_MATRIX);
// LUTOVERLOAD(TYPE_SCALAR);
// LUTOVERLOAD(TYPE_TEX);
// LUTOVERLOAD(TYPE_LDS);
// LUTOVERLOAD(TYPE_LDS_DIRECT);
// LUTOVERLOAD(TYPE_FLAT);
// LUTOVERLOAD(TYPE_EXP);
// LUTOVERLOAD(TYPE_MESSAGE);
// LUTOVERLOAD(TYPE_BARRIER);
// LUTOVERLOAD(TYPE_BRANCH_NOT_TAKEN);
// LUTOVERLOAD(TYPE_BRANCH_TAKEN);
// LUTOVERLOAD(TYPE_JUMP);
// LUTOVERLOAD(TYPE_OTHER);
// LUTOVERLOAD(TYPE_NO_INST);
// LUTOVERLOAD(TYPE_DUAL_VALU);
// }
// };
// template <typename GFX>
// inline int
// translate_reason(int in)
// {
// static GFX_REASON_LUT<GFX> lut;
// return lut[in & 0x1F];
// }
// template <typename GFX>
// inline int
// translate_inst(int in)
// {
// static GFX_INST_LUT<GFX> lut;
// return lut[in & 0x1F];
// }
// #undef LUTOVERLOAD
// template <bool HostTrap, typename GFX>
// inline rocprofiler_pc_sampling_record_t
// copySample(const void* sample)
// {
// if(HostTrap) return copyHostTrapSample(*(const perf_sample_host_trap_v1*) sample);
// rocprofiler_pc_sampling_record_t ret =
// copyStochasticSample<GFX>(*(const perf_sample_snapshot_v1*) sample);
// ret.snapshot.inst_type = translate_inst<GFX>(ret.snapshot.inst_type);
// ret.snapshot.arb_state_issue = translate_arb<GFX>(ret.snapshot.arb_state_issue);
// ret.snapshot.arb_state_stall = translate_arb<GFX>(ret.snapshot.arb_state_stall);
// ret.snapshot.reason_not_issued = translate_reason<GFX>(ret.snapshot.reason_not_issued);
// return ret;
// }
// Extracts the inclusive bit range [bit_end:bit_start] of a 32-bit value `val`.
// Every argument and the whole expansion are parenthesized so the macro composes safely
// with neighbouring operators (e.g. `EXTRACT_BITS(x, 7, 4) == 0xA`) and with compound
// argument expressions. The mask is produced by shifting an all-ones word to the right,
// so a full-width range (31, 0) never shifts by the width of the type.
#define EXTRACT_BITS(val, bit_end, bit_start)                                                      \
    (((val) >> (bit_start)) & (0xFFFFFFFFU >> (31 - ((bit_end) - (bit_start)))))
/**
 * @brief Stores the chiplet index of a raw sample into the parsed record.
 *
 * The chiplet index is taken from the bits of `sample.chiplet_and_wave_id` above the low
 * byte and written to `record.hw_id.chiplet`.
 *
 * @tparam GFX Architecture tag; not used by this implementation.
 * @param record Parsed record whose `hw_id.chiplet` field is written.
 * @param sample Raw sample providing `chiplet_and_wave_id`.
 */
template <typename GFX, typename PcSamplingRecordT, typename SType>
inline void
copyChipletId(PcSamplingRecordT& record, const SType& sample)
{
    // Drop the low 8 bits; what remains is the chiplet index.
    const auto chiplet_bits = sample.chiplet_and_wave_id >> 8;
    record.hw_id.chiplet    = chiplet_bits;
}
/**
 * @brief Decodes a raw HW_ID register value into the fields of @p hw_id.
 *
 * Only declared here; the decoding for a given (architecture, hw-id type) pair is supplied
 * by an explicit specialization.
 *
 * @tparam GFX   Architecture tag selecting the register layout.
 * @tparam HwIdT Destination hw-id structure type.
 * @param hw_id     Destination structure receiving the decoded fields.
 * @param hw_id_reg Raw 32-bit HW_ID register value.
 */
template <typename GFX, typename HwIdT>
inline void
copyHwId(HwIdT& hw_id, const uint32_t hw_id_reg);

/**
 * @brief GFX9 decoding of the HW_ID register into rocprofiler_pc_sampling_hw_id_v0_t.
 *
 * Each destination field is filled from the bit range noted next to it. Bits 29:27
 * (state_id) are not copied to the destination.
 */
template <>
inline void
copyHwId<GFX9, rocprofiler_pc_sampling_hw_id_v0_t>(rocprofiler_pc_sampling_hw_id_v0_t& hw_id,
                                                   const uint32_t hw_id_reg)
{
    // 3:0 -> wave_id
    hw_id.wave_id = EXTRACT_BITS(hw_id_reg, 3, 0);
    // 5:4 -> simd_id
    hw_id.simd_id = EXTRACT_BITS(hw_id_reg, 5, 4);
    // 7:6 -> pipe_id
    hw_id.pipe_id = EXTRACT_BITS(hw_id_reg, 7, 6);
    // 11:8 -> cu_id
    hw_id.cu_or_wgp_id = EXTRACT_BITS(hw_id_reg, 11, 8);
    // 12 -> sa_id
    hw_id.shader_array_id = EXTRACT_BITS(hw_id_reg, 12, 12);
    // 15:13 -> se_id
    hw_id.shader_engine_id = EXTRACT_BITS(hw_id_reg, 15, 13);
    // 19:16 -> tg_id
    hw_id.workgroup_id = EXTRACT_BITS(hw_id_reg, 19, 16);
    // 23:20 -> vm_id
    hw_id.vm_id = EXTRACT_BITS(hw_id_reg, 23, 20);
    // 26:24 -> queue_id
    hw_id.queue_id = EXTRACT_BITS(hw_id_reg, 26, 24);
    // 29:27 -> state_id (ignored)
    // 31:30 -> me_id
    hw_id.microengine_id = EXTRACT_BITS(hw_id_reg, 31, 30);
}
#undef EXTRACT_BITS
template <typename PcSamplingRecordT, typename SType>
inline PcSamplingRecordT
copySampleHeader(const SType& sample)
{
rocprofiler_pc_sampling_record_t ret;
ret.flags = pcsample_header_v1_t{.raw = 0}.flags;
ret.flags.type = AMD_SNAPSHOT_V1;
PcSamplingRecordT ret;
// zero out all record fields
std::memset(&ret, 0, sizeof(PcSamplingRecordT));
// Decode fields common for all host-trap and stochastic on all architectures.
ret.size = sizeof(PcSamplingRecordT);
ret.wave_in_group = sample.chiplet_and_wave_id & 0x3F;
ret.exec_mask = sample.exec_mask;
ret.workgroup_id.x = sample.workgroup_id_x;
ret.workgroup_id.y = sample.workgroup_id_y;
ret.workgroup_id.z = sample.workgroup_id_z;
ret.chiplet = sample.chiplet_and_wave_id >> 8;
ret.wave_id = sample.chiplet_and_wave_id & 0x3F;
ret.hw_id = sample.hw_id;
ret.timestamp = sample.timestamp;
return ret;
}
inline rocprofiler_pc_sampling_record_t
copyHostTrapSample(const perf_sample_host_trap_v1& sample)
{
rocprofiler_pc_sampling_record_t ret = copySampleHeader<perf_sample_host_trap_v1>(sample);
ret.flags.type = AMD_HOST_TRAP_V1;
return ret;
}
template <typename gfx>
inline rocprofiler_pc_sampling_record_t
copyStochasticSample(const perf_sample_snapshot_v1& sample);
template <typename GFX, typename PcSamplingRecordT>
inline PcSamplingRecordT
copySample(const void* sample);
/**
* @brief Host trap V0 sample for GFX9
*/
template <>
inline rocprofiler_pc_sampling_record_t
copyStochasticSample<GFX9>(const perf_sample_snapshot_v1& sample)
inline rocprofiler_pc_sampling_record_host_trap_v0_t
copySample<GFX9, rocprofiler_pc_sampling_record_host_trap_v0_t>(const void* sample)
{
rocprofiler_pc_sampling_record_t ret = copySampleHeader<perf_sample_snapshot_v1>(sample);
ret.flags.valid = sample.perf_snapshot_data & (~sample.perf_snapshot_data >> 26) & 0x1;
// Check wave_id matches snapshot_wave_id
ret.flags.has_wave_cnt = true;
ret.flags.has_stall_reason = true;
ret.wave_count = sample.perf_snapshot_data1 & 0x3F;
ret.wave_issued = sample.perf_snapshot_data >> 1;
ret.snapshot.dual_issue_valu = sample.perf_snapshot_data >> 2;
ret.snapshot.inst_type = sample.perf_snapshot_data >> 3;
ret.snapshot.reason_not_issued = (sample.perf_snapshot_data >> 7) & 0x7;
ret.snapshot.arb_state_issue = (sample.perf_snapshot_data >> 10) & 0xFF;
ret.snapshot.arb_state_stall = (sample.perf_snapshot_data >> 18) & 0xFF;
ret.reserved = 0;
const auto& sample_ = *static_cast<const perf_sample_host_trap_v1*>(sample);
auto ret = copySampleHeader<rocprofiler_pc_sampling_record_host_trap_v0_t>(sample_);
copyChipletId<GFX9>(ret, sample_);
copyHwId<GFX9>(ret.hw_id, sample_.hw_id);
// copyHwId<GFX9>(&ret, sample);
return ret;
}
template <>
inline rocprofiler_pc_sampling_record_t
copyStochasticSample<GFX11>(const perf_sample_snapshot_v1& sample)
inline rocprofiler_pc_sampling_record_stochastic_v0_t
copySample<GFX9, rocprofiler_pc_sampling_record_stochastic_v0_t>(const void* sample)
{
rocprofiler_pc_sampling_record_t ret = copySampleHeader<perf_sample_snapshot_v1>(sample);
ret.flags.valid = sample.perf_snapshot_data & (~sample.perf_snapshot_data >> 23) & 0x1;
// Check wave_id matches snapshot_wave_id
ret.flags.has_stall_reason = true;
ret.wave_issued = sample.perf_snapshot_data >> 1;
ret.snapshot.inst_type = sample.perf_snapshot_data >> 2;
ret.snapshot.reason_not_issued = (sample.perf_snapshot_data >> 6) & 0x7;
ret.snapshot.arb_state_issue = (sample.perf_snapshot_data >> 9) & 0x7F;
ret.snapshot.arb_state_stall = (sample.perf_snapshot_data >> 16) & 0x7F;
ret.snapshot.dual_issue_valu = false;
ret.reserved = 0;
const auto& sample_ = *static_cast<const perf_sample_snapshot_v1*>(sample);
auto ret = copySampleHeader<rocprofiler_pc_sampling_record_stochastic_v0_t>(sample_);
copyChipletId<GFX9>(ret, sample_);
copyHwId<GFX9>(ret.hw_id, sample_.hw_id);
ret.wave_count = sample_.perf_snapshot_data1 & 0x3F;
// TODO: implement logic for manipulating stochastic related fields
return ret;
}
#define BITSHIFT(sname) out |= ((in >> GFX::sname) & 1) << PCSAMPLE::sname
template <typename GFX>
inline int
translate_arb(int in)
/**
* @brief Host trap V0 sample for GFX11
*/
template <>
inline rocprofiler_pc_sampling_record_host_trap_v0_t
copySample<GFX11, rocprofiler_pc_sampling_record_host_trap_v0_t>(const void* sample)
{
size_t out = 0;
BITSHIFT(ISSUE_VALU);
BITSHIFT(ISSUE_MATRIX);
BITSHIFT(ISSUE_LDS);
BITSHIFT(ISSUE_LDS_DIRECT);
BITSHIFT(ISSUE_SCALAR);
BITSHIFT(ISSUE_VMEM_TEX);
BITSHIFT(ISSUE_FLAT);
BITSHIFT(ISSUE_EXP);
BITSHIFT(ISSUE_MISC);
BITSHIFT(ISSUE_BRMSG);
return out & 0x3FF;
}
#undef BITSHIFT
#define LUTOVERLOAD(sname) this->operator[](GFX::sname) = PCSAMPLE::sname
/**
 * @brief 32-entry table translating the GFX-specific "reason not issued"
 *        encodings into the corresponding PCSAMPLE values.
 *
 * The table is indexed by the GFX encoding (GFX::REASON_*) and stores the
 * matching PCSAMPLE::REASON_* value. Entries with no explicit mapping are 0.
 * Relies on the LUTOVERLOAD macro being defined at the point of definition.
 *
 * @tparam GFX Architecture descriptor providing the REASON_* encodings.
 */
template <typename GFX>
class GFX_REASON_LUT : public std::array<int, 32>
{
public:
    GFX_REASON_LUT()
    {
        // Clear every slot first so that unmapped encodings translate to 0.
        this->fill(0);

        // Mapping order is kept as-is: if two encodings ever share an index,
        // the later assignment is the one that sticks.
        LUTOVERLOAD(REASON_NOT_AVAILABLE);
        LUTOVERLOAD(REASON_ALU);
        LUTOVERLOAD(REASON_WAITCNT);
        LUTOVERLOAD(REASON_INTERNAL);
        LUTOVERLOAD(REASON_BARRIER);
        LUTOVERLOAD(REASON_ARBITER);
        LUTOVERLOAD(REASON_EX_STALL);
        LUTOVERLOAD(REASON_OTHER_WAIT);
        LUTOVERLOAD(REASON_SLEEP);
    }
};
/**
 * @brief 32-entry table translating the GFX-specific instruction-type
 *        encodings into the corresponding PCSAMPLE values.
 *
 * The table is indexed by the GFX encoding (GFX::TYPE_*) and stores the
 * matching PCSAMPLE::TYPE_* value. Entries with no explicit mapping are 0.
 * Relies on the LUTOVERLOAD macro being defined at the point of definition.
 *
 * @tparam GFX Architecture descriptor providing the TYPE_* encodings.
 */
template <typename GFX>
class GFX_INST_LUT : public std::array<int, 32>
{
public:
    GFX_INST_LUT()
    {
        // Clear every slot first so that unmapped encodings translate to 0.
        this->fill(0);

        // Mapping order is kept as-is: if two encodings ever share an index,
        // the later assignment is the one that sticks.
        LUTOVERLOAD(TYPE_VALU);
        LUTOVERLOAD(TYPE_MATRIX);
        LUTOVERLOAD(TYPE_SCALAR);
        LUTOVERLOAD(TYPE_TEX);
        LUTOVERLOAD(TYPE_LDS);
        LUTOVERLOAD(TYPE_LDS_DIRECT);
        LUTOVERLOAD(TYPE_FLAT);
        LUTOVERLOAD(TYPE_EXP);
        LUTOVERLOAD(TYPE_MESSAGE);
        LUTOVERLOAD(TYPE_BARRIER);
        LUTOVERLOAD(TYPE_BRANCH_NOT_TAKEN);
        LUTOVERLOAD(TYPE_BRANCH_TAKEN);
        LUTOVERLOAD(TYPE_JUMP);
        LUTOVERLOAD(TYPE_OTHER);
        LUTOVERLOAD(TYPE_NO_INST);
        LUTOVERLOAD(TYPE_DUAL_VALU);
    }
};
template <typename GFX>
inline int
translate_reason(int in)
{
static GFX_REASON_LUT<GFX> lut;
return lut[in & 0x1F];
}
template <typename GFX>
inline int
translate_inst(int in)
{
static GFX_INST_LUT<GFX> lut;
return lut[in & 0x1F];
}
#undef LUTOVERLOAD
template <bool HostTrap, typename GFX>
inline rocprofiler_pc_sampling_record_t
copySample(const void* sample)
{
if(HostTrap) return copyHostTrapSample(*(const perf_sample_host_trap_v1*) sample);
rocprofiler_pc_sampling_record_t ret =
copyStochasticSample<GFX>(*(const perf_sample_snapshot_v1*) sample);
ret.snapshot.inst_type = translate_inst<GFX>(ret.snapshot.inst_type);
ret.snapshot.arb_state_issue = translate_arb<GFX>(ret.snapshot.arb_state_issue);
ret.snapshot.arb_state_stall = translate_arb<GFX>(ret.snapshot.arb_state_stall);
ret.snapshot.reason_not_issued = translate_reason<GFX>(ret.snapshot.reason_not_issued);
const auto& sample_ = *static_cast<const perf_sample_host_trap_v1*>(sample);
auto ret = copySampleHeader<rocprofiler_pc_sampling_record_host_trap_v0_t>(sample_);
// TODO: decode other fields.
return ret;
}
/**
 * @brief Stochastic V0 sample for GFX11
 *
 * Currently only the fields filled in by copySampleHeader are populated;
 * the GFX11-specific stochastic decoding is still to be written.
 *
 * @param sample Untyped pointer that is reinterpreted as a
 *               perf_sample_snapshot_v1 (NOTE(review): presumably the caller
 *               guarantees it is non-null and of that type - confirm).
 * @return The partially decoded stochastic record.
 */
// TODO: implement stochastic for GFX11
template <>
inline rocprofiler_pc_sampling_record_stochastic_v0_t
copySample<GFX11, rocprofiler_pc_sampling_record_stochastic_v0_t>(const void* sample)
{
    const auto* snapshot = static_cast<const perf_sample_snapshot_v1*>(sample);
    auto        record =
        copySampleHeader<rocprofiler_pc_sampling_record_stochastic_v0_t>(*snapshot);
    // TODO: decode other fields
    // TODO: implement logic for manipulating stochastic related fields
    // record.wave_count = snapshot->perf_snapshot_data1 & 0x3F;
    return record;
}
@@ -202,7 +202,8 @@ test_fail_because_of_wrong_agent(const callback_data*
pcs_config->method,
pcs_config->unit,
pcs_config->min_interval,
cb_data->client_buffer),
cb_data->client_buffer,
0),
ROCPROFILER_STATUS_ERROR_AGENT_NOT_FOUND);
}
@@ -218,7 +219,8 @@ test_fail_because_of_wrong_context(const callback_data*
pcs_config->method,
pcs_config->unit,
pcs_config->min_interval,
cb_data->client_buffer),
cb_data->client_buffer,
0),
ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_FOUND);
}
@@ -234,7 +236,8 @@ test_fail_because_of_wrong_buffer(const callback_data*
pcs_config->method,
pcs_config->unit,
pcs_config->min_interval,
not_existing_buffer_id),
not_existing_buffer_id,
0),
ROCPROFILER_STATUS_ERROR_BUFFER_NOT_FOUND);
}
@@ -254,7 +257,8 @@ test_fail_because_of_unsupported_configuration(
pcs_config->method,
pcs_config->unit,
less_than_min_interval,
cb_data->client_buffer),
cb_data->client_buffer,
0),
ROCPROFILER_STATUS_SUCCESS);
EXPECT_NE(rocprofiler_configure_pc_sampling_service(cb_data->client_ctx,
@@ -262,7 +266,8 @@ test_fail_because_of_unsupported_configuration(
pcs_config->method,
pcs_config->unit,
greater_than_max_interval,
cb_data->client_buffer),
cb_data->client_buffer,
0),
ROCPROFILER_STATUS_SUCCESS);
EXPECT_NE(rocprofiler_configure_pc_sampling_service(cb_data->client_ctx,
@@ -270,7 +275,8 @@ test_fail_because_of_unsupported_configuration(
wrong_method,
pcs_config->unit,
pcs_config->max_interval,
cb_data->client_buffer),
cb_data->client_buffer,
0),
ROCPROFILER_STATUS_SUCCESS);
EXPECT_NE(rocprofiler_configure_pc_sampling_service(cb_data->client_ctx,
@@ -278,7 +284,8 @@ test_fail_because_of_unsupported_configuration(
pcs_config->method,
wrong_unit,
pcs_config->max_interval,
cb_data->client_buffer),
cb_data->client_buffer,
0),
ROCPROFILER_STATUS_SUCCESS);
}
@@ -293,7 +300,8 @@ test_fail_because_service_is_already_configured(
pcs_config->method,
pcs_config->unit,
pcs_config->min_interval,
cb_data->client_buffer),
cb_data->client_buffer,
0),
ROCPROFILER_STATUS_ERROR_SERVICE_ALREADY_CONFIGURED);
}
@@ -374,7 +382,8 @@ TEST(pc_sampling, rocprofiler_configure_pc_sampling_service)
pcs_config.method,
pcs_config.unit,
interval,
cb_data->client_buffer),
cb_data->client_buffer,
0),
"Failed to configure PC sampling service");
test_fail_because_service_is_already_configured(cb_data, agent_id, &pcs_config);
@@ -385,7 +394,8 @@ TEST(pc_sampling, rocprofiler_configure_pc_sampling_service)
pcs_config.method,
pcs_config.unit,
interval,
another_buff),
another_buff,
0),
ROCPROFILER_STATUS_ERROR);
}
@@ -452,6 +462,7 @@ TEST(pc_sampling, rocprofiler_configure_pc_sampling_service)
ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP,
ROCPROFILER_PC_SAMPLING_UNIT_TIME,
1,
cb_data.client_buffer),
cb_data.client_buffer,
0),
ROCPROFILER_STATUS_ERROR_CONFIGURATION_LOCKED);
}
@@ -292,7 +292,8 @@ pc_sampling_vs_counter_collection(cc_setup_fn_t cc_setup_fn)
pcs_config.method,
pcs_config.unit,
interval,
cb_data->client_buffer),
cb_data->client_buffer,
0),
ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT);
}
@@ -393,7 +394,8 @@ counter_collection_vs_pc_sampling(cc_setup_fn_t cc_setup_fn)
pcs_config.method,
pcs_config.unit,
interval,
cb_data->client_buffer),
cb_data->client_buffer,
0),
ROCPROFILER_STATUS_SUCCESS);
}
@@ -285,7 +285,8 @@ TEST(pc_sampling, query_configs_after_service_setup)
pcs_config.method,
pcs_config.unit,
interval,
cb_data->client_buffer),
cb_data->client_buffer,
0),
"Failed to configure PC sampling service");
// query configuration and expect to see `pcs_config->max_interval` as the `interval`
@@ -251,7 +251,8 @@ TEST(pc_sampling, processing_pc_samples)
pcs_config.method,
pcs_config.unit,
interval,
cb_data->client_buffer),
cb_data->client_buffer,
0),
"Failed to configure PC sampling service");
ROCPROFILER_CALL(rocprofiler_create_callback_thread(&cb_data->client_thread),
+14 -12
Voir le fichier
@@ -266,7 +266,8 @@ configure_pc_sampling_prefer_stochastic(tool_agent_info* agent_info,
picked_cfg->method,
picked_cfg->unit,
interval,
buffer_id);
buffer_id,
0);
if(status == ROCPROFILER_STATUS_SUCCESS)
{
*utils::get_output_stream()
@@ -334,24 +335,25 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/,
}
else if(cur_header->category == ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING)
{
if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_SAMPLE)
if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE)
{
auto* pc_sample =
static_cast<rocprofiler_pc_sampling_record_t*>(cur_header->payload);
auto* pc_sample = static_cast<rocprofiler_pc_sampling_record_host_trap_v0_t*>(
cur_header->payload);
ss << "(code_obj_id, offset): (" << pc_sample->pc.loaded_code_object_id
<< ", 0x" << std::hex << pc_sample->pc.loaded_code_object_offset << "), "
ss << "(code_obj_id, offset): (" << pc_sample->pc.code_object_id << ", 0x"
<< std::hex << pc_sample->pc.code_object_offset << "), "
<< "timestamp: " << std::dec << pc_sample->timestamp << ", "
<< "exec: " << std::hex << std::setw(16) << pc_sample->exec_mask << ", "
<< "workgroup_id_(x=" << std::dec << std::setw(5)
<< pc_sample->workgroup_id.x << ", "
<< "y=" << std::setw(5) << pc_sample->workgroup_id.y << ", "
<< "z=" << std::setw(5) << pc_sample->workgroup_id.z << "), "
<< "wave_id: " << std::setw(2)
<< static_cast<unsigned int>(pc_sample->wave_id) << ", "
<< "wave_in_group: " << std::setw(2)
<< static_cast<unsigned int>(pc_sample->wave_in_group) << ", "
<< "chiplet: " << std::setw(2)
<< static_cast<unsigned int>(pc_sample->chiplet) << ", "
<< "cu_id: " << pc_sample->hw_id << ", "
<< static_cast<unsigned int>(pc_sample->hw_id.chiplet)
<< ", "
// << "cu_id: " << pc_sample->hw_id << ", "
<< "correlation: {internal=" << std::setw(7)
<< pc_sample->correlation_id.internal << ", "
<< "external=" << std::setw(5) << pc_sample->correlation_id.external.value
@@ -373,8 +375,8 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/,
assert(corr_id.external.value > 0);
// Decoding the PC
auto inst = translator.get(pc_sample->pc.loaded_code_object_id,
pc_sample->pc.loaded_code_object_offset);
auto inst = translator.get(pc_sample->pc.code_object_id,
pc_sample->pc.code_object_offset);
flat_profile.add_sample(std::move(inst), pc_sample->exec_mask);
}
else