Host trap PC sampling uses new record type (#1207)
* Host trap PC sampling uses new record type
* removing redundant field
* formatting
* simplifying templates in the parser - no need for HostTrap boolean
* reviving some parser tests
* hw_id decoding on GFX9
* HW id parser test
* parser CID test
* Parser multigpu test
* removing rocprofiler_pc_sampling_record_t and some fields from hw_id
* simplifying parser context
* keep bench test internally
* initializing gfx9_hw_id_t differently
* anonymous struct first
* avoiding inlining initialization of struct
[ROCm/rocprofiler-sdk commit: bc52c17e64]
Cette révision appartient à :
@@ -245,7 +245,8 @@ configure_pc_sampling_prefer_stochastic(tool_agent_info* agent_info,
|
||||
picked_cfg->method,
|
||||
picked_cfg->unit,
|
||||
interval,
|
||||
buffer_id);
|
||||
buffer_id,
|
||||
0);
|
||||
if(status == ROCPROFILER_STATUS_SUCCESS)
|
||||
{
|
||||
*utils::get_output_stream()
|
||||
@@ -305,24 +306,25 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/,
|
||||
}
|
||||
else if(cur_header->category == ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING)
|
||||
{
|
||||
if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_SAMPLE)
|
||||
if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE)
|
||||
{
|
||||
auto* pc_sample =
|
||||
static_cast<rocprofiler_pc_sampling_record_t*>(cur_header->payload);
|
||||
auto* pc_sample = static_cast<rocprofiler_pc_sampling_record_host_trap_v0_t*>(
|
||||
cur_header->payload);
|
||||
|
||||
ss << "(code_obj_id, offset): (" << pc_sample->pc.loaded_code_object_id << ", 0x"
|
||||
<< std::hex << pc_sample->pc.loaded_code_object_offset << "), "
|
||||
ss << "(code_obj_id, offset): (" << pc_sample->pc.code_object_id << ", 0x"
|
||||
<< std::hex << pc_sample->pc.code_object_offset << "), "
|
||||
<< "timestamp: " << std::dec << pc_sample->timestamp << ", "
|
||||
<< "exec: " << std::hex << std::setw(16) << pc_sample->exec_mask << ", "
|
||||
<< "workgroup_id_(x=" << std::dec << std::setw(5) << pc_sample->workgroup_id.x
|
||||
<< ", "
|
||||
<< "y=" << std::setw(5) << pc_sample->workgroup_id.y << ", "
|
||||
<< "z=" << std::setw(5) << pc_sample->workgroup_id.z << "), "
|
||||
<< "wave_id: " << std::setw(2) << static_cast<unsigned int>(pc_sample->wave_id)
|
||||
<< "wave_in_group: " << std::setw(2)
|
||||
<< static_cast<unsigned int>(pc_sample->wave_in_group) << ", "
|
||||
<< "chiplet: " << std::setw(2)
|
||||
<< static_cast<unsigned int>(pc_sample->hw_id.chiplet)
|
||||
<< ", "
|
||||
<< "chiplet: " << std::setw(2) << static_cast<unsigned int>(pc_sample->chiplet)
|
||||
<< ", "
|
||||
<< "cu_id: " << pc_sample->hw_id << ", "
|
||||
// << "cu_id: " << pc_sample->hw_id << ", "
|
||||
<< "correlation: {internal=" << std::setw(7)
|
||||
<< pc_sample->correlation_id.internal << ", "
|
||||
<< "external=" << std::setw(5) << pc_sample->correlation_id.external.value << "}"
|
||||
|
||||
@@ -439,7 +439,8 @@ typedef enum
|
||||
typedef enum
|
||||
{
|
||||
ROCPROFILER_PC_SAMPLING_RECORD_NONE = 0,
|
||||
ROCPROFILER_PC_SAMPLING_RECORD_SAMPLE, ///< ::rocprofiler_pc_sampling_record_t
|
||||
ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE, ///< ::rocprofiler_pc_sampling_record_host_trap_v0_t
|
||||
ROCPROFILER_PC_SAMPLING_RECORD_STOCHASTIC_V0_SAMPLE, ///< for the future use
|
||||
ROCPROFILER_PC_SAMPLING_RECORD_LAST,
|
||||
} rocprofiler_pc_sampling_record_kind_t;
|
||||
|
||||
|
||||
@@ -99,6 +99,7 @@ ROCPROFILER_EXTERN_C_INIT
|
||||
* @param [in] unit - The unit appropriate to the PC sampling type/method.
|
||||
* @param [in] interval - frequency at which PC samples are generated
|
||||
* @param [in] buffer_id - id of the buffer used for delivering PC samples
|
||||
* @param [in] flags - for future use
|
||||
* @return ::rocprofiler_status_t
|
||||
* @retval ::ROCPROFILER_STATUS_SUCCESS PC sampling service configured successfully
|
||||
* @retval ::ROCPROFILER_STATUS_ERROR_NOT_AVAILABLE One of the scenarios is present:
|
||||
@@ -117,7 +118,8 @@ rocprofiler_configure_pc_sampling_service(rocprofiler_context_id_t conte
|
||||
rocprofiler_pc_sampling_method_t method,
|
||||
rocprofiler_pc_sampling_unit_t unit,
|
||||
uint64_t interval,
|
||||
rocprofiler_buffer_id_t buffer_id) ROCPROFILER_API;
|
||||
rocprofiler_buffer_id_t buffer_id,
|
||||
int flags) ROCPROFILER_API;
|
||||
|
||||
/**
|
||||
* @brief PC sampling configuration supported by a GPU agent.
|
||||
@@ -195,122 +197,69 @@ rocprofiler_query_pc_sampling_agent_configurations(
|
||||
void* user_data) ROCPROFILER_API ROCPROFILER_NONNULL(2, 3);
|
||||
|
||||
/**
|
||||
* @brief The header of the @ref rocprofiler_pc_sampling_record_t, indicating
|
||||
* what fields of the @ref rocprofiler_pc_sampling_record_t instance are meaningful
|
||||
* for the sample.
|
||||
* @brief Information about the GPU part where wave was executing
|
||||
* at the moment of sampling.
|
||||
*/
|
||||
typedef struct
|
||||
typedef struct rocprofiler_pc_sampling_hw_id_v0_t
|
||||
{
|
||||
uint8_t valid : 1; /// ::rocprofiler_pc_sampling_snapshot_v1_t field is valid
|
||||
uint8_t type : 4;
|
||||
uint8_t has_stall_reason : 1;
|
||||
uint8_t has_wave_cnt : 1;
|
||||
uint8_t reserved : 1; /// for future use
|
||||
|
||||
/// @var type
|
||||
/// @brief The following values are possible:
|
||||
/// - 0 - reserved
|
||||
/// - 1 - host trap pc sample
|
||||
/// - 2 - stochastic pc sample
|
||||
/// - 3 - perfcounter (unsupported at the moment)
|
||||
/// - other values does not mean anything at the moment
|
||||
/// @var has_stall_reason
|
||||
/// @brief whether the sample contains information about the stall reason.
|
||||
/// If so, please @see rocprofiler_pc_sampling_snapshot_v1_t.
|
||||
/// @var has_wave_cnt
|
||||
/// @brief whether the @ref rocprofiler_pc_sampling_record_t::wave_count
|
||||
/// contains meaningful value
|
||||
} rocprofiler_pc_sampling_header_v1_t;
|
||||
|
||||
/**
|
||||
* @brief For future use.
|
||||
*
|
||||
* @todo: Provide the description
|
||||
* @todo: Should we use bitfields because of C ABI portability?
|
||||
* @todo: Should we abstract this to be architecture agnostic?
|
||||
* @todo: Consider having a query to determine organization of this information.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint32_t dual_issue_valu : 1;
|
||||
uint32_t inst_type : 4;
|
||||
uint32_t reason_not_issued : 7;
|
||||
uint32_t arb_state_issue : 10;
|
||||
uint32_t arb_state_stall : 10;
|
||||
} rocprofiler_pc_sampling_snapshot_v1_t;
|
||||
uint64_t chiplet : 6; ///< chiplet index (3 bits allocated by the ROCr runtime)
|
||||
uint64_t wave_id : 7; ///< wave slot index
|
||||
uint64_t simd_id : 2; ///< SIMD index
|
||||
uint64_t pipe_id : 4; ///< pipe index
|
||||
uint64_t cu_or_wgp_id : 4; ///< Index of compute unit on GFX9 or workgroup processor on other
|
||||
///< architectures
|
||||
uint64_t shader_array_id : 1; ///< Shader array index
|
||||
uint64_t shader_engine_id : 5; ///< shader engine index
|
||||
uint64_t workgroup_id : 7; ///< thread_group index on GFX9, and workgroup index on GFX10+
|
||||
uint64_t vm_id : 6; ///< virtual memory ID
|
||||
uint64_t queue_id : 4; ///< queue id
|
||||
uint64_t microengine_id : 2; ///< ACE (microengine) index
|
||||
uint64_t reserved0 : 16; ///< Reserved for the future use
|
||||
} rocprofiler_pc_sampling_hw_id_v0_t;
|
||||
|
||||
/**
|
||||
* @brief Sampled program counter.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t loaded_code_object_id;
|
||||
uint64_t loaded_code_object_offset;
|
||||
uint64_t code_object_id;
|
||||
uint64_t code_object_offset;
|
||||
|
||||
/// @var loaded_code_object_id
|
||||
/// @var code_object_id
|
||||
/// @brief id of the loaded code object instance that contains sampled PC.
|
||||
/// This field holds the value ::ROCPROFILER_CODE_OBJECT_ID_NONE
|
||||
/// if the code object cannot be determined
|
||||
/// (e.g., sampled PC belongs to code generated by self modifying code).
|
||||
/// @var loaded_code_object_offset
|
||||
/// @brief If @ref loaded_code_object_id is different than ::ROCPROFILER_CODE_OBJECT_ID_NONE,
|
||||
/// @var code_object_offset
|
||||
/// @brief If @ref code_object_id is different than ::ROCPROFILER_CODE_OBJECT_ID_NONE,
|
||||
/// then this field contains the offset of the sampled PC relative to the
|
||||
/// ::rocprofiler_callback_tracing_code_object_load_data_t::load_base
|
||||
/// of the code object instance with @ref loaded_code_object_id.
|
||||
/// of the code object instance with @ref code_object_id.
|
||||
/// To calculate the original virtual address of the sampled PC, one can add the value
|
||||
/// of this field to the ::rocprofiler_callback_tracing_code_object_load_data_t::load_base.
|
||||
/// The value of @ref loaded_code_object_offset matches
|
||||
/// The value of @ref code_object_offset matches
|
||||
/// the virtual address of the sampled instruction (PC), only if the
|
||||
/// @ref loaded_code_object_id is equal to the ::ROCPROFILER_CODE_OBJECT_ID_NONE.
|
||||
/// @ref code_object_id is equal to the ::ROCPROFILER_CODE_OBJECT_ID_NONE.
|
||||
} rocprofiler_pc_t;
|
||||
|
||||
// TODO: The definition of this structure might change over time
|
||||
// to reduce the space needed to represent a single sample.
|
||||
// TODO: The definition of this struct might change over time.
|
||||
/**
|
||||
* @brief ROCProfiler PC Sampling Record corresponding to the interrupted wave.
|
||||
* @brief ROCProfiler Host-Trap PC Sampling Record.
|
||||
*/
|
||||
typedef struct
|
||||
typedef struct rocprofiler_pc_sampling_record_host_trap_v0_t
|
||||
{
|
||||
uint64_t size; ///< Size of this struct
|
||||
rocprofiler_pc_sampling_header_v1_t flags;
|
||||
uint8_t chiplet; ///< chiplet index
|
||||
uint8_t wave_id; ///< wave identifier within the workgroup
|
||||
uint8_t wave_issued : 1;
|
||||
uint8_t reserved : 7; ///< reserved 7 bits, must be zero
|
||||
uint32_t hw_id; ///< compute unit identifier
|
||||
rocprofiler_pc_t pc; ///< information about sampled program counter
|
||||
uint64_t exec_mask;
|
||||
rocprofiler_dim3_t workgroup_id; ///< wave coordinates within the workgroup
|
||||
uint32_t wave_count;
|
||||
uint64_t timestamp; ///< timestamp when sample is generated
|
||||
rocprofiler_correlation_id_t correlation_id;
|
||||
rocprofiler_pc_sampling_snapshot_v1_t
|
||||
snapshot; ///< @see ::rocprofiler_pc_sampling_snapshot_v1_t
|
||||
uint32_t reserved2; ///< for future use
|
||||
|
||||
/// @var flags
|
||||
/// @brief indicates what fields of this struct are meaningful for the represented sample.
|
||||
/// The values depend on what the underlying GPU agent architecture supports.
|
||||
/// @var wave_issued
|
||||
/// @brief indicates whether the wave is issueing the instruction represented by the @ref pc
|
||||
/// @var exec_mask
|
||||
/// @brief shows how many SIMD lanes of the wave were executing the instruction
|
||||
/// represented by the @ref pc. Useful to understand thread-divergance within the wave
|
||||
/// @var wave_count
|
||||
/// @brief number of active waves on the CU at the moment of sample generation
|
||||
/// @var correlation_id
|
||||
/// @brief correlation id of the API call that initiated a dispatch of the kernel
|
||||
/// during whose execution the wave was interrupted at @ref pc.
|
||||
} rocprofiler_pc_sampling_record_t;
|
||||
uint64_t size; ///< Size of this struct
|
||||
rocprofiler_pc_sampling_hw_id_v0_t hw_id; ///< @see ::rocprofiler_pc_sampling_hw_id_v0_t
|
||||
rocprofiler_pc_t pc; ///< information about sampled program counter
|
||||
uint64_t exec_mask; ///< active SIMD lanes when sampled
|
||||
uint64_t timestamp; ///< timestamp when sample is generated
|
||||
uint64_t dispatch_id; ///< originating kernel dispatch ID
|
||||
rocprofiler_correlation_id_t correlation_id; ///< API launch call id that matches dispatch ID
|
||||
rocprofiler_dim3_t workgroup_id; ///< wave coordinates within the workgroup
|
||||
uint32_t wave_in_group : 8; ///< wave position within the workgroup (0-31)
|
||||
uint32_t reserved0 : 24; ///< reserved for future use
|
||||
} rocprofiler_pc_sampling_record_host_trap_v0_t;
|
||||
|
||||
/** @} */
|
||||
|
||||
ROCPROFILER_EXTERN_C_FINI
|
||||
|
||||
ROCPROFILER_CXX_CODE(
|
||||
static_assert(sizeof(rocprofiler_pc_sampling_record_t) == 88,
|
||||
"Increasing the size of the pc sampling record is not permitted."));
|
||||
|
||||
ROCPROFILER_CXX_CODE(static_assert(offsetof(rocprofiler_pc_sampling_record_t, chiplet) == 9 &&
|
||||
offsetof(rocprofiler_pc_sampling_record_t, reserved2) == 84,
|
||||
"PC sampling record layout changed."));
|
||||
|
||||
@@ -63,7 +63,8 @@ rocprofiler_configure_pc_sampling_service(rocprofiler_context_id_t conte
|
||||
rocprofiler_pc_sampling_method_t method,
|
||||
rocprofiler_pc_sampling_unit_t unit,
|
||||
uint64_t interval,
|
||||
rocprofiler_buffer_id_t buffer_id)
|
||||
rocprofiler_buffer_id_t buffer_id,
|
||||
int /*flags*/)
|
||||
{
|
||||
if(!is_pc_sampling_explicitly_enabled()) return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED;
|
||||
|
||||
|
||||
+2
-2
@@ -1,7 +1,7 @@
|
||||
set(ROCPROFILER_LIB_PC_SAMPLING_PARSER_SOURCES pc_record_interface.cpp)
|
||||
set(ROCPROFILER_LIB_PC_SAMPLING_PARSER_HEADERS
|
||||
correlation.hpp gfx9.hpp gfx11.hpp parser_types.h pc_record_interface.hpp rocr.h
|
||||
translation.hpp)
|
||||
correlation.hpp gfx9.hpp gfx11.hpp parser_types.hpp pc_record_interface.hpp rocr.h
|
||||
stochastic_records.h translation.hpp)
|
||||
|
||||
target_sources(
|
||||
rocprofiler-sdk-object-library PRIVATE ${ROCPROFILER_LIB_PC_SAMPLING_PARSER_SOURCES}
|
||||
|
||||
+41
-36
@@ -205,13 +205,13 @@ private:
|
||||
|
||||
using address_range_t = rocprofiler::sdk::codeobj::segment::address_range_t;
|
||||
|
||||
template <bool bHostTrap, typename GFXIP>
|
||||
template <typename GFXIP, typename PcSamplingRecordT>
|
||||
inline pcsample_status_t
|
||||
add_upcoming_samples(const device_handle device,
|
||||
const generic_sample_t* buffer,
|
||||
const size_t available_samples,
|
||||
Parser::CorrelationMap* corr_map,
|
||||
rocprofiler_pc_sampling_record_t* samples)
|
||||
add_upcoming_samples(const device_handle device,
|
||||
const generic_sample_t* buffer,
|
||||
const size_t available_samples,
|
||||
Parser::CorrelationMap* corr_map,
|
||||
PcSamplingRecordT* samples)
|
||||
{
|
||||
pcsample_status_t status = PCSAMPLE_STATUS_SUCCESS;
|
||||
auto cache_addr_range = address_range_t{0, 0, ROCPROFILER_CODE_OBJECT_ID_NONE};
|
||||
@@ -226,15 +226,14 @@ add_upcoming_samples(const device_handle device,
|
||||
const auto* snap = reinterpret_cast<const perf_sample_snapshot_v1*>(buffer + p);
|
||||
|
||||
auto& pc_sample = samples[p];
|
||||
pc_sample = copySample<bHostTrap, GFXIP>((const void*) (buffer + p));
|
||||
pc_sample.size = sizeof(rocprofiler_pc_sampling_record_t);
|
||||
pc_sample = copySample<GFXIP, PcSamplingRecordT>((const void*) (buffer + p));
|
||||
|
||||
// Convert PC -> (loaded code object id containing PC, offset within code object)
|
||||
if(!cache_addr_range.inrange(snap->pc))
|
||||
cache_addr_range = table->find_codeobj_in_range(snap->pc);
|
||||
|
||||
pc_sample.pc.loaded_code_object_id = cache_addr_range.id;
|
||||
pc_sample.pc.loaded_code_object_offset = snap->pc - cache_addr_range.addr;
|
||||
pc_sample.pc.code_object_id = cache_addr_range.id;
|
||||
pc_sample.pc.code_object_offset = snap->pc - cache_addr_range.addr;
|
||||
|
||||
try
|
||||
{
|
||||
@@ -251,13 +250,13 @@ add_upcoming_samples(const device_handle device,
|
||||
return status;
|
||||
}
|
||||
|
||||
template <typename GFXIP>
|
||||
template <typename GFXIP, typename PcSamplingRecordT>
|
||||
inline pcsample_status_t
|
||||
_parse_buffer(generic_sample_t* buffer,
|
||||
uint64_t buffer_size,
|
||||
user_callback_t callback,
|
||||
void* userdata,
|
||||
Parser::CorrelationMap* corr_map)
|
||||
_parse_buffer(generic_sample_t* buffer,
|
||||
uint64_t buffer_size,
|
||||
user_callback_t<PcSamplingRecordT> callback,
|
||||
void* userdata,
|
||||
Parser::CorrelationMap* corr_map)
|
||||
{
|
||||
// Maximum size
|
||||
uint64_t index = 0;
|
||||
@@ -283,26 +282,31 @@ _parse_buffer(generic_sample_t* buffer,
|
||||
uint64_t pkt_counter = pkt.num_samples;
|
||||
if(index + pkt_counter > buffer_size) return PCSAMPLE_STATUS_OUT_OF_BOUNDS_ERROR;
|
||||
|
||||
bool bIsHostTrap = pkt.which_sample_type == AMD_HOST_TRAP_V1;
|
||||
// I don't think we need this.
|
||||
// bool bIsHostTrap = pkt.which_sample_type == AMD_HOST_TRAP_V1;
|
||||
|
||||
while(pkt_counter > 0)
|
||||
{
|
||||
rocprofiler_pc_sampling_record_t* samples = nullptr;
|
||||
PcSamplingRecordT* samples = nullptr;
|
||||
uint64_t available_samples = callback(&samples, pkt_counter, userdata);
|
||||
|
||||
if(available_samples == 0 || available_samples > pkt_counter)
|
||||
return PCSAMPLE_STATUS_CALLBACK_ERROR;
|
||||
|
||||
if(bIsHostTrap)
|
||||
{
|
||||
status |= add_upcoming_samples<true, GFXIP>(
|
||||
pkt.device, buffer + index, available_samples, corr_map, samples);
|
||||
}
|
||||
else
|
||||
{
|
||||
status |= add_upcoming_samples<false, GFXIP>(
|
||||
pkt.device, buffer + index, available_samples, corr_map, samples);
|
||||
}
|
||||
// I don't think we need if-else here
|
||||
// if(bIsHostTrap)
|
||||
// {
|
||||
// status |= add_upcoming_samples<GFXIP>(
|
||||
// pkt.device, buffer + index, available_samples, corr_map, samples);
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// status |= add_upcoming_samples<GFXIP>(
|
||||
// pkt.device, buffer + index, available_samples, corr_map, samples);
|
||||
// }
|
||||
|
||||
status |= add_upcoming_samples<GFXIP>(
|
||||
pkt.device, buffer + index, available_samples, corr_map, samples);
|
||||
|
||||
index += available_samples;
|
||||
pkt_counter -= available_samples;
|
||||
@@ -329,19 +333,20 @@ _parse_buffer(generic_sample_t* buffer,
|
||||
* a size smaller than requested, then it may be called again requesting more memory.
|
||||
* @param[in] userdata parameter forwarded to the user callback.
|
||||
*/
|
||||
pcsample_status_t inline parse_buffer(generic_sample_t* buffer,
|
||||
uint64_t buffer_size,
|
||||
int gfxip_major,
|
||||
user_callback_t callback,
|
||||
void* userdata)
|
||||
template <typename PcSamplingRecordT>
|
||||
pcsample_status_t inline parse_buffer(generic_sample_t* buffer,
|
||||
uint64_t buffer_size,
|
||||
int gfxip_major,
|
||||
user_callback_t<PcSamplingRecordT> callback,
|
||||
void* userdata)
|
||||
{
|
||||
static auto corr_map = std::make_unique<Parser::CorrelationMap>();
|
||||
|
||||
auto parseSample_func = _parse_buffer<GFX9>;
|
||||
auto parseSample_func = _parse_buffer<GFX9, PcSamplingRecordT>;
|
||||
if(gfxip_major == 9)
|
||||
parseSample_func = _parse_buffer<GFX9>;
|
||||
parseSample_func = _parse_buffer<GFX9, PcSamplingRecordT>;
|
||||
else if(gfxip_major == 11)
|
||||
parseSample_func = _parse_buffer<GFX11>;
|
||||
parseSample_func = _parse_buffer<GFX11, PcSamplingRecordT>;
|
||||
else
|
||||
return PCSAMPLE_STATUS_INVALID_GFXIP;
|
||||
|
||||
|
||||
+2
-7
@@ -81,13 +81,8 @@ enum pcsample_arb_issue_state
|
||||
};
|
||||
} // namespace PCSAMPLE
|
||||
|
||||
union pcsample_header_v1_t
|
||||
{
|
||||
rocprofiler_pc_sampling_header_v1_t flags;
|
||||
uint8_t raw;
|
||||
};
|
||||
|
||||
typedef uint64_t (*user_callback_t)(rocprofiler_pc_sampling_record_t**, uint64_t, void*);
|
||||
template <typename PcSamplingRecordT>
|
||||
using user_callback_t = uint64_t (*)(PcSamplingRecordT**, uint64_t, void*);
|
||||
|
||||
/**
|
||||
* The types of errors to be returned by parse_buffer.
|
||||
+63
-11
@@ -22,13 +22,31 @@
|
||||
|
||||
#include "lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.hpp"
|
||||
|
||||
template <>
|
||||
uint64_t
|
||||
PCSamplingParserContext::alloc(rocprofiler_pc_sampling_record_t** buffer, uint64_t size)
|
||||
PCSamplingParserContext::alloc<rocprofiler_pc_sampling_record_host_trap_v0_t>(
|
||||
rocprofiler_pc_sampling_record_host_trap_v0_t** buffer,
|
||||
uint64_t size)
|
||||
{
|
||||
std::unique_lock<std::shared_mutex> lock(mut);
|
||||
assert(buffer != nullptr);
|
||||
data.emplace_back(std::make_unique<PCSamplingData>(size));
|
||||
*buffer = data.back()->samples.data();
|
||||
host_trap_data.emplace_back(
|
||||
std::make_unique<PCSamplingData<rocprofiler_pc_sampling_record_host_trap_v0_t>>(size));
|
||||
*buffer = host_trap_data.back()->samples.data();
|
||||
return size;
|
||||
}
|
||||
|
||||
template <>
|
||||
uint64_t
|
||||
PCSamplingParserContext::alloc<rocprofiler_pc_sampling_record_stochastic_v0_t>(
|
||||
rocprofiler_pc_sampling_record_stochastic_v0_t** buffer,
|
||||
uint64_t size)
|
||||
{
|
||||
std::unique_lock<std::shared_mutex> lock(mut);
|
||||
assert(buffer != nullptr);
|
||||
stochastic_data.emplace_back(
|
||||
std::make_unique<PCSamplingData<rocprofiler_pc_sampling_record_stochastic_v0_t>>(size));
|
||||
*buffer = stochastic_data.back()->samples.data();
|
||||
return size;
|
||||
}
|
||||
|
||||
@@ -39,10 +57,21 @@ PCSamplingParserContext::parse(const upcoming_samples_t& upcoming,
|
||||
std::condition_variable& midway_signal,
|
||||
bool bRocrBufferFlip)
|
||||
{
|
||||
bool bIsHostTrap = upcoming.which_sample_type == AMD_HOST_TRAP_V1;
|
||||
|
||||
// Template instantiation is faster!
|
||||
auto parseSample_func = &PCSamplingParserContext::_parse<GFX9>;
|
||||
auto parseSample_func =
|
||||
bIsHostTrap
|
||||
? &PCSamplingParserContext::_parse<GFX9, rocprofiler_pc_sampling_record_host_trap_v0_t>
|
||||
: &PCSamplingParserContext::_parse<GFX9,
|
||||
rocprofiler_pc_sampling_record_stochastic_v0_t>;
|
||||
if(gfxip_major == 11)
|
||||
parseSample_func = &PCSamplingParserContext::_parse<GFX11>;
|
||||
parseSample_func =
|
||||
bIsHostTrap
|
||||
? &PCSamplingParserContext::_parse<GFX11,
|
||||
rocprofiler_pc_sampling_record_host_trap_v0_t>
|
||||
: &PCSamplingParserContext::_parse<GFX11,
|
||||
rocprofiler_pc_sampling_record_stochastic_v0_t>;
|
||||
else if(gfxip_major != 9)
|
||||
return PCSAMPLE_STATUS_INVALID_GFXIP;
|
||||
|
||||
@@ -98,11 +127,13 @@ PCSamplingParserContext::shouldFlipRocrBuffer(const dispatch_pkt_id_t& pkt) cons
|
||||
return corr_map->checkDispatch(pkt);
|
||||
}
|
||||
|
||||
template <typename PcSamplingRecordKindT>
|
||||
void
|
||||
PCSamplingParserContext::generate_upcoming_pc_record(
|
||||
uint64_t agent_id_handle,
|
||||
const rocprofiler_pc_sampling_record_t* samples,
|
||||
size_t num_samples)
|
||||
uint64_t agent_id_handle,
|
||||
const PcSamplingRecordKindT* samples,
|
||||
size_t num_samples,
|
||||
rocprofiler_pc_sampling_record_kind_t record_kind)
|
||||
{
|
||||
auto buff_id = _agent_buffers.at(rocprofiler_agent_id_t{agent_id_handle});
|
||||
rocprofiler::buffer::instance* buff = rocprofiler::buffer::get_buffer(buff_id);
|
||||
@@ -111,7 +142,28 @@ PCSamplingParserContext::generate_upcoming_pc_record(
|
||||
throw std::runtime_error(fmt::format("Buffer with id: {} does not exists", buff_id.handle));
|
||||
|
||||
for(size_t i = 0; i < num_samples; i++)
|
||||
buff->emplace(ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING,
|
||||
ROCPROFILER_PC_SAMPLING_RECORD_SAMPLE,
|
||||
samples[i]);
|
||||
buff->emplace(ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING, record_kind, samples[i]);
|
||||
}
|
||||
|
||||
template <>
|
||||
void
|
||||
PCSamplingParserContext::generate_upcoming_pc_record<rocprofiler_pc_sampling_record_host_trap_v0_t>(
|
||||
uint64_t agent_id_handle,
|
||||
const rocprofiler_pc_sampling_record_host_trap_v0_t* samples,
|
||||
size_t num_samples)
|
||||
{
|
||||
this->generate_upcoming_pc_record(
|
||||
agent_id_handle, samples, num_samples, ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE);
|
||||
}
|
||||
|
||||
template <>
|
||||
void
|
||||
PCSamplingParserContext::generate_upcoming_pc_record<
|
||||
rocprofiler_pc_sampling_record_stochastic_v0_t>(
|
||||
uint64_t agent_id_handle,
|
||||
const rocprofiler_pc_sampling_record_stochastic_v0_t* samples,
|
||||
size_t num_samples)
|
||||
{
|
||||
this->generate_upcoming_pc_record(
|
||||
agent_id_handle, samples, num_samples, ROCPROFILER_PC_SAMPLING_RECORD_STOCHASTIC_V0_SAMPLE);
|
||||
}
|
||||
|
||||
+31
-14
@@ -24,7 +24,8 @@
|
||||
|
||||
#include "lib/rocprofiler-sdk/buffer.hpp"
|
||||
#include "lib/rocprofiler-sdk/pc_sampling/parser/correlation.hpp"
|
||||
#include "lib/rocprofiler-sdk/pc_sampling/parser/parser_types.h"
|
||||
#include "lib/rocprofiler-sdk/pc_sampling/parser/parser_types.hpp"
|
||||
#include "lib/rocprofiler-sdk/pc_sampling/parser/stochastic_records.h"
|
||||
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/cxx/hash.hpp>
|
||||
@@ -41,13 +42,14 @@
|
||||
#include <thread>
|
||||
#include <unordered_set>
|
||||
|
||||
template <typename PcSamplingRecordT>
|
||||
struct PCSamplingData
|
||||
{
|
||||
PCSamplingData(size_t size)
|
||||
: samples(size){};
|
||||
PCSamplingData& operator=(PCSamplingData&) = delete;
|
||||
|
||||
std::vector<rocprofiler_pc_sampling_record_t> samples;
|
||||
std::vector<PcSamplingRecordT> samples;
|
||||
};
|
||||
|
||||
class PCSamplingParserContext
|
||||
@@ -55,13 +57,16 @@ class PCSamplingParserContext
|
||||
public:
|
||||
PCSamplingParserContext()
|
||||
: corr_map(std::make_unique<Parser::CorrelationMap>()){};
|
||||
|
||||
/**
|
||||
* @brief Allocates some memory. TODO: Translate to Jonathan's buffer implementation.
|
||||
* @brief Allocates some memory for samples.
|
||||
* TODO: Translate to Jonathan's buffer implementation.
|
||||
* @param[out] buffer Pointer where samples are to be written to.
|
||||
* @param[in] size Number of samples requested.
|
||||
* @returns Number of samples actually allocated on *buffer.
|
||||
*/
|
||||
uint64_t alloc(rocprofiler_pc_sampling_record_t** buffer, uint64_t size);
|
||||
template <typename PcSamplingRecordT>
|
||||
uint64_t alloc(PcSamplingRecordT** buffer, uint64_t size);
|
||||
|
||||
/**
|
||||
* @brief Parses a chunk of samples.
|
||||
@@ -127,7 +132,7 @@ protected:
|
||||
* @brief Parses the given input data and generates pc sampling records.
|
||||
* Calls generate_upcoming_pc_record().
|
||||
*/
|
||||
template <typename GFX>
|
||||
template <typename GFX, typename PcSamplingRecordT>
|
||||
pcsample_status_t _parse(const upcoming_samples_t& upcoming, const generic_sample_t* data_)
|
||||
{
|
||||
// std::shared_lock<std::shared_mutex> lock(mut);
|
||||
@@ -139,16 +144,16 @@ protected:
|
||||
|
||||
while(pkt_counter > 0)
|
||||
{
|
||||
rocprofiler_pc_sampling_record_t* samples = nullptr;
|
||||
uint64_t memsize = alloc(&samples, pkt_counter);
|
||||
PcSamplingRecordT* samples = nullptr;
|
||||
uint64_t memsize = alloc(&samples, pkt_counter);
|
||||
|
||||
if(memsize == 0 || memsize > pkt_counter) return PCSAMPLE_STATUS_CALLBACK_ERROR;
|
||||
|
||||
auto* map = corr_map.get();
|
||||
if(bIsHostTrap)
|
||||
status |= add_upcoming_samples<true, GFX>(dev, data_, memsize, map, samples);
|
||||
status |= add_upcoming_samples<GFX>(dev, data_, memsize, map, samples);
|
||||
else
|
||||
status |= add_upcoming_samples<false, GFX>(dev, data_, memsize, map, samples);
|
||||
status |= add_upcoming_samples<GFX>(dev, data_, memsize, map, samples);
|
||||
|
||||
data_ += memsize;
|
||||
pkt_counter -= memsize;
|
||||
@@ -164,14 +169,26 @@ protected:
|
||||
*/
|
||||
pcsample_status_t flushForgetList();
|
||||
static void generate_id_completion_record(const dispatch_pkt_id_t& pkt) { (void) pkt; };
|
||||
void generate_upcoming_pc_record(uint64_t agent_id_handle,
|
||||
const rocprofiler_pc_sampling_record_t* samples,
|
||||
size_t num_samples);
|
||||
|
||||
template <typename PcSamplingRecordT>
|
||||
void generate_upcoming_pc_record(uint64_t agent_id_handle,
|
||||
const PcSamplingRecordT* samples,
|
||||
size_t num_samples);
|
||||
|
||||
template <typename PcSamplingRecordT>
|
||||
void generate_upcoming_pc_record(uint64_t agent_id_handle,
|
||||
const PcSamplingRecordT* samples,
|
||||
size_t num_samples,
|
||||
rocprofiler_pc_sampling_record_kind_t record_kind);
|
||||
|
||||
//! Maps doorbells and dispatch_index to correlation_id
|
||||
std::unique_ptr<Parser::CorrelationMap> corr_map;
|
||||
//! Data allocated to store samples. Temporary.
|
||||
std::vector<std::unique_ptr<PCSamplingData>> data;
|
||||
//! Data allocated to store host trap and stochastic samples, respectively.
|
||||
//! Temporary solution until we figure out a smooth way to copy data directly to SDK's buffers.
|
||||
std::vector<std::unique_ptr<PCSamplingData<rocprofiler_pc_sampling_record_host_trap_v0_t>>>
|
||||
host_trap_data;
|
||||
std::vector<std::unique_ptr<PCSamplingData<rocprofiler_pc_sampling_record_stochastic_v0_t>>>
|
||||
stochastic_data;
|
||||
//! Dispatches not yet completed.
|
||||
// Uses only the internal correlation_id.
|
||||
std::unordered_map<uint64_t, dispatch_pkt_id_t> active_dispatches;
|
||||
|
||||
+180
@@ -0,0 +1,180 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/pc_sampling.h>
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/**
|
||||
* @brief The header of the @ref rocprofiler_pc_sampling_record_stochastic_v0_t, indicating
|
||||
* what fields of the @ref rocprofiler_pc_sampling_record_stochastic_v0_t instance are meaningful
|
||||
* for the sample.
|
||||
*/
|
||||
typedef struct rocprofiler_pc_sampling_record_stochastic_header_t
|
||||
{
|
||||
uint8_t valid : 1; ///< pc sample is valid
|
||||
uint8_t has_memory_counter : 1; ///< pc sample provides memory counters information
|
||||
///< via ::rocprofiler_pc_sampling_memory_counters_t
|
||||
uint8_t reserved_type : 6;
|
||||
} rocprofiler_pc_sampling_record_stochastic_header_t;
|
||||
|
||||
/**
|
||||
* @brief Enumeration describing sampled instruction type.
|
||||
*/
|
||||
typedef enum rocprofiler_pc_sampling_instruction_type_t
|
||||
{
|
||||
// Do we need ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_NONE=0? (we defined *_NONE in some other
|
||||
// enums ) If so, then parser needs to add offset +1 after determining the type
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_VALU = 0,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_MATRIX,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_SCALAR,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_TEX,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_LDS,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_LDS_DIRECT,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_FLAT,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_EXPORT,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_MESSAGE,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BARRIER,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BRANCH_NOT_TAKEN,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BRANCH_TAKEN,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_JUMP,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_OTHER,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_NO_INST,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_DUAL_VALU,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_LAST
|
||||
} rocprofiler_pc_sampling_instruction_type_t;
|
||||
|
||||
/**
|
||||
* @brief Enumeration describing reason for not issuing an instruction.
|
||||
*/
|
||||
typedef enum pcsample_reason_not_issued
|
||||
{
|
||||
// Do we need ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_NONE=0? (we defined *_NONE in some
|
||||
// other enums ) If so, then parser needs to add offset +1 after determining the reason.
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NOT_AVAILABLE = 0,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ALU,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_WAITCNT,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_INTERNAL,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_BARRIER,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_EX_STALL,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_OTHER_WAIT,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_SLEEP,
|
||||
ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_LAST
|
||||
} rocprofiler_pc_sampling_instruction_not_issued_reason_t;
|
||||
|
||||
/**
|
||||
* @brief Data provided by stochastic sampling hardware.
|
||||
*
|
||||
* The bit-fields below add up to exactly 32 bits (4 + 1 + 10 + 1 + 10 + 1 + 1 + 1 + 3).
*/
|
||||
typedef struct rocprofiler_pc_sampling_snapshot_v0_t
|
||||
{
|
||||
uint32_t
|
||||
reason_not_issued : 4; ///< The reason for not issuing an instruction.
|
||||
///< (the 9 defined reasons fit in 4 bits)
|
||||
///< The field takes one of the values defined in
|
||||
///< @ref ::rocprofiler_pc_sampling_instruction_not_issued_reason_t
|
||||
uint32_t reserved0 : 1; ///< reserved for future use
|
||||
uint32_t arb_state_issue_valu : 1; ///< arbiter issued a VALU instruction
|
||||
uint32_t arb_state_issue_matrix : 1; ///< arbiter issued a matrix instruction
|
||||
uint32_t arb_state_issue_lds : 1; ///< arbiter issued an LDS instruction
|
||||
uint32_t arb_state_issue_lds_direct : 1; ///< arbiter issued an LDS direct instruction
|
||||
uint32_t arb_state_issue_scalar : 1; ///< arbiter issued a scalar (SALU/SMEM) instruction
|
||||
uint32_t arb_state_issue_vmem_tex : 1; ///< arbiter issued a texture instruction
|
||||
uint32_t arb_state_issue_flat : 1; ///< arbiter issued a FLAT instruction
|
||||
uint32_t arb_state_issue_exp : 1; ///< arbiter issued an export instruction
|
||||
uint32_t arb_state_issue_misc : 1; ///< arbiter issued a miscellaneous instruction
|
||||
uint32_t arb_state_issue_brmsg : 1; ///< arbiter issued a branch/message instruction
|
||||
uint32_t arb_state_issue_reserved : 1; ///< reserved for future use
|
||||
// Replacing `uint32_t arb_state_stall : 10;`
|
||||
uint32_t arb_state_stall_valu : 1; ///< VALU instruction was stalled when the sample was generated
|
||||
uint32_t
|
||||
arb_state_stall_matrix : 1; ///< matrix instruction was stalled when the sample was generated
|
||||
uint32_t arb_state_stall_lds : 1; ///< LDS instruction was stalled when the sample was generated
|
||||
uint32_t arb_state_stall_lds_direct : 1; ///< LDS direct instruction was stalled when the sample
|
||||
///< was generated
|
||||
uint32_t arb_state_stall_scalar : 1; ///< Scalar (SALU/SMEM) instruction was stalled when
|
||||
///< the sample was generated
|
||||
uint32_t arb_state_stall_vmem_tex : 1; ///< texture instruction was stalled when the sample was
|
||||
///< generated
|
||||
uint32_t arb_state_stall_flat : 1; ///< flat instruction was stalled when the sample was generated
|
||||
uint32_t arb_state_stall_exp : 1; ///< export instruction was stalled when the sample was generated
|
||||
uint32_t arb_state_stall_misc : 1; ///< miscellaneous instruction was stalled when the sample was
|
||||
///< generated
|
||||
uint32_t arb_state_stall_brmsg : 1; ///< branch/message instruction was stalled when the sample was
|
||||
///< generated
|
||||
// NOTE(review): the name says `state_state`, while the sibling fields use `arb_state_stall_*`;
// possibly a typo for `arb_state_stall_reserved`, but renaming would change the public field name.
uint32_t arb_state_state_reserved : 1; ///< reserved for future use
|
||||
// We have two reserved bits
|
||||
uint32_t
|
||||
dual_issue_valu : 1; ///< two VALU instructions issued for coexecution (MI3xx specific)
|
||||
uint32_t reserved1 : 1; ///< reserved for future use
|
||||
uint32_t reserved2 : 3; ///< reserved for future use
|
||||
} rocprofiler_pc_sampling_snapshot_v0_t;
|
||||
|
||||
/**
|
||||
* @brief Counters of issued instructions.
|
||||
*
* Each counter is a bit-field; together they occupy exactly 32 bits (6 + 6 + 3 + 6 + 6 + 5).
*/
|
||||
typedef struct rocprofiler_pc_sampling_memory_counters_t
|
||||
{
|
||||
uint32_t load_cnt : 6; ///< Counts the number of VMEM load instructions issued but not yet
|
||||
///< completed.
|
||||
uint32_t store_cnt : 6; ///< Counts the number of VMEM store instructions issued but not yet
|
||||
///< completed.
|
||||
uint32_t
|
||||
bvh_cnt : 3; ///< Counts the number of VMEM BVH instructions issued but not yet completed.
|
||||
uint32_t sample_cnt : 6; ///< Counts the number of VMEM sample instructions issued but not yet
|
||||
///< completed.
|
||||
uint32_t ds_cnt : 6; ///< Counts the number of LDS instructions issued but not yet completed.
|
||||
uint32_t km_cnt : 5; ///< Counts the number of scalar memory reads and memory instructions
|
||||
///< issued but not yet completed.
|
||||
} rocprofiler_pc_sampling_memory_counters_t;
|
||||
|
||||
/**
|
||||
* @brief ROCProfiler Stochastic PC Sampling Record.
|
||||
*/
|
||||
typedef struct rocprofiler_pc_sampling_record_stochastic_v0_t
|
||||
{
|
||||
// TODO: use size to know whether memory counters exist or not
|
||||
uint64_t size; ///< Size of this struct
|
||||
rocprofiler_pc_sampling_record_stochastic_header_t
|
||||
flags; ///< defines what fields are relevant for the sample
|
||||
uint8_t wave_in_group; ///< wave position within the workgroup (0-15)
|
||||
uint8_t wave_issued : 1; ///< wave issued the instruction represented with the PC
|
||||
uint8_t inst_type : 5; ///< instruction type, takes a value defined in @ref
|
||||
///< ::rocprofiler_pc_sampling_instruction_type_t
|
||||
uint8_t reserved : 2; ///< reserved 2 bits must be zero
|
||||
rocprofiler_pc_sampling_hw_id_v0_t hw_id; ///< @see ::rocprofiler_pc_sampling_hw_id_v0_t
|
||||
rocprofiler_pc_t pc; ///< information about sampled program counter
|
||||
uint64_t exec_mask; ///< active SIMD lanes at the moment of sampling
|
||||
rocprofiler_dim3_t workgroup_id; ///< wave coordinates within the workgroup
|
||||
uint32_t wave_count; ///< active waves on the CU at the moment of sampling
|
||||
uint64_t timestamp; ///< timestamp when sample is generated
|
||||
uint64_t dispatch_id; ///< originating kernel dispatch ID
|
||||
rocprofiler_correlation_id_t correlation_id; ///< correlation ID attached to the sample
///< (exact semantics not documented here - TODO confirm)
|
||||
rocprofiler_pc_sampling_snapshot_v0_t
|
||||
snapshot; ///< @see ::rocprofiler_pc_sampling_snapshot_v0_t
|
||||
rocprofiler_pc_sampling_memory_counters_t
|
||||
memory_counters; ///< @see ::rocprofiler_pc_sampling_memory_counters_t
|
||||
} rocprofiler_pc_sampling_record_stochastic_v0_t;
|
||||
+29
-21
@@ -31,6 +31,7 @@
|
||||
* Benchmarks how fast the parser can process samples on a single threaded case
|
||||
* Current: 5600X with -Ofast, up to >140 million samples/s or ~9GB/s R/W (18GB/s bidirectional)
|
||||
*/
|
||||
template <typename PcSamplingRecordT>
|
||||
static bool
|
||||
Benchmark(bool bWarmup)
|
||||
{
|
||||
@@ -38,14 +39,16 @@ Benchmark(bool bWarmup)
|
||||
constexpr size_t DISP_PER_QUEUE = 8;
|
||||
constexpr size_t NUM_QUEUES = 4;
|
||||
|
||||
std::shared_ptr<MockRuntimeBuffer> buffer = std::make_shared<MockRuntimeBuffer>();
|
||||
std::array<std::vector<std::shared_ptr<MockDispatch>>, NUM_QUEUES> active_dispatches;
|
||||
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
|
||||
std::array<std::vector<std::shared_ptr<MockDispatch<PcSamplingRecordT>>>, NUM_QUEUES>
|
||||
active_dispatches;
|
||||
|
||||
for(size_t q = 0; q < NUM_QUEUES; q++)
|
||||
{
|
||||
std::shared_ptr<MockQueue> queue = std::make_shared<MockQueue>(DISP_PER_QUEUE * 2, buffer);
|
||||
auto queue = std::make_shared<MockQueue<PcSamplingRecordT>>(DISP_PER_QUEUE * 2, buffer);
|
||||
for(size_t d = 0; d < DISP_PER_QUEUE; d++)
|
||||
active_dispatches[q].push_back(std::make_shared<MockDispatch>(queue));
|
||||
active_dispatches[q].push_back(
|
||||
std::make_shared<MockDispatch<PcSamplingRecordT>>(queue));
|
||||
}
|
||||
|
||||
constexpr size_t TOTAL_NUM_SAMPLES = NUM_QUEUES * DISP_PER_QUEUE * SAMPLE_PER_DISPATCH;
|
||||
@@ -56,23 +59,24 @@ Benchmark(bool bWarmup)
|
||||
for(size_t i = 0; i < SAMPLE_PER_DISPATCH; i++)
|
||||
MockWave(dispatch).genPCSample();
|
||||
|
||||
std::pair<rocprofiler_pc_sampling_record_t*, size_t> userdata;
|
||||
userdata.first = new rocprofiler_pc_sampling_record_t[TOTAL_NUM_SAMPLES];
|
||||
std::pair<PcSamplingRecordT*, size_t> userdata;
|
||||
userdata.first = new PcSamplingRecordT[TOTAL_NUM_SAMPLES];
|
||||
userdata.second = TOTAL_NUM_SAMPLES;
|
||||
|
||||
auto t0 = std::chrono::system_clock::now();
|
||||
CHECK_PARSER(parse_buffer(
|
||||
(generic_sample_t*) buffer->packets.data(),
|
||||
buffer->packets.size(),
|
||||
GFXIP_MAJOR,
|
||||
[](rocprofiler_pc_sampling_record_t** sample, uint64_t size, void* userdata_) {
|
||||
auto* pair =
|
||||
reinterpret_cast<std::pair<rocprofiler_pc_sampling_record_t*, size_t>*>(userdata_);
|
||||
user_callback_t<PcSamplingRecordT> user_cb =
|
||||
[](PcSamplingRecordT** sample, uint64_t size, void* userdata_) {
|
||||
auto* pair = reinterpret_cast<std::pair<PcSamplingRecordT*, size_t>*>(userdata_);
|
||||
assert(TOTAL_NUM_SAMPLES == pair->second);
|
||||
*sample = pair->first;
|
||||
return size;
|
||||
},
|
||||
&userdata));
|
||||
};
|
||||
|
||||
auto t0 = std::chrono::system_clock::now();
|
||||
CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(),
|
||||
buffer->packets.size(),
|
||||
GFXIP_MAJOR,
|
||||
user_cb,
|
||||
&userdata));
|
||||
auto t1 = std::chrono::system_clock::now();
|
||||
float samples_per_us = float(TOTAL_NUM_SAMPLES) / (t1 - t0).count() * 1E3f;
|
||||
|
||||
@@ -80,8 +84,7 @@ Benchmark(bool bWarmup)
|
||||
{
|
||||
std::cout << "Benchmark: Parsed " << int(samples_per_us * 1E3f + 0.5f) * 1E-3f
|
||||
<< " Msample/s (";
|
||||
std::cout << int(sizeof(rocprofiler_pc_sampling_record_t) * samples_per_us) << " MB/s)"
|
||||
<< std::endl;
|
||||
std::cout << int(sizeof(PcSamplingRecordT) * samples_per_us) << " MB/s)" << std::endl;
|
||||
}
|
||||
|
||||
delete[] userdata.first;
|
||||
@@ -90,7 +93,12 @@ Benchmark(bool bWarmup)
|
||||
|
||||
TEST(pcs_parser, benchmark_test)
|
||||
{
|
||||
EXPECT_EQ(Benchmark(true), true);
|
||||
EXPECT_EQ(Benchmark(false), true);
|
||||
EXPECT_EQ(Benchmark(false), true);
|
||||
// Tests for host trap v0 records
|
||||
EXPECT_EQ(Benchmark<rocprofiler_pc_sampling_record_host_trap_v0_t>(true), true);
|
||||
EXPECT_EQ(Benchmark<rocprofiler_pc_sampling_record_host_trap_v0_t>(false), true);
|
||||
EXPECT_EQ(Benchmark<rocprofiler_pc_sampling_record_host_trap_v0_t>(false), true);
|
||||
// tests for stochastic v0 records
|
||||
EXPECT_EQ(Benchmark<rocprofiler_pc_sampling_record_stochastic_v0_t>(true), true);
|
||||
EXPECT_EQ(Benchmark<rocprofiler_pc_sampling_record_stochastic_v0_t>(false), true);
|
||||
EXPECT_EQ(Benchmark<rocprofiler_pc_sampling_record_stochastic_v0_t>(false), true);
|
||||
}
|
||||
|
||||
+141
-89
@@ -33,15 +33,15 @@ std::mt19937 rdgen(1);
|
||||
/**
|
||||
* Sample user memory allocation callback.
|
||||
* It expects userdata to be cast-able to a pointer to
|
||||
* std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>>
|
||||
* std::vector<std::pair<PcSamplingRecordT*, uint64_t>>
|
||||
*/
|
||||
template <typename PcSamplingRecordT>
|
||||
static uint64_t
|
||||
alloc_callback(rocprofiler_pc_sampling_record_t** buffer, uint64_t size, void* userdata)
|
||||
alloc_callback(PcSamplingRecordT** buffer, uint64_t size, void* userdata)
|
||||
{
|
||||
*buffer = new rocprofiler_pc_sampling_record_t[size];
|
||||
*buffer = new PcSamplingRecordT[size];
|
||||
auto& vector =
|
||||
*reinterpret_cast<std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>>*>(
|
||||
userdata);
|
||||
*reinterpret_cast<std::vector<std::pair<PcSamplingRecordT*, uint64_t>>*>(userdata);
|
||||
vector.push_back({*buffer, size});
|
||||
return size;
|
||||
}
|
||||
@@ -50,35 +50,34 @@ alloc_callback(rocprofiler_pc_sampling_record_t** buffer, uint64_t size, void* u
|
||||
* Uses the MockWave dispatch's unique_id stored in the pc field to verify
|
||||
* the reconstructed correlation_id.
|
||||
*/
|
||||
template <typename PcSamplingRecordT>
|
||||
static bool
|
||||
check_samples(rocprofiler_pc_sampling_record_t* samples, uint64_t size)
|
||||
check_samples(PcSamplingRecordT* samples, uint64_t size)
|
||||
{
|
||||
// TODO: replace with (code_obj_id, pc)
|
||||
for(size_t i = 0; i < size; i++)
|
||||
if(samples[i].correlation_id.internal != samples[i].pc.loaded_code_object_offset)
|
||||
return false;
|
||||
if(samples[i].correlation_id.internal != samples[i].pc.code_object_offset) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Simplest mock classes use, generates a single queue+dispatch with 2 PC samples.
|
||||
*/
|
||||
TEST(pcs_parser, hello_world)
|
||||
template <typename PcSamplingRecordT>
|
||||
void
|
||||
pcs_parser_hello_world()
|
||||
{
|
||||
std::shared_ptr<MockRuntimeBuffer> buffer = std::make_shared<MockRuntimeBuffer>();
|
||||
std::shared_ptr<MockQueue> queue = std::make_shared<MockQueue>(16, buffer);
|
||||
std::shared_ptr<MockDispatch> dispatch = std::make_shared<MockDispatch>(queue);
|
||||
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
|
||||
auto queue = std::make_shared<MockQueue<PcSamplingRecordT>>(16, buffer);
|
||||
auto dispatch = std::make_shared<MockDispatch<PcSamplingRecordT>>(queue);
|
||||
|
||||
buffer->genUpcomingSamples(2);
|
||||
MockWave(dispatch).genPCSample();
|
||||
MockWave(dispatch).genPCSample();
|
||||
|
||||
std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>> all_allocations;
|
||||
std::vector<std::pair<PcSamplingRecordT*, uint64_t>> all_allocations;
|
||||
|
||||
CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(),
|
||||
buffer->packets.size(),
|
||||
GFXIP_MAJOR,
|
||||
alloc_callback,
|
||||
alloc_callback<PcSamplingRecordT>,
|
||||
(void*) &all_allocations));
|
||||
|
||||
EXPECT_EQ(all_allocations.size(), 1); // HelloWorld: Incorrect number of callbacks
|
||||
@@ -91,23 +90,34 @@ TEST(pcs_parser, hello_world)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Simplest mock classes use, generates a single queue+dispatch with 2 PC samples.
|
||||
*/
|
||||
TEST(pcs_parser, hello_world)
|
||||
{
|
||||
pcs_parser_hello_world<rocprofiler_pc_sampling_record_host_trap_v0_t>();
|
||||
pcs_parser_hello_world<rocprofiler_pc_sampling_record_stochastic_v0_t>();
|
||||
}
|
||||
|
||||
/**
|
||||
* A little more complicated.
|
||||
* Generates a few dispatches for 2 different queues and samples in forward and reverse order.
|
||||
* Checks if the reconstructed correlation_id is correct.
|
||||
*/
|
||||
TEST(pcs_parser, reverse_wave_order)
|
||||
template <typename PcSamplingRecordT>
|
||||
void
|
||||
pcs_parser_reverse_wave_order()
|
||||
{
|
||||
std::shared_ptr<MockRuntimeBuffer> buffer = std::make_shared<MockRuntimeBuffer>();
|
||||
std::shared_ptr<MockQueue> queue1 = std::make_shared<MockQueue>(16, buffer);
|
||||
std::shared_ptr<MockQueue> queue2 = std::make_shared<MockQueue>(16, buffer);
|
||||
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
|
||||
auto queue1 = std::make_shared<MockQueue<PcSamplingRecordT>>(16, buffer);
|
||||
auto queue2 = std::make_shared<MockQueue<PcSamplingRecordT>>(16, buffer);
|
||||
|
||||
std::vector<std::shared_ptr<MockDispatch>> dispatches;
|
||||
dispatches.push_back(std::make_shared<MockDispatch>(queue1));
|
||||
dispatches.push_back(std::make_shared<MockDispatch>(queue1));
|
||||
dispatches.push_back(std::make_shared<MockDispatch>(queue2));
|
||||
dispatches.push_back(std::make_shared<MockDispatch>(queue2));
|
||||
dispatches.push_back(std::make_shared<MockDispatch>(queue1));
|
||||
std::vector<std::shared_ptr<MockDispatch<PcSamplingRecordT>>> dispatches;
|
||||
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue1));
|
||||
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue1));
|
||||
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue2));
|
||||
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue2));
|
||||
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue1));
|
||||
|
||||
buffer->genUpcomingSamples(dispatches.size());
|
||||
for(auto it = dispatches.rbegin(); it != dispatches.rend(); it++)
|
||||
@@ -116,12 +126,12 @@ TEST(pcs_parser, reverse_wave_order)
|
||||
for(auto it = dispatches.begin(); it != dispatches.end(); it++)
|
||||
MockWave(*it).genPCSample();
|
||||
|
||||
std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>> all_allocations;
|
||||
std::vector<std::pair<PcSamplingRecordT*, uint64_t>> all_allocations;
|
||||
|
||||
CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(),
|
||||
buffer->packets.size(),
|
||||
GFXIP_MAJOR,
|
||||
alloc_callback,
|
||||
alloc_callback<PcSamplingRecordT>,
|
||||
(void*) &all_allocations));
|
||||
|
||||
EXPECT_EQ(all_allocations.size(), 2); // ReverseWaveOrder test: Incorrect number of callbacks
|
||||
@@ -135,29 +145,33 @@ TEST(pcs_parser, reverse_wave_order)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a small queue and causes the dispatch_ids to wrap around a few times, and generates
|
||||
* a single sample per dispatch. Checks the parser is properly handling the wrapping of queues.
|
||||
*/
|
||||
TEST(pcs_parser, dispatch_wrapping)
|
||||
TEST(pcs_parser, reverse_wave_order)
|
||||
{
|
||||
const int num_samples = 32;
|
||||
std::shared_ptr<MockRuntimeBuffer> buffer = std::make_shared<MockRuntimeBuffer>();
|
||||
std::shared_ptr<MockQueue> queue = std::make_shared<MockQueue>(5, buffer);
|
||||
pcs_parser_reverse_wave_order<rocprofiler_pc_sampling_record_host_trap_v0_t>();
|
||||
pcs_parser_reverse_wave_order<rocprofiler_pc_sampling_record_stochastic_v0_t>();
|
||||
}
|
||||
|
||||
template <typename PcSamplingRecordT>
|
||||
void
|
||||
pcs_parser_dispatch_wrapping()
|
||||
{
|
||||
const int num_samples = 32;
|
||||
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
|
||||
auto queue = std::make_shared<MockQueue<PcSamplingRecordT>>(5, buffer);
|
||||
|
||||
for(int i = 0; i < num_samples; i++)
|
||||
{
|
||||
auto dispatch = std::make_shared<MockDispatch>(queue);
|
||||
auto dispatch = std::make_shared<MockDispatch<PcSamplingRecordT>>(queue);
|
||||
buffer->genUpcomingSamples(1);
|
||||
MockWave(dispatch).genPCSample();
|
||||
}
|
||||
|
||||
std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>> all_allocations;
|
||||
std::vector<std::pair<PcSamplingRecordT*, uint64_t>> all_allocations;
|
||||
|
||||
CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(),
|
||||
buffer->packets.size(),
|
||||
GFXIP_MAJOR,
|
||||
alloc_callback,
|
||||
alloc_callback<PcSamplingRecordT>,
|
||||
(void*) &all_allocations));
|
||||
|
||||
EXPECT_EQ(all_allocations.size(),
|
||||
@@ -172,39 +186,47 @@ TEST(pcs_parser, dispatch_wrapping)
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a few queues with a few dispatches per queue.
|
||||
* Adds random samples per dispatch, and checks the result.
|
||||
* Creates a small queue and causes the dispatch_ids to wrap around a few times, and generates
|
||||
* a single sample per dispatch. Checks the parser is properly handling the wrapping of queues.
|
||||
*/
|
||||
TEST(pcs_parser, random_samples)
|
||||
TEST(pcs_parser, dispatch_wrapping)
|
||||
{
|
||||
const int num_samples = 1024;
|
||||
std::shared_ptr<MockRuntimeBuffer> buffer = std::make_shared<MockRuntimeBuffer>();
|
||||
std::shared_ptr<MockQueue> queue1 = std::make_shared<MockQueue>(16, buffer);
|
||||
std::shared_ptr<MockQueue> queue2 = std::make_shared<MockQueue>(16, buffer);
|
||||
std::shared_ptr<MockQueue> queue3 = std::make_shared<MockQueue>(16, buffer);
|
||||
std::shared_ptr<MockQueue> queue4 = std::make_shared<MockQueue>(16, buffer);
|
||||
pcs_parser_dispatch_wrapping<rocprofiler_pc_sampling_record_host_trap_v0_t>();
|
||||
pcs_parser_dispatch_wrapping<rocprofiler_pc_sampling_record_stochastic_v0_t>();
|
||||
}
|
||||
|
||||
std::vector<std::shared_ptr<MockDispatch>> dispatches;
|
||||
dispatches.push_back(std::make_shared<MockDispatch>(queue1));
|
||||
dispatches.push_back(std::make_shared<MockDispatch>(queue1));
|
||||
dispatches.push_back(std::make_shared<MockDispatch>(queue2));
|
||||
dispatches.push_back(std::make_shared<MockDispatch>(queue3));
|
||||
dispatches.push_back(std::make_shared<MockDispatch>(queue1));
|
||||
dispatches.push_back(std::make_shared<MockDispatch>(queue3));
|
||||
dispatches.push_back(std::make_shared<MockDispatch>(queue3));
|
||||
dispatches.push_back(std::make_shared<MockDispatch>(queue2));
|
||||
dispatches.push_back(std::make_shared<MockDispatch>(queue1));
|
||||
template <typename PcSamplingRecordT>
|
||||
void
|
||||
pcs_parser_random_samples()
|
||||
{
|
||||
const int num_samples = 1024;
|
||||
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
|
||||
auto queue1 = std::make_shared<MockQueue<PcSamplingRecordT>>(16, buffer);
|
||||
auto queue2 = std::make_shared<MockQueue<PcSamplingRecordT>>(16, buffer);
|
||||
auto queue3 = std::make_shared<MockQueue<PcSamplingRecordT>>(16, buffer);
|
||||
auto queue4 = std::make_shared<MockQueue<PcSamplingRecordT>>(16, buffer);
|
||||
|
||||
std::vector<std::shared_ptr<MockDispatch<PcSamplingRecordT>>> dispatches;
|
||||
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue1));
|
||||
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue1));
|
||||
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue2));
|
||||
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue3));
|
||||
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue1));
|
||||
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue3));
|
||||
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue3));
|
||||
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue2));
|
||||
dispatches.push_back(std::make_shared<MockDispatch<PcSamplingRecordT>>(queue1));
|
||||
|
||||
buffer->genUpcomingSamples(num_samples);
|
||||
for(int i = 0; i < num_samples; i++)
|
||||
MockWave(dispatches[rdgen() % dispatches.size()]).genPCSample();
|
||||
|
||||
std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>> all_allocations;
|
||||
std::vector<std::pair<PcSamplingRecordT*, uint64_t>> all_allocations;
|
||||
|
||||
CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(),
|
||||
buffer->packets.size(),
|
||||
GFXIP_MAJOR,
|
||||
alloc_callback,
|
||||
alloc_callback<PcSamplingRecordT>,
|
||||
(void*) &all_allocations));
|
||||
|
||||
EXPECT_EQ(all_allocations.size(), 1); // RandomSamples test: Incorrect number of callbacks
|
||||
@@ -218,21 +240,29 @@ TEST(pcs_parser, random_samples)
|
||||
}
|
||||
|
||||
/**
|
||||
* Hammers the parser by creating and destroying queues at random, adding dispatches at random
|
||||
* and generating PC samples at random. By default we use all 4 unique doorbells,
|
||||
* queue size is 16 and we generate 10k samples dispatch.
|
||||
* Creates a few queues with a few dispatches per queue.
|
||||
* Adds random samples per dispatch, and checks the result.
|
||||
*/
|
||||
TEST(pcs_parser, queue_hammer)
|
||||
TEST(pcs_parser, random_samples)
|
||||
{
|
||||
pcs_parser_random_samples<rocprofiler_pc_sampling_record_host_trap_v0_t>();
|
||||
pcs_parser_random_samples<rocprofiler_pc_sampling_record_stochastic_v0_t>();
|
||||
}
|
||||
|
||||
template <typename PcSamplingRecordT>
|
||||
void
|
||||
pcs_parser_queue_hammer()
|
||||
{
|
||||
constexpr int NUM_ACTIONS = 10000;
|
||||
constexpr int QSIZE = 16;
|
||||
constexpr int NUM_QUEUES = MockDoorBell::num_unique_bells;
|
||||
constexpr int ACTION_MAX = QSIZE * NUM_QUEUES / 2;
|
||||
|
||||
std::shared_ptr<MockRuntimeBuffer> buffer = std::make_shared<MockRuntimeBuffer>();
|
||||
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
|
||||
|
||||
std::array<std::shared_ptr<MockQueue>, NUM_QUEUES> queues;
|
||||
std::array<std::vector<std::shared_ptr<MockDispatch>>, NUM_QUEUES> active_dispatches;
|
||||
std::array<std::shared_ptr<MockQueue<PcSamplingRecordT>>, NUM_QUEUES> queues;
|
||||
std::array<std::vector<std::shared_ptr<MockDispatch<PcSamplingRecordT>>>, NUM_QUEUES>
|
||||
active_dispatches;
|
||||
|
||||
int num_reset_queues = 0;
|
||||
int num_samples_generated = 0;
|
||||
@@ -241,9 +271,10 @@ TEST(pcs_parser, queue_hammer)
|
||||
size_t max_q_occupancy = 0;
|
||||
|
||||
for(int i = 0; i < NUM_QUEUES; i++)
|
||||
queues[i] = std::make_shared<MockQueue>(QSIZE, buffer);
|
||||
queues[i] = std::make_shared<MockQueue<PcSamplingRecordT>>(QSIZE, buffer);
|
||||
for(int i = 0; i < NUM_QUEUES; i++)
|
||||
active_dispatches[i].push_back(std::make_shared<MockDispatch>(queues[i]));
|
||||
active_dispatches[i].push_back(
|
||||
std::make_shared<MockDispatch<PcSamplingRecordT>>(queues[i]));
|
||||
|
||||
for(int i = 0; i < NUM_ACTIONS; i++)
|
||||
{
|
||||
@@ -254,7 +285,7 @@ TEST(pcs_parser, queue_hammer)
|
||||
// Delete queue and create new one
|
||||
active_dispatches[q] = {};
|
||||
queues[q].reset();
|
||||
queues[q] = std::make_shared<MockQueue>(QSIZE, buffer);
|
||||
queues[q] = std::make_shared<MockQueue<PcSamplingRecordT>>(QSIZE, buffer);
|
||||
num_reset_queues++;
|
||||
}
|
||||
else if(action > ACTION_MAX / 2 && active_dispatches[q].size() > 1)
|
||||
@@ -267,7 +298,8 @@ TEST(pcs_parser, queue_hammer)
|
||||
// Add new dispatch
|
||||
if(active_dispatches[q].size() < QSIZE)
|
||||
{
|
||||
active_dispatches[q].push_back(std::make_shared<MockDispatch>(queues[q]));
|
||||
active_dispatches[q].push_back(
|
||||
std::make_shared<MockDispatch<PcSamplingRecordT>>(queues[q]));
|
||||
num_dispatches_generated += 1;
|
||||
}
|
||||
|
||||
@@ -276,7 +308,8 @@ TEST(pcs_parser, queue_hammer)
|
||||
for(auto& queue : active_dispatches)
|
||||
{
|
||||
EXPECT_NE(queue.size(), 0);
|
||||
std::shared_ptr<MockDispatch> rand_dispatch = queue[rdgen() % queue.size()];
|
||||
std::shared_ptr<MockDispatch<PcSamplingRecordT>> rand_dispatch =
|
||||
queue[rdgen() % queue.size()];
|
||||
MockWave(rand_dispatch).genPCSample();
|
||||
num_samples_generated += 1;
|
||||
avg_q_occupancy += queue.size();
|
||||
@@ -292,20 +325,20 @@ TEST(pcs_parser, queue_hammer)
|
||||
<< std::endl;
|
||||
std::cout << "Max queue occupancy: " << max_q_occupancy << "\n\n" << std::endl;
|
||||
|
||||
std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>> all_allocations;
|
||||
std::vector<std::pair<PcSamplingRecordT*, uint64_t>> all_allocations;
|
||||
|
||||
CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(),
|
||||
buffer->packets.size(),
|
||||
GFXIP_MAJOR,
|
||||
alloc_callback,
|
||||
alloc_callback<PcSamplingRecordT>,
|
||||
(void*) &all_allocations));
|
||||
|
||||
EXPECT_EQ(all_allocations.size(),
|
||||
NUM_ACTIONS); // QueueHammer test: Incorrect number of callbacks
|
||||
for(auto sb = 0ul; sb < all_allocations.size(); sb++)
|
||||
{
|
||||
rocprofiler_pc_sampling_record_t* samples = all_allocations[sb].first;
|
||||
size_t num_samples = all_allocations[sb].second;
|
||||
PcSamplingRecordT* samples = all_allocations[sb].first;
|
||||
size_t num_samples = all_allocations[sb].second;
|
||||
|
||||
EXPECT_EQ(num_samples, NUM_QUEUES); // QueueHammer: Incorrect number of samples
|
||||
EXPECT_EQ(check_samples(samples, num_samples),
|
||||
@@ -314,12 +347,25 @@ TEST(pcs_parser, queue_hammer)
|
||||
}
|
||||
}
|
||||
|
||||
TEST(pcs_parser, multi_buffer)
|
||||
/**
|
||||
* Hammers the parser by creating and destroying queues at random, adding dispatches at random
|
||||
* and generating PC samples at random. By default we use all 4 unique doorbells,
|
||||
* queue size is 16 and we generate 10k samples dispatch.
|
||||
*/
|
||||
TEST(pcs_parser, queue_hammer)
|
||||
{
|
||||
std::shared_ptr<MockRuntimeBuffer> firstBuffer = std::make_shared<MockRuntimeBuffer>();
|
||||
std::shared_ptr<MockQueue> queue = std::make_shared<MockQueue>(16, firstBuffer);
|
||||
std::shared_ptr<MockDispatch> dispatch1 = std::make_shared<MockDispatch>(queue);
|
||||
std::shared_ptr<MockDispatch> dispatch2 = std::make_shared<MockDispatch>(queue);
|
||||
pcs_parser_queue_hammer<rocprofiler_pc_sampling_record_host_trap_v0_t>();
|
||||
pcs_parser_queue_hammer<rocprofiler_pc_sampling_record_stochastic_v0_t>();
|
||||
}
|
||||
|
||||
template <typename PcSamplingRecordT>
|
||||
void
|
||||
pcs_parser_multi_buffer()
|
||||
{
|
||||
auto firstBuffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
|
||||
auto queue = std::make_shared<MockQueue<PcSamplingRecordT>>(16, firstBuffer);
|
||||
auto dispatch1 = std::make_shared<MockDispatch<PcSamplingRecordT>>(queue);
|
||||
auto dispatch2 = std::make_shared<MockDispatch<PcSamplingRecordT>>(queue);
|
||||
|
||||
firstBuffer->genUpcomingSamples(4);
|
||||
MockWave(dispatch1).genPCSample();
|
||||
@@ -327,21 +373,21 @@ TEST(pcs_parser, multi_buffer)
|
||||
MockWave(dispatch1).genPCSample();
|
||||
MockWave(dispatch2).genPCSample();
|
||||
|
||||
std::shared_ptr<MockRuntimeBuffer> secondBuffer = std::make_shared<MockRuntimeBuffer>();
|
||||
const auto& packets = firstBuffer->packets;
|
||||
secondBuffer->packets = std::vector<packet_union_t>(packets.begin() + 2, packets.end());
|
||||
auto secondBuffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
|
||||
const auto& packets = firstBuffer->packets;
|
||||
secondBuffer->packets = std::vector<packet_union_t>(packets.begin() + 2, packets.end());
|
||||
|
||||
std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>> all_allocations;
|
||||
std::vector<std::pair<PcSamplingRecordT*, uint64_t>> all_allocations;
|
||||
|
||||
CHECK_PARSER(parse_buffer((generic_sample_t*) firstBuffer->packets.data(),
|
||||
firstBuffer->packets.size(),
|
||||
GFXIP_MAJOR,
|
||||
alloc_callback,
|
||||
alloc_callback<PcSamplingRecordT>,
|
||||
(void*) &all_allocations));
|
||||
CHECK_PARSER(parse_buffer((generic_sample_t*) secondBuffer->packets.data(),
|
||||
secondBuffer->packets.size(),
|
||||
GFXIP_MAJOR,
|
||||
alloc_callback,
|
||||
alloc_callback<PcSamplingRecordT>,
|
||||
(void*) &all_allocations));
|
||||
|
||||
EXPECT_EQ(all_allocations.size(), 2); // MultiBuffer: Incorrect number of callbacks
|
||||
@@ -352,4 +398,10 @@ TEST(pcs_parser, multi_buffer)
|
||||
|
||||
delete[] all_allocations[0].first;
|
||||
delete[] all_allocations[1].first;
|
||||
};
|
||||
}
|
||||
|
||||
TEST(pcs_parser, multi_buffer)
|
||||
{
|
||||
pcs_parser_multi_buffer<rocprofiler_pc_sampling_record_host_trap_v0_t>();
|
||||
pcs_parser_multi_buffer<rocprofiler_pc_sampling_record_stochastic_v0_t>();
|
||||
}
|
||||
|
||||
+335
-193
@@ -100,14 +100,15 @@
|
||||
ARBCHECK2(ISSUE_EXP); \
|
||||
ARBCHECK2(ISSUE_MISC);
|
||||
|
||||
template <typename PcSamplingRecordT>
|
||||
class WaveSnapTest
|
||||
{
|
||||
public:
|
||||
WaveSnapTest()
|
||||
{
|
||||
buffer = std::make_shared<MockRuntimeBuffer>();
|
||||
queue = std::make_shared<MockQueue>(16, buffer);
|
||||
dispatch = std::make_shared<MockDispatch>(queue);
|
||||
buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>();
|
||||
queue = std::make_shared<MockQueue<PcSamplingRecordT>>(16, buffer);
|
||||
dispatch = std::make_shared<MockDispatch<PcSamplingRecordT>>(queue);
|
||||
}
|
||||
|
||||
void Test()
|
||||
@@ -140,25 +141,27 @@ public:
|
||||
dispatch->submit(packet_union_t{.snap = snap});
|
||||
};
|
||||
|
||||
std::shared_ptr<MockRuntimeBuffer> buffer;
|
||||
std::shared_ptr<MockQueue> queue;
|
||||
std::shared_ptr<MockDispatch> dispatch;
|
||||
std::shared_ptr<MockRuntimeBuffer<PcSamplingRecordT>> buffer;
|
||||
std::shared_ptr<MockQueue<PcSamplingRecordT>> queue;
|
||||
std::shared_ptr<MockDispatch<PcSamplingRecordT>> dispatch;
|
||||
};
|
||||
|
||||
class WaveCntTest : public WaveSnapTest
|
||||
template <typename PcSamplingRecordT>
|
||||
class WaveCntTest : public WaveSnapTest<PcSamplingRecordT>
|
||||
{
|
||||
public:
|
||||
void FillBuffers() override
|
||||
{
|
||||
// Loop over all possible wave_cnt
|
||||
buffer->genUpcomingSamples(max_wave_number);
|
||||
this->buffer->genUpcomingSamples(max_wave_number);
|
||||
for(size_t i = 0; i < max_wave_number; i++)
|
||||
genPCSample(i, GFX9::TYPE_LDS, GFX9::REASON_ALU, GFX9::ISSUE_VALU, GFX9::ISSUE_VALU);
|
||||
this->genPCSample(
|
||||
i, GFX9::TYPE_LDS, GFX9::REASON_ALU, GFX9::ISSUE_VALU, GFX9::ISSUE_VALU);
|
||||
}
|
||||
|
||||
void CheckBuffers() override
|
||||
{
|
||||
auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
|
||||
auto parsed = this->buffer->get_parsed_buffer(9); // GFXIP==9
|
||||
assert(parsed.size() == 1);
|
||||
assert(parsed[0].size() == max_wave_number);
|
||||
|
||||
@@ -166,204 +169,336 @@ public:
|
||||
assert(parsed[0][i].wave_count == i);
|
||||
}
|
||||
|
||||
const size_t max_wave_number = 64;
|
||||
std::vector<rocprofiler_pc_sampling_snapshot_v1_t> snapshots;
|
||||
const size_t max_wave_number = 64;
|
||||
std::vector<PcSamplingRecordT> snapshots;
|
||||
};
|
||||
|
||||
class InstTypeTest : public WaveSnapTest
|
||||
// class InstTypeTest : public WaveSnapTest
|
||||
// {
|
||||
// public:
|
||||
// void FillBuffers() override
|
||||
// {
|
||||
// // Loop over inst_type_issued
|
||||
// UNROLL_TYPECHECK();
|
||||
// buffer->genUpcomingSamples(GFX9::TYPE_LAST);
|
||||
// for(int i = 0; i < GFX9::TYPE_LAST; i++)
|
||||
// genPCSample(i, i, GFX9::REASON_ALU, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX);
|
||||
// }
|
||||
|
||||
// void CheckBuffers() override
|
||||
// {
|
||||
// auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
|
||||
// assert(parsed.size() == 1);
|
||||
// assert(parsed[0].size() == GFX9::TYPE_LAST);
|
||||
// assert(snapshots.size() == GFX9::TYPE_LAST);
|
||||
|
||||
// for(size_t i = 0; i < GFX9::TYPE_LAST; i++)
|
||||
// assert(snapshots[i].inst_type == parsed[0][i].snapshot.inst_type);
|
||||
// }
|
||||
|
||||
// std::vector<rocprofiler_pc_sampling_snapshot_v1_t> snapshots;
|
||||
// };
|
||||
|
||||
// class StallReasonTest : public WaveSnapTest
|
||||
// {
|
||||
// public:
|
||||
// void FillBuffers() override
|
||||
// {
|
||||
// // Loop over reason_not_issued
|
||||
// UNROLL_REASONCHECK();
|
||||
// buffer->genUpcomingSamples(GFX9::REASON_LAST);
|
||||
// for(int i = 0; i < GFX9::REASON_LAST; i++)
|
||||
// genPCSample(i, GFX9::TYPE_MATRIX, i, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX);
|
||||
// }
|
||||
|
||||
// void CheckBuffers() override
|
||||
// {
|
||||
// auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
|
||||
// assert(parsed.size() == 1);
|
||||
// assert(parsed[0].size() == GFX9::REASON_LAST);
|
||||
// assert(snapshots.size() == GFX9::REASON_LAST);
|
||||
|
||||
// for(size_t i = 0; i < GFX9::REASON_LAST; i++)
|
||||
// assert(snapshots[i].reason_not_issued == parsed[0][i].snapshot.reason_not_issued);
|
||||
// }
|
||||
|
||||
// std::vector<rocprofiler_pc_sampling_snapshot_v1_t> snapshots;
|
||||
// };
|
||||
|
||||
// class ArbStateTest : public WaveSnapTest
|
||||
// {
|
||||
// public:
|
||||
// void FillBuffers() override
|
||||
// {
|
||||
// // Loop over arb_state_issue
|
||||
// UNROLL_ARBCHECK();
|
||||
// buffer->genUpcomingSamples(GFX9::ISSUE_LAST * GFX9::ISSUE_LAST);
|
||||
// for(int i = 0; i < GFX9::ISSUE_LAST; i++)
|
||||
// for(int j = 0; j < GFX9::ISSUE_LAST; j++)
|
||||
// genPCSample(i, GFX9::TYPE_MATRIX, GFX9::REASON_ALU, 1 << i, 1 << j);
|
||||
// }
|
||||
|
||||
// void CheckBuffers() override
|
||||
// {
|
||||
// auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
|
||||
// assert(parsed.size() == 1);
|
||||
// assert(parsed[0].size() == GFX9::ISSUE_LAST * GFX9::ISSUE_LAST);
|
||||
// assert(snapshots.size() == GFX9::ISSUE_LAST * GFX9::ISSUE_LAST);
|
||||
|
||||
// for(size_t i = 0; i < GFX9::ISSUE_LAST * GFX9::ISSUE_LAST; i++)
|
||||
// {
|
||||
// auto& snap = snapshots[i];
|
||||
// assert(snap.arb_state_issue == parsed[0][i].snapshot.arb_state_issue);
|
||||
// assert(snap.arb_state_stall == parsed[0][i].snapshot.arb_state_stall);
|
||||
// }
|
||||
// }
|
||||
|
||||
// std::vector<rocprofiler_pc_sampling_snapshot_v1_t> snapshots;
|
||||
// };
|
||||
|
||||
// class WaveIssueAndErrorTest : public WaveSnapTest
|
||||
// {
|
||||
// void FillBuffers() override
|
||||
// {
|
||||
// buffer->genUpcomingSamples(16);
|
||||
// for(int valid = 0; valid <= 1; valid++)
|
||||
// for(int issued = 0; issued <= 1; issued++)
|
||||
// for(int dual = 0; dual <= 1; dual++)
|
||||
// for(int error = 0; error <= 1; error++)
|
||||
// genPCSample(valid, issued, dual, error);
|
||||
// }
|
||||
|
||||
// void CheckBuffers() override
|
||||
// {
|
||||
// const int num_combinations = 16;
|
||||
// auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
|
||||
// assert(parsed.size() == 1);
|
||||
// assert(parsed[0].size() == num_combinations);
|
||||
// assert(compare.size() == num_combinations);
|
||||
|
||||
// for(size_t i = 0; i < num_combinations; i++)
|
||||
// {
|
||||
// assert(compare[i].flags.valid == parsed[0][i].flags.valid);
|
||||
// assert(compare[i].wave_issued == parsed[0][i].wave_issued);
|
||||
// assert(compare[i].snapshot.dual_issue_valu == parsed[0][i].snapshot.dual_issue_valu);
|
||||
// }
|
||||
// }
|
||||
|
||||
// union trap_snapshot_v1
|
||||
// {
|
||||
// struct
|
||||
// {
|
||||
// uint32_t valid : 1;
|
||||
// uint32_t issued : 1;
|
||||
// uint32_t dual : 1;
|
||||
// uint32_t reserved : 23;
|
||||
// uint32_t error : 1;
|
||||
// uint32_t reserved2 : 5;
|
||||
// };
|
||||
// uint32_t raw;
|
||||
// };
|
||||
|
||||
// void genPCSample(bool valid, bool issued, bool dual, bool error)
|
||||
// {
|
||||
// rocprofiler_pc_sampling_record_t sample;
|
||||
// ::memset(&sample, 0, sizeof(sample));
|
||||
// // TODO: Since code objects are not mocked, use pc.code_object_offset
|
||||
// // as the absolute physical address of the mocked PC.
|
||||
// sample.pc.code_object_offset = dispatch->unique_id;
|
||||
|
||||
// sample.correlation_id.internal = dispatch->getMockId().raw;
|
||||
|
||||
// sample.flags.valid = valid && !error;
|
||||
// sample.wave_issued = issued;
|
||||
// sample.snapshot.dual_issue_valu = dual;
|
||||
|
||||
// assert(dispatch.get());
|
||||
|
||||
// compare.push_back(sample);
|
||||
|
||||
// trap_snapshot_v1 snap;
|
||||
// snap.valid = valid;
|
||||
// snap.issued = issued;
|
||||
// snap.dual = dual;
|
||||
// snap.error = error;
|
||||
|
||||
// perf_sample_snapshot_v1 pss;
|
||||
// pss.perf_snapshot_data = snap.raw;
|
||||
// pss.correlation_id = dispatch->getMockId().raw;
|
||||
// dispatch->submit(std::move(pss));
|
||||
// };
|
||||
|
||||
// std::vector<rocprofiler_pc_sampling_record_t> compare;
|
||||
// };
|
||||
|
||||
template <typename PcSamplingRecordT>
|
||||
class HwIdTest : public WaveSnapTest<PcSamplingRecordT>
|
||||
{
|
||||
public:
|
||||
void FillBuffers() override
|
||||
{
|
||||
// Loop over inst_type_issued
|
||||
UNROLL_TYPECHECK();
|
||||
buffer->genUpcomingSamples(GFX9::TYPE_LAST);
|
||||
for(int i = 0; i < GFX9::TYPE_LAST; i++)
|
||||
genPCSample(i, i, GFX9::REASON_ALU, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX);
|
||||
}
|
||||
|
||||
void CheckBuffers() override
|
||||
{
|
||||
auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
|
||||
assert(parsed.size() == 1);
|
||||
assert(parsed[0].size() == GFX9::TYPE_LAST);
|
||||
assert(snapshots.size() == GFX9::TYPE_LAST);
|
||||
|
||||
for(size_t i = 0; i < GFX9::TYPE_LAST; i++)
|
||||
assert(snapshots[i].inst_type == parsed[0][i].snapshot.inst_type);
|
||||
}
|
||||
|
||||
std::vector<rocprofiler_pc_sampling_snapshot_v1_t> snapshots;
|
||||
};
|
||||
|
||||
class StallReasonTest : public WaveSnapTest
|
||||
{
|
||||
public:
|
||||
void FillBuffers() override
|
||||
{
|
||||
// Loop over reason_not_issued
|
||||
UNROLL_REASONCHECK();
|
||||
buffer->genUpcomingSamples(GFX9::REASON_LAST);
|
||||
for(int i = 0; i < GFX9::REASON_LAST; i++)
|
||||
genPCSample(i, GFX9::TYPE_MATRIX, i, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX);
|
||||
}
|
||||
|
||||
void CheckBuffers() override
|
||||
{
|
||||
auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
|
||||
assert(parsed.size() == 1);
|
||||
assert(parsed[0].size() == GFX9::REASON_LAST);
|
||||
assert(snapshots.size() == GFX9::REASON_LAST);
|
||||
|
||||
for(size_t i = 0; i < GFX9::REASON_LAST; i++)
|
||||
assert(snapshots[i].reason_not_issued == parsed[0][i].snapshot.reason_not_issued);
|
||||
}
|
||||
|
||||
std::vector<rocprofiler_pc_sampling_snapshot_v1_t> snapshots;
|
||||
};
|
||||
|
||||
class ArbStateTest : public WaveSnapTest
|
||||
{
|
||||
public:
|
||||
void FillBuffers() override
|
||||
{
|
||||
// Loop over arb_state_issue
|
||||
UNROLL_ARBCHECK();
|
||||
buffer->genUpcomingSamples(GFX9::ISSUE_LAST * GFX9::ISSUE_LAST);
|
||||
for(int i = 0; i < GFX9::ISSUE_LAST; i++)
|
||||
for(int j = 0; j < GFX9::ISSUE_LAST; j++)
|
||||
genPCSample(i, GFX9::TYPE_MATRIX, GFX9::REASON_ALU, 1 << i, 1 << j);
|
||||
}
|
||||
|
||||
void CheckBuffers() override
|
||||
{
|
||||
auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
|
||||
assert(parsed.size() == 1);
|
||||
assert(parsed[0].size() == GFX9::ISSUE_LAST * GFX9::ISSUE_LAST);
|
||||
assert(snapshots.size() == GFX9::ISSUE_LAST * GFX9::ISSUE_LAST);
|
||||
|
||||
for(size_t i = 0; i < GFX9::ISSUE_LAST * GFX9::ISSUE_LAST; i++)
|
||||
{
|
||||
auto& snap = snapshots[i];
|
||||
assert(snap.arb_state_issue == parsed[0][i].snapshot.arb_state_issue);
|
||||
assert(snap.arb_state_stall == parsed[0][i].snapshot.arb_state_stall);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<rocprofiler_pc_sampling_snapshot_v1_t> snapshots;
|
||||
};
|
||||
|
||||
class WaveIssueAndErrorTest : public WaveSnapTest
|
||||
{
|
||||
void FillBuffers() override
|
||||
{
|
||||
buffer->genUpcomingSamples(16);
|
||||
for(int valid = 0; valid <= 1; valid++)
|
||||
for(int issued = 0; issued <= 1; issued++)
|
||||
for(int dual = 0; dual <= 1; dual++)
|
||||
for(int error = 0; error <= 1; error++)
|
||||
genPCSample(valid, issued, dual, error);
|
||||
}
|
||||
|
||||
void CheckBuffers() override
|
||||
{
|
||||
const int num_combinations = 16;
|
||||
auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
|
||||
assert(parsed.size() == 1);
|
||||
assert(parsed[0].size() == num_combinations);
|
||||
assert(compare.size() == num_combinations);
|
||||
|
||||
for(size_t i = 0; i < num_combinations; i++)
|
||||
{
|
||||
assert(compare[i].flags.valid == parsed[0][i].flags.valid);
|
||||
assert(compare[i].wave_issued == parsed[0][i].wave_issued);
|
||||
assert(compare[i].snapshot.dual_issue_valu == parsed[0][i].snapshot.dual_issue_valu);
|
||||
}
|
||||
}
|
||||
|
||||
union trap_snapshot_v1
|
||||
union gfx9_hw_id_t
|
||||
{
|
||||
uint32_t raw;
|
||||
struct
|
||||
{
|
||||
uint32_t valid : 1;
|
||||
uint32_t issued : 1;
|
||||
uint32_t dual : 1;
|
||||
uint32_t reserved : 23;
|
||||
uint32_t error : 1;
|
||||
uint32_t reserved2 : 5;
|
||||
uint32_t wave_id : 4; ///< wave slot index
|
||||
uint32_t simd_id : 2; ///< SIMD index
|
||||
uint32_t pipe_id : 2; ///< pipe index
|
||||
uint32_t cu_id : 4; ///< Index of compute unit on GFX9 or workgroup processor on other
|
||||
///< architectures
|
||||
uint32_t shader_array_id : 1; ///< Shader array index
|
||||
uint32_t shader_engine_id : 3; ///< shader engine index
|
||||
uint32_t
|
||||
threadgroup_id : 4; ///< thread_group index on GFX9, and workgroup index on GFX10+
|
||||
uint32_t vm_id : 4; ///< virtual memory ID
|
||||
uint32_t queue_id : 3; ///< queue id
|
||||
uint32_t gfx_context_state_id : 3; ///< GFX context (state) id (only on GFX9) - ignored
|
||||
uint32_t microengine_id : 2; ///< ACE (microengine) index
|
||||
};
|
||||
uint32_t raw;
|
||||
};
|
||||
|
||||
void genPCSample(bool valid, bool issued, bool dual, bool error)
|
||||
{
|
||||
rocprofiler_pc_sampling_record_t sample;
|
||||
::memset(&sample, 0, sizeof(sample));
|
||||
// TODO: Since code objects are not mocked, use pc.loaded_code_object_offset
|
||||
// as the absolute physical address of the mocked PC.
|
||||
sample.pc.loaded_code_object_offset = dispatch->unique_id;
|
||||
|
||||
sample.correlation_id.internal = dispatch->getMockId().raw;
|
||||
|
||||
sample.flags.valid = valid && !error;
|
||||
sample.wave_issued = issued;
|
||||
sample.snapshot.dual_issue_valu = dual;
|
||||
|
||||
assert(dispatch.get());
|
||||
|
||||
compare.push_back(sample);
|
||||
|
||||
trap_snapshot_v1 snap;
|
||||
snap.valid = valid;
|
||||
snap.issued = issued;
|
||||
snap.dual = dual;
|
||||
snap.error = error;
|
||||
|
||||
perf_sample_snapshot_v1 pss;
|
||||
pss.perf_snapshot_data = snap.raw;
|
||||
pss.correlation_id = dispatch->getMockId().raw;
|
||||
dispatch->submit(std::move(pss));
|
||||
};
|
||||
|
||||
std::vector<rocprofiler_pc_sampling_record_t> compare;
|
||||
};
|
||||
|
||||
class WaveOtherFieldsTest : public WaveSnapTest
|
||||
{
|
||||
void FillBuffers() override
|
||||
{
|
||||
buffer->genUpcomingSamples(3);
|
||||
genPCSample(1, 2, 3, 4, 5, 6, 7, 8); // Counting
|
||||
genPCSample(3, 5, 7, 11, 13, 17, 19, 23); // Some prime numbers
|
||||
genPCSample(23, 19, 17, 13, 11, 7, 5, 3); // Some reversed primes
|
||||
gfx9_hw_id_t hw_id_val0;
|
||||
hw_id_val0.wave_id = 0;
|
||||
hw_id_val0.simd_id = 0;
|
||||
hw_id_val0.pipe_id = 0;
|
||||
hw_id_val0.cu_id = 0;
|
||||
hw_id_val0.shader_array_id = 0;
|
||||
hw_id_val0.shader_engine_id = 0;
|
||||
hw_id_val0.threadgroup_id = 0;
|
||||
hw_id_val0.vm_id = 0;
|
||||
hw_id_val0.queue_id = 0;
|
||||
hw_id_val0.gfx_context_state_id = 0;
|
||||
hw_id_val0.microengine_id = 0;
|
||||
|
||||
// Second sample: every field holds the largest value its bit width allows.
// The assignments must target hw_id_val1; writing to hw_id_val0 here would
// clobber the all-zero sample and leave hw_id_val1 uninitialized.
gfx9_hw_id_t hw_id_val1;
|
||||
hw_id_val1.wave_id = 15;
|
||||
hw_id_val1.simd_id = 3;
|
||||
hw_id_val1.pipe_id = 3;
|
||||
hw_id_val1.cu_id = 15;
|
||||
hw_id_val1.shader_array_id = 1;
|
||||
hw_id_val1.shader_engine_id = 7;
|
||||
hw_id_val1.threadgroup_id = 15;
|
||||
hw_id_val1.vm_id = 15;
|
||||
hw_id_val1.queue_id = 7;
|
||||
hw_id_val1.gfx_context_state_id = 7;
|
||||
hw_id_val1.microengine_id = 3;
|
||||
|
||||
gfx9_hw_id_t hw_id_val2;
|
||||
hw_id_val2.wave_id = 7;
|
||||
hw_id_val2.simd_id = 2;
|
||||
hw_id_val2.pipe_id = 2;
|
||||
hw_id_val2.cu_id = 6;
|
||||
hw_id_val2.shader_array_id = 0;
|
||||
hw_id_val2.shader_engine_id = 3;
|
||||
hw_id_val2.threadgroup_id = 8;
|
||||
hw_id_val2.vm_id = 9;
|
||||
hw_id_val2.queue_id = 3;
|
||||
hw_id_val2.gfx_context_state_id = 2;
|
||||
hw_id_val2.microengine_id = 1;
|
||||
|
||||
this->buffer->genUpcomingSamples(3);
|
||||
genPCSample(hw_id_val0);
|
||||
genPCSample(hw_id_val1);
|
||||
genPCSample(hw_id_val2);
|
||||
}
|
||||
|
||||
void CheckBuffers() override
|
||||
{
|
||||
auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9
|
||||
auto parsed = this->buffer->get_parsed_buffer(9); // GFXIP==9
|
||||
assert(parsed.size() == 1);
|
||||
assert(parsed[0].size() == 3);
|
||||
assert(compare.size() == 3);
|
||||
|
||||
for(size_t i = 0; i < 3; i++)
|
||||
{
|
||||
assert(parsed[0][i].flags.has_stall_reason == true);
|
||||
assert(parsed[0][i].flags.has_wave_cnt == true);
|
||||
assert(parsed[0][i].flags.reserved == false);
|
||||
// Comparing individual fields
|
||||
assert(compare[i].hw_id.wave_id == parsed[0][i].hw_id.wave_id);
|
||||
assert(compare[i].hw_id.simd_id == parsed[0][i].hw_id.simd_id);
|
||||
assert(compare[i].hw_id.pipe_id == parsed[0][i].hw_id.pipe_id);
|
||||
assert(compare[i].hw_id.cu_or_wgp_id == parsed[0][i].hw_id.cu_or_wgp_id);
|
||||
assert(compare[i].hw_id.shader_array_id == parsed[0][i].hw_id.shader_array_id);
|
||||
assert(compare[i].hw_id.shader_engine_id == parsed[0][i].hw_id.shader_engine_id);
|
||||
assert(compare[i].hw_id.workgroup_id == parsed[0][i].hw_id.workgroup_id);
|
||||
assert(compare[i].hw_id.vm_id == parsed[0][i].hw_id.vm_id);
|
||||
assert(compare[i].hw_id.queue_id == parsed[0][i].hw_id.queue_id);
|
||||
assert(compare[i].hw_id.microengine_id == parsed[0][i].hw_id.microengine_id);
|
||||
}
|
||||
}
|
||||
|
||||
void genPCSample(gfx9_hw_id_t hw_id)
|
||||
{
|
||||
PcSamplingRecordT sample;
|
||||
::memset(&sample, 0, sizeof(sample));
|
||||
// Unpacking individual fields
|
||||
// NOTE: chiplet is tested in a WaveOtherFieldsTest test, because it's not
|
||||
// transferred via hw_id, but chiplet_and_wave_id field.
|
||||
sample.hw_id.wave_id = hw_id.wave_id;
|
||||
sample.hw_id.simd_id = hw_id.simd_id;
|
||||
sample.hw_id.pipe_id = hw_id.pipe_id;
|
||||
sample.hw_id.cu_or_wgp_id = hw_id.cu_id;
|
||||
sample.hw_id.shader_array_id = hw_id.shader_array_id;
|
||||
sample.hw_id.shader_engine_id = hw_id.shader_engine_id;
|
||||
sample.hw_id.workgroup_id = hw_id.threadgroup_id;
|
||||
sample.hw_id.vm_id = hw_id.vm_id;
|
||||
sample.hw_id.queue_id = hw_id.queue_id;
|
||||
sample.hw_id.microengine_id = hw_id.microengine_id;
|
||||
|
||||
compare.push_back(sample);
|
||||
|
||||
perf_sample_snapshot_v1 snap;
|
||||
::memset(&snap, 0, sizeof(snap));
|
||||
|
||||
// raw register value
|
||||
snap.hw_id = hw_id.raw;
|
||||
snap.correlation_id = this->dispatch->getMockId().raw;
|
||||
|
||||
assert(this->dispatch.get());
|
||||
this->dispatch->submit(snap);
|
||||
};
|
||||
|
||||
std::vector<PcSamplingRecordT> compare;
|
||||
};
|
||||
|
||||
template <typename PcSamplingRecordT>
|
||||
class WaveOtherFieldsTest : public WaveSnapTest<PcSamplingRecordT>
|
||||
{
|
||||
void FillBuffers() override
|
||||
{
|
||||
this->buffer->genUpcomingSamples(3);
|
||||
genPCSample(1, 2, 3, 4, 5, 6, 7); // Counting
|
||||
genPCSample(3, 5, 7, 11, 13, 17, 19); // Some prime numbers
|
||||
genPCSample(23, 19, 17, 13, 11, 7, 5); // Some reversed primes
|
||||
}
|
||||
|
||||
void CheckBuffers() override
|
||||
{
|
||||
auto parsed = this->buffer->get_parsed_buffer(9); // GFXIP==9
|
||||
assert(parsed.size() == 1);
|
||||
assert(parsed[0].size() == 3);
|
||||
assert(compare.size() == 3);
|
||||
|
||||
for(size_t i = 0; i < 3; i++)
|
||||
{
|
||||
// TODO: if we decide to test flags, make specialization for
|
||||
// rocprofiler_pc_sampling_record_stochastic_v0_t
|
||||
// assert(parsed[0][i].flags.has_stall_reason == true);
|
||||
// assert(parsed[0][i].flags.has_wave_cnt == true);
|
||||
// assert(parsed[0][i].flags.reserved == false);
|
||||
|
||||
assert(compare[i].exec_mask == parsed[0][i].exec_mask);
|
||||
assert(compare[i].workgroup_id == parsed[0][i].workgroup_id);
|
||||
|
||||
assert(compare[i].chiplet == parsed[0][i].chiplet);
|
||||
assert(compare[i].wave_id == parsed[0][i].wave_id);
|
||||
assert(compare[i].hw_id == parsed[0][i].hw_id);
|
||||
assert(compare[i].hw_id.chiplet == parsed[0][i].hw_id.chiplet);
|
||||
assert(compare[i].wave_in_group == parsed[0][i].wave_in_group);
|
||||
// TODO: handle HW_ID as well.
|
||||
// assert(compare[i].hw_id == parsed[0][i].hw_id);
|
||||
assert(compare[i].correlation_id.internal == parsed[0][i].correlation_id.internal);
|
||||
}
|
||||
}
|
||||
|
||||
void genPCSample(int pc, int exec, int blkx, int blky, int blkz, int chip, int wave, int hwid)
|
||||
void genPCSample(int pc, int exec, int blkx, int blky, int blkz, int chip, int wave)
|
||||
{
|
||||
rocprofiler_pc_sampling_record_t sample;
|
||||
PcSamplingRecordT sample;
|
||||
::memset(&sample, 0, sizeof(sample));
|
||||
|
||||
sample.exec_mask = exec;
|
||||
@@ -371,13 +506,15 @@ class WaveOtherFieldsTest : public WaveSnapTest
|
||||
sample.workgroup_id.y = blky;
|
||||
sample.workgroup_id.z = blkz;
|
||||
|
||||
sample.chiplet = chip;
|
||||
sample.wave_id = wave;
|
||||
sample.hw_id = hwid;
|
||||
sample.correlation_id.internal = dispatch->unique_id;
|
||||
sample.hw_id.chiplet = chip;
|
||||
sample.wave_in_group = wave;
|
||||
sample.correlation_id.internal = this->dispatch->unique_id;
|
||||
|
||||
compare.push_back(sample);
|
||||
|
||||
// We're testing fields common to both perf_sample_host_trap_v1 and
|
||||
// perf_sample_snapshot_v1, so either struct is suitable here. No need to make
|
||||
// a specialization.
|
||||
perf_sample_snapshot_v1 snap;
|
||||
::memset(&snap, 0, sizeof(snap));
|
||||
snap.exec_mask = exec;
|
||||
@@ -386,26 +523,31 @@ class WaveOtherFieldsTest : public WaveSnapTest
|
||||
snap.workgroup_id_y = blky;
|
||||
snap.workgroup_id_z = blkz;
|
||||
snap.chiplet_and_wave_id = (chip << 8) | (wave & 0x3F);
|
||||
snap.hw_id = hwid;
|
||||
snap.correlation_id = dispatch->getMockId().raw;
|
||||
snap.correlation_id = this->dispatch->getMockId().raw;
|
||||
|
||||
assert(dispatch.get());
|
||||
dispatch->submit(snap);
|
||||
assert(this->dispatch.get());
|
||||
this->dispatch->submit(snap);
|
||||
|
||||
(void) pc;
|
||||
};
|
||||
|
||||
std::vector<rocprofiler_pc_sampling_record_t> compare;
|
||||
std::vector<PcSamplingRecordT> compare;
|
||||
};
|
||||
|
||||
TEST(pcs_parser, gfx9_test)
|
||||
{
|
||||
WaveCntTest{}.Test();
|
||||
InstTypeTest{}.Test();
|
||||
StallReasonTest{}.Test();
|
||||
ArbStateTest{}.Test();
|
||||
WaveIssueAndErrorTest{}.Test();
|
||||
WaveOtherFieldsTest{}.Test();
|
||||
// Tests specific to stochastic sampling only
|
||||
WaveCntTest<rocprofiler_pc_sampling_record_stochastic_v0_t>{}.Test();
|
||||
// InstTypeTest{}.Test();
|
||||
// StallReasonTest{}.Test();
|
||||
// ArbStateTest{}.Test();
|
||||
// WaveIssueAndErrorTest{}.Test();
|
||||
|
||||
// Tests common to both host trap and stochastic sampling.
|
||||
HwIdTest<rocprofiler_pc_sampling_record_host_trap_v0_t>{}.Test();
|
||||
HwIdTest<rocprofiler_pc_sampling_record_stochastic_v0_t>{}.Test();
|
||||
WaveOtherFieldsTest<rocprofiler_pc_sampling_record_host_trap_v0_t>{}.Test();
|
||||
WaveOtherFieldsTest<rocprofiler_pc_sampling_record_stochastic_v0_t>{}.Test();
|
||||
|
||||
std::cout << "GFX9 Test Done." << std::endl;
|
||||
}
|
||||
|
||||
+38
-15
@@ -46,6 +46,7 @@
|
||||
/**
|
||||
* Mimics the rocprofiler buffer sent to the parser.
|
||||
*/
|
||||
template <typename PcSamplingRecordT>
|
||||
class MockRuntimeBuffer
|
||||
{
|
||||
public:
|
||||
@@ -59,18 +60,21 @@ public:
|
||||
void submit(const packet_union_t& packet) { packets.push_back(packet); };
|
||||
|
||||
//! Submits a "upcoming_samples_t" packet signaling the next num_samples packets are PC samples
|
||||
void genUpcomingSamples(int num_samples)
|
||||
void genUpcomingSamples(int num_samples, upcoming_sample_t sample_type)
|
||||
{
|
||||
packet_union_t uni;
|
||||
::memset(&uni, 0, sizeof(uni));
|
||||
uni.upcoming.type = AMD_UPCOMING_SAMPLES;
|
||||
uni.upcoming.which_sample_type = AMD_SNAPSHOT_V1;
|
||||
uni.upcoming.which_sample_type = sample_type;
|
||||
uni.upcoming.num_samples = num_samples;
|
||||
uni.upcoming.device.handle = device;
|
||||
submit(uni);
|
||||
}
|
||||
|
||||
std::vector<std::vector<rocprofiler_pc_sampling_record_t>> get_parsed_buffer(int GFXIP_MAJOR)
|
||||
//! Submits a "upcoming_samples_t" packet signaling the next num_samples packets are PC samples
|
||||
void genUpcomingSamples(int num_samples);
|
||||
|
||||
std::vector<std::vector<PcSamplingRecordT>> get_parsed_buffer(int GFXIP_MAJOR)
|
||||
{
|
||||
parsed_data = {};
|
||||
|
||||
@@ -83,22 +87,38 @@ public:
|
||||
return parsed_data;
|
||||
}
|
||||
|
||||
static uint64_t alloc_parse_memory(rocprofiler_pc_sampling_record_t** sample,
|
||||
uint64_t req_size,
|
||||
void* userdata)
|
||||
static uint64_t alloc_parse_memory(PcSamplingRecordT** sample,
|
||||
uint64_t req_size,
|
||||
void* userdata)
|
||||
{
|
||||
auto* buffer = reinterpret_cast<MockRuntimeBuffer*>(userdata);
|
||||
buffer->parsed_data.push_back(std::vector<rocprofiler_pc_sampling_record_t>(req_size));
|
||||
buffer->parsed_data.push_back(std::vector<PcSamplingRecordT>(req_size));
|
||||
*sample = buffer->parsed_data.back().data();
|
||||
return req_size;
|
||||
}
|
||||
|
||||
std::vector<packet_union_t> packets;
|
||||
std::vector<std::vector<rocprofiler_pc_sampling_record_t>> parsed_data;
|
||||
std::vector<packet_union_t> packets;
|
||||
std::vector<std::vector<PcSamplingRecordT>> parsed_data;
|
||||
|
||||
const uint32_t device;
|
||||
};
|
||||
|
||||
//! Host-trap specialization of the single-argument genUpcomingSamples:
//! forwards to the two-argument overload with AMD_HOST_TRAP_V1 as the sample type.
template <>
|
||||
void
|
||||
MockRuntimeBuffer<rocprofiler_pc_sampling_record_host_trap_v0_t>::genUpcomingSamples(
|
||||
int num_samples)
|
||||
{
|
||||
genUpcomingSamples(num_samples, AMD_HOST_TRAP_V1);
|
||||
}
|
||||
|
||||
//! Stochastic specialization of the single-argument genUpcomingSamples:
//! forwards to the two-argument overload with AMD_SNAPSHOT_V1 as the sample type.
template <>
|
||||
void
|
||||
MockRuntimeBuffer<rocprofiler_pc_sampling_record_stochastic_v0_t>::genUpcomingSamples(
|
||||
int num_samples)
|
||||
{
|
||||
this->genUpcomingSamples(num_samples, AMD_SNAPSHOT_V1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Mimics a HSA doorbell. Every live instance of this class has an unique ID (handler).
|
||||
* The handler itself may be not unique considering dead instances.
|
||||
@@ -149,10 +169,11 @@ private:
|
||||
* read and write pointers.
|
||||
* Creating an instance of this class automatically adds a queue creation packet to the buffer.
|
||||
*/
|
||||
template <typename PcSamplingRecordT>
|
||||
class MockQueue
|
||||
{
|
||||
public:
|
||||
MockQueue(int size_, std::shared_ptr<MockRuntimeBuffer>& buffer_)
|
||||
MockQueue(int size_, std::shared_ptr<MockRuntimeBuffer<PcSamplingRecordT>>& buffer_)
|
||||
: id(getUniqueId())
|
||||
, size(size_)
|
||||
, doorbell()
|
||||
@@ -184,7 +205,7 @@ public:
|
||||
const MockDoorBell doorbell;
|
||||
const uint32_t device;
|
||||
|
||||
std::shared_ptr<MockRuntimeBuffer> const buffer;
|
||||
std::shared_ptr<MockRuntimeBuffer<PcSamplingRecordT>> const buffer;
|
||||
|
||||
private:
|
||||
static size_t getUniqueId()
|
||||
@@ -198,10 +219,11 @@ private:
|
||||
* Mimics a kernel dispatch.
|
||||
* Creating an instance of this class automatically adds a dispatch creation packet to the buffer.
|
||||
*/
|
||||
template <typename PcSamplingRecordT>
|
||||
class MockDispatch
|
||||
{
|
||||
public:
|
||||
MockDispatch(std::shared_ptr<MockQueue>& queue_)
|
||||
MockDispatch(std::shared_ptr<MockQueue<PcSamplingRecordT>>& queue_)
|
||||
: queue(queue_)
|
||||
, dispatch_id(queue->write_index)
|
||||
, doorbell_id(queue->doorbell.handler)
|
||||
@@ -251,7 +273,7 @@ public:
|
||||
<< " ds_id:" << dispatch_id << std::endl;
|
||||
}
|
||||
|
||||
std::shared_ptr<MockQueue> const queue;
|
||||
std::shared_ptr<MockQueue<PcSamplingRecordT>> const queue;
|
||||
|
||||
const size_t dispatch_id;
|
||||
const size_t doorbell_id;
|
||||
@@ -273,10 +295,11 @@ private:
|
||||
* Instead of generating a valid program counter, this class uses the snapshot.pc field to
|
||||
* store the original dispatch's unique_id for later correctness verification.
|
||||
*/
|
||||
template <typename PcSamplingRecordT>
|
||||
class MockWave
|
||||
{
|
||||
public:
|
||||
MockWave(const std::shared_ptr<MockDispatch>& dispatch_)
|
||||
MockWave(const std::shared_ptr<MockDispatch<PcSamplingRecordT>>& dispatch_)
|
||||
: dispatch(dispatch_)
|
||||
{}
|
||||
|
||||
@@ -295,5 +318,5 @@ public:
|
||||
<< dispatch->unique_id << std::endl;
|
||||
}
|
||||
|
||||
std::shared_ptr<MockDispatch> const dispatch;
|
||||
std::shared_ptr<MockDispatch<PcSamplingRecordT>> const dispatch;
|
||||
};
|
||||
|
||||
+102
-65
@@ -46,19 +46,20 @@ public:
|
||||
/**
|
||||
* Sample user memory allocation callback.
|
||||
* It expects userdata to be cast-able to a pointer to
|
||||
* std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>>
|
||||
* std::vector<std::pair<PcSamplingRecordT*, uint64_t>>
|
||||
*/
|
||||
template <typename PcSamplingRecordT>
|
||||
static uint64_t
|
||||
alloc_callback(rocprofiler_pc_sampling_record_t** buffer, uint64_t size, void* userdata)
|
||||
alloc_callback(PcSamplingRecordT** buffer, uint64_t size, void* userdata)
|
||||
{
|
||||
*buffer = new rocprofiler_pc_sampling_record_t[size];
|
||||
*buffer = new PcSamplingRecordT[size];
|
||||
auto& vector =
|
||||
*reinterpret_cast<std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>>*>(
|
||||
userdata);
|
||||
*reinterpret_cast<std::vector<std::pair<PcSamplingRecordT*, uint64_t>>*>(userdata);
|
||||
vector.push_back({*buffer, size});
|
||||
return size;
|
||||
}
|
||||
|
||||
template <typename PcSamplingRecordT>
|
||||
void
|
||||
multithread_queue_hammer(size_t tid, Latch* latch)
|
||||
{
|
||||
@@ -70,10 +71,11 @@ multithread_queue_hammer(size_t tid, Latch* latch)
|
||||
constexpr int NUM_QUEUES = MockDoorBell::num_unique_bells / NUM_THREADS;
|
||||
constexpr int ACTION_MAX = QSIZE * NUM_QUEUES / 2;
|
||||
|
||||
std::shared_ptr<MockRuntimeBuffer> buffer = std::make_shared<MockRuntimeBuffer>(tid);
|
||||
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>(tid);
|
||||
|
||||
std::array<std::shared_ptr<MockQueue>, NUM_QUEUES> queues;
|
||||
std::array<std::vector<std::shared_ptr<MockDispatch>>, NUM_QUEUES> active_dispatches;
|
||||
std::array<std::shared_ptr<MockQueue<PcSamplingRecordT>>, NUM_QUEUES> queues;
|
||||
std::array<std::vector<std::shared_ptr<MockDispatch<PcSamplingRecordT>>>, NUM_QUEUES>
|
||||
active_dispatches;
|
||||
|
||||
int num_reset_queues = 0;
|
||||
int num_samples_generated = 0;
|
||||
@@ -82,9 +84,10 @@ multithread_queue_hammer(size_t tid, Latch* latch)
|
||||
size_t max_q_occupancy = 0;
|
||||
|
||||
for(int i = 0; i < NUM_QUEUES; i++)
|
||||
queues[i] = std::make_shared<MockQueue>(QSIZE, buffer);
|
||||
queues[i] = std::make_shared<MockQueue<PcSamplingRecordT>>(QSIZE, buffer);
|
||||
for(int i = 0; i < NUM_QUEUES; i++)
|
||||
active_dispatches[i].push_back(std::make_shared<MockDispatch>(queues[i]));
|
||||
active_dispatches[i].push_back(
|
||||
std::make_shared<MockDispatch<PcSamplingRecordT>>(queues[i]));
|
||||
|
||||
for(int i = 0; i < NUM_ACTIONS; i++)
|
||||
{
|
||||
@@ -95,7 +98,7 @@ multithread_queue_hammer(size_t tid, Latch* latch)
|
||||
// Delete queue and create new one
|
||||
active_dispatches[q] = {};
|
||||
queues[q].reset();
|
||||
queues[q] = std::make_shared<MockQueue>(QSIZE, buffer);
|
||||
queues[q] = std::make_shared<MockQueue<PcSamplingRecordT>>(QSIZE, buffer);
|
||||
num_reset_queues++;
|
||||
}
|
||||
else if(action > ACTION_MAX / 2 && active_dispatches[q].size() > 1)
|
||||
@@ -108,7 +111,8 @@ multithread_queue_hammer(size_t tid, Latch* latch)
|
||||
// Add new dispatch
|
||||
if(active_dispatches[q].size() < QSIZE)
|
||||
{
|
||||
active_dispatches[q].push_back(std::make_shared<MockDispatch>(queues[q]));
|
||||
active_dispatches[q].push_back(
|
||||
std::make_shared<MockDispatch<PcSamplingRecordT>>(queues[q]));
|
||||
num_dispatches_generated += 1;
|
||||
}
|
||||
|
||||
@@ -117,7 +121,8 @@ multithread_queue_hammer(size_t tid, Latch* latch)
|
||||
for(auto& queue : active_dispatches)
|
||||
{
|
||||
EXPECT_NE(queue.size(), 0);
|
||||
std::shared_ptr<MockDispatch> rand_dispatch = queue[rdgen() % queue.size()];
|
||||
std::shared_ptr<MockDispatch<PcSamplingRecordT>> rand_dispatch =
|
||||
queue[rdgen() % queue.size()];
|
||||
MockWave(rand_dispatch).genPCSample();
|
||||
num_samples_generated += 1;
|
||||
avg_q_occupancy += queue.size();
|
||||
@@ -127,23 +132,23 @@ multithread_queue_hammer(size_t tid, Latch* latch)
|
||||
|
||||
latch->sync();
|
||||
|
||||
std::vector<std::pair<rocprofiler_pc_sampling_record_t*, uint64_t>> all_allocations;
|
||||
std::vector<std::pair<PcSamplingRecordT*, uint64_t>> all_allocations;
|
||||
|
||||
CHECK_PARSER(_parse_buffer<GFX9>((generic_sample_t*) buffer->packets.data(),
|
||||
buffer->packets.size(),
|
||||
alloc_callback,
|
||||
alloc_callback<PcSamplingRecordT>,
|
||||
(void*) &all_allocations,
|
||||
&corr_map));
|
||||
|
||||
EXPECT_EQ(all_allocations.size(), NUM_ACTIONS); // Incorrect number of callbacks
|
||||
for(auto sb = 0ul; sb < all_allocations.size(); sb++)
|
||||
{
|
||||
rocprofiler_pc_sampling_record_t* samples = all_allocations[sb].first;
|
||||
size_t num_samples = all_allocations[sb].second;
|
||||
PcSamplingRecordT* samples = all_allocations[sb].first;
|
||||
size_t num_samples = all_allocations[sb].second;
|
||||
|
||||
EXPECT_EQ(num_samples, NUM_QUEUES);
|
||||
for(size_t i = 0; i < num_samples; i++)
|
||||
EXPECT_EQ(samples[i].correlation_id.internal, samples[i].pc.loaded_code_object_offset);
|
||||
EXPECT_EQ(samples[i].correlation_id.internal, samples[i].pc.code_object_offset);
|
||||
delete[] samples;
|
||||
}
|
||||
}
|
||||
@@ -152,6 +157,7 @@ multithread_queue_hammer(size_t tid, Latch* latch)
|
||||
* Benchmarks how fast the parser can process samples on a single threaded case
|
||||
* Current: 5600X with -Ofast, up to >140 million samples/s or ~9GB/s R/W (18GB/s bidirectional)
|
||||
*/
|
||||
template <typename PcSamplingRecordT>
|
||||
static std::pair<size_t, size_t>
|
||||
MultiThread_BenchMark(size_t tid, Latch* latch)
|
||||
{
|
||||
@@ -161,14 +167,16 @@ MultiThread_BenchMark(size_t tid, Latch* latch)
|
||||
constexpr size_t DISP_PER_QUEUE = 16;
|
||||
constexpr size_t NUM_QUEUES = 1;
|
||||
|
||||
std::shared_ptr<MockRuntimeBuffer> buffer = std::make_shared<MockRuntimeBuffer>(tid);
|
||||
std::array<std::vector<std::shared_ptr<MockDispatch>>, NUM_QUEUES> active_dispatches;
|
||||
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>(tid);
|
||||
std::array<std::vector<std::shared_ptr<MockDispatch<PcSamplingRecordT>>>, NUM_QUEUES>
|
||||
active_dispatches;
|
||||
|
||||
for(size_t q = 0; q < NUM_QUEUES; q++)
|
||||
{
|
||||
std::shared_ptr<MockQueue> queue = std::make_shared<MockQueue>(DISP_PER_QUEUE * 2, buffer);
|
||||
auto queue = std::make_shared<MockQueue<PcSamplingRecordT>>(DISP_PER_QUEUE * 2, buffer);
|
||||
for(size_t d = 0; d < DISP_PER_QUEUE; d++)
|
||||
active_dispatches[q].push_back(std::make_shared<MockDispatch>(queue));
|
||||
active_dispatches[q].push_back(
|
||||
std::make_shared<MockDispatch<PcSamplingRecordT>>(queue));
|
||||
}
|
||||
|
||||
constexpr size_t TOTAL_NUM_SAMPLES = NUM_QUEUES * DISP_PER_QUEUE * SAMPLE_PER_DISPATCH;
|
||||
@@ -179,29 +187,31 @@ MultiThread_BenchMark(size_t tid, Latch* latch)
|
||||
for(size_t i = 0; i < SAMPLE_PER_DISPATCH; i++)
|
||||
MockWave(dispatch).genPCSample();
|
||||
|
||||
std::pair<rocprofiler_pc_sampling_record_t*, size_t> userdata;
|
||||
userdata.first = new rocprofiler_pc_sampling_record_t[TOTAL_NUM_SAMPLES];
|
||||
std::pair<PcSamplingRecordT*, size_t> userdata;
|
||||
userdata.first = new PcSamplingRecordT[TOTAL_NUM_SAMPLES];
|
||||
userdata.second = TOTAL_NUM_SAMPLES;
|
||||
|
||||
latch->sync();
|
||||
|
||||
auto t0 = std::chrono::system_clock::now();
|
||||
CHECK_PARSER(_parse_buffer<GFX9>(
|
||||
(generic_sample_t*) buffer->packets.data(),
|
||||
buffer->packets.size(),
|
||||
[](rocprofiler_pc_sampling_record_t** sample, uint64_t size, void* userdata_) {
|
||||
auto* pair =
|
||||
reinterpret_cast<std::pair<rocprofiler_pc_sampling_record_t*, size_t>*>(userdata_);
|
||||
*sample = pair->first;
|
||||
user_callback_t<PcSamplingRecordT> user_cb =
|
||||
[](PcSamplingRecordT** sample, uint64_t size, void* userdata_) {
|
||||
auto* pair = reinterpret_cast<std::pair<PcSamplingRecordT*, size_t>*>(userdata_);
|
||||
*sample = pair->first;
|
||||
return size;
|
||||
},
|
||||
&userdata,
|
||||
&corr_map));
|
||||
};
|
||||
|
||||
auto t0 = std::chrono::system_clock::now();
|
||||
CHECK_PARSER(_parse_buffer<GFX9>((generic_sample_t*) buffer->packets.data(),
|
||||
buffer->packets.size(),
|
||||
user_cb,
|
||||
&userdata,
|
||||
&corr_map));
|
||||
auto t1 = std::chrono::system_clock::now();
|
||||
delete[] userdata.first;
|
||||
return {TOTAL_NUM_SAMPLES, (t1 - t0).count()};
|
||||
}
|
||||
|
||||
template <typename PcSamplingRecordT>
|
||||
void
|
||||
multithread_codeobj(size_t tid, Latch* latch)
|
||||
{
|
||||
@@ -215,11 +225,11 @@ multithread_codeobj(size_t tid, Latch* latch)
|
||||
constexpr int NUM_SAMPLES = 50;
|
||||
constexpr int QSIZE = 16;
|
||||
|
||||
auto buffer = std::make_shared<MockRuntimeBuffer>(tid);
|
||||
auto queue = std::make_shared<MockQueue>(QSIZE, buffer);
|
||||
auto buffer = std::make_shared<MockRuntimeBuffer<PcSamplingRecordT>>(tid);
|
||||
auto queue = std::make_shared<MockQueue<PcSamplingRecordT>>(QSIZE, buffer);
|
||||
|
||||
std::pair<rocprofiler_pc_sampling_record_t*, size_t> userdata;
|
||||
userdata.first = new rocprofiler_pc_sampling_record_t[NUM_SAMPLES];
|
||||
std::pair<PcSamplingRecordT*, size_t> userdata;
|
||||
userdata.first = new PcSamplingRecordT[NUM_SAMPLES];
|
||||
userdata.second = NUM_SAMPLES;
|
||||
|
||||
latch->sync();
|
||||
@@ -227,7 +237,7 @@ multithread_codeobj(size_t tid, Latch* latch)
|
||||
for(int d = 0; d < NUM_DISPATCH; d++)
|
||||
{
|
||||
buffer->packets.clear();
|
||||
auto dispatch = std::make_shared<MockDispatch>(queue);
|
||||
auto dispatch = std::make_shared<MockDispatch<PcSamplingRecordT>>(queue);
|
||||
|
||||
const size_t pc_base_addr = NUM_SAMPLES * dispatch->unique_id;
|
||||
table->insert(addr_range_t{pc_base_addr, NUM_SAMPLES, dispatch->unique_id});
|
||||
@@ -242,25 +252,25 @@ multithread_codeobj(size_t tid, Latch* latch)
|
||||
dispatch->submit(uni);
|
||||
}
|
||||
|
||||
CHECK_PARSER(_parse_buffer<GFX9>(
|
||||
(generic_sample_t*) buffer->packets.data(),
|
||||
buffer->packets.size(),
|
||||
[](rocprofiler_pc_sampling_record_t** sample, uint64_t size, void* userdata_) {
|
||||
auto* pair =
|
||||
reinterpret_cast<std::pair<rocprofiler_pc_sampling_record_t*, size_t>*>(
|
||||
userdata_);
|
||||
*sample = pair->first;
|
||||
user_callback_t<PcSamplingRecordT> user_cb =
|
||||
[](PcSamplingRecordT** sample, uint64_t size, void* userdata_) {
|
||||
auto* pair = reinterpret_cast<std::pair<PcSamplingRecordT*, size_t>*>(userdata_);
|
||||
*sample = pair->first;
|
||||
assert(size <= NUM_SAMPLES);
|
||||
return size;
|
||||
},
|
||||
&userdata,
|
||||
&corr_map));
|
||||
};
|
||||
|
||||
CHECK_PARSER(_parse_buffer<GFX9>((generic_sample_t*) buffer->packets.data(),
|
||||
buffer->packets.size(),
|
||||
user_cb,
|
||||
&userdata,
|
||||
&corr_map));
|
||||
|
||||
for(int s = 0; s < NUM_SAMPLES; s++)
|
||||
{
|
||||
const auto& pc = userdata.first[s].pc;
|
||||
EXPECT_EQ(pc.loaded_code_object_id, dispatch->unique_id);
|
||||
EXPECT_EQ(pc.loaded_code_object_offset, s);
|
||||
EXPECT_EQ(pc.code_object_id, dispatch->unique_id);
|
||||
EXPECT_EQ(pc.code_object_offset, s);
|
||||
}
|
||||
|
||||
table->remove(addr_range_t{pc_base_addr, NUM_SAMPLES, dispatch->unique_id});
|
||||
@@ -269,7 +279,9 @@ multithread_codeobj(size_t tid, Latch* latch)
|
||||
delete[] userdata.first;
|
||||
}
|
||||
|
||||
TEST(pcs_parser, bench_test)
|
||||
template <typename PcSamplingRecordT>
|
||||
void
|
||||
pcs_parser_bench_test()
|
||||
{
|
||||
size_t time = 0;
|
||||
size_t samples = 0;
|
||||
@@ -280,7 +292,8 @@ TEST(pcs_parser, bench_test)
|
||||
|
||||
std::vector<std::future<std::pair<size_t, size_t>>> threads{};
|
||||
for(size_t t = 0; t < NUM_THREADS; t++)
|
||||
threads.push_back(std::async(std::launch::async, MultiThread_BenchMark, t, &latch));
|
||||
threads.push_back(std::async(
|
||||
std::launch::async, MultiThread_BenchMark<PcSamplingRecordT>, t, &latch));
|
||||
|
||||
if(it == 0) continue; // Skip warmup
|
||||
|
||||
@@ -295,23 +308,47 @@ TEST(pcs_parser, bench_test)
|
||||
double mean = 1E3 * NUM_THREADS * samples / time;
|
||||
|
||||
std::cout << "Benchmark: Parsed " << int(mean * 1E3 + 0.5) * 1E-3f << " Msample/s (";
|
||||
std::cout << int(sizeof(rocprofiler_pc_sampling_record_t) * mean) << " MB/s)" << std::endl;
|
||||
std::cout << int(sizeof(PcSamplingRecordT) * mean) << " MB/s)" << std::endl;
|
||||
};
|
||||
|
||||
// Runs the parser benchmark helper once for the host-trap v0 record type and then once
// for the stochastic v0 record type.
TEST(pcs_parser, bench_test)
|
||||
{
|
||||
pcs_parser_bench_test<rocprofiler_pc_sampling_record_host_trap_v0_t>();
|
||||
pcs_parser_bench_test<rocprofiler_pc_sampling_record_stochastic_v0_t>();
|
||||
}
|
||||
|
||||
template <typename PcSamplingRecordT>
|
||||
void
|
||||
pcs_parser_hammer_test()
|
||||
{
|
||||
Latch latch(NUM_THREADS);
|
||||
|
||||
std::vector<std::future<void>> threads{};
|
||||
for(size_t i = 0; i < NUM_THREADS; i++)
|
||||
threads.push_back(
|
||||
std::async(std::launch::async, multithread_queue_hammer<PcSamplingRecordT>, i, &latch));
|
||||
};
|
||||
|
||||
TEST(pcs_parser, hammer_test)
|
||||
{
|
||||
Latch latch(NUM_THREADS);
|
||||
pcs_parser_hammer_test<rocprofiler_pc_sampling_record_host_trap_v0_t>();
|
||||
pcs_parser_hammer_test<rocprofiler_pc_sampling_record_stochastic_v0_t>();
|
||||
}
|
||||
|
||||
std::vector<std::future<void>> threads{};
|
||||
for(size_t i = 0; i < NUM_THREADS; i++)
|
||||
threads.push_back(std::async(std::launch::async, multithread_queue_hammer, i, &latch));
|
||||
};
|
||||
|
||||
TEST(pcs_parser, codeobj_test)
|
||||
template <typename PcSamplingRecordT>
|
||||
void
|
||||
pcs_parser_codeobj_test()
|
||||
{
|
||||
Latch latch(NUM_THREADS);
|
||||
|
||||
std::vector<std::future<void>> threads{};
|
||||
for(size_t i = 0; i < NUM_THREADS; i++)
|
||||
threads.push_back(std::async(std::launch::async, multithread_codeobj, i, &latch));
|
||||
};
|
||||
threads.push_back(
|
||||
std::async(std::launch::async, multithread_codeobj<PcSamplingRecordT>, i, &latch));
|
||||
}
|
||||
|
||||
// Runs the code-object parser test helper once for the host-trap v0 record type and then
// once for the stochastic v0 record type.
TEST(pcs_parser, codeobj_test)
|
||||
{
|
||||
pcs_parser_codeobj_test<rocprofiler_pc_sampling_record_host_trap_v0_t>();
|
||||
pcs_parser_codeobj_test<rocprofiler_pc_sampling_record_stochastic_v0_t>();
|
||||
}
|
||||
|
||||
+253
-153
@@ -28,184 +28,284 @@
|
||||
|
||||
#include "lib/rocprofiler-sdk/pc_sampling/parser/gfx11.hpp"
|
||||
#include "lib/rocprofiler-sdk/pc_sampling/parser/gfx9.hpp"
|
||||
#include "lib/rocprofiler-sdk/pc_sampling/parser/parser_types.h"
|
||||
#include "lib/rocprofiler-sdk/pc_sampling/parser/parser_types.hpp"
|
||||
#include "lib/rocprofiler-sdk/pc_sampling/parser/rocr.h"
|
||||
#include "lib/rocprofiler-sdk/pc_sampling/parser/stochastic_records.h"
|
||||
|
||||
template <typename SType>
|
||||
inline rocprofiler_pc_sampling_record_t
|
||||
// TODO: refactor the commented code for stochastic sampling
|
||||
|
||||
// template <typename gfx>
|
||||
// inline rocprofiler_pc_sampling_record_t
|
||||
// copyStochasticSample(const perf_sample_snapshot_v1& sample);
|
||||
|
||||
// template <>
|
||||
// inline rocprofiler_pc_sampling_record_t
|
||||
// copyStochasticSample<GFX9>(const perf_sample_snapshot_v1& sample)
|
||||
// {
|
||||
// rocprofiler_pc_sampling_record_t ret = copySampleHeader<perf_sample_snapshot_v1>(sample);
|
||||
// ret.flags.valid = sample.perf_snapshot_data & (~sample.perf_snapshot_data >> 26) & 0x1;
|
||||
// // Check wave_id matches snapshot_wave_id
|
||||
|
||||
// ret.flags.has_wave_cnt = true;
|
||||
// ret.flags.has_stall_reason = true;
|
||||
|
||||
// ret.wave_count = sample.perf_snapshot_data1 & 0x3F;
|
||||
|
||||
// ret.wave_issued = sample.perf_snapshot_data >> 1;
|
||||
// ret.snapshot.dual_issue_valu = sample.perf_snapshot_data >> 2;
|
||||
// ret.snapshot.inst_type = sample.perf_snapshot_data >> 3;
|
||||
// ret.snapshot.reason_not_issued = (sample.perf_snapshot_data >> 7) & 0x7;
|
||||
// ret.snapshot.arb_state_issue = (sample.perf_snapshot_data >> 10) & 0xFF;
|
||||
// ret.snapshot.arb_state_stall = (sample.perf_snapshot_data >> 18) & 0xFF;
|
||||
// ret.reserved = 0;
|
||||
// return ret;
|
||||
// }
|
||||
|
||||
// template <>
|
||||
// inline rocprofiler_pc_sampling_record_t
|
||||
// copyStochasticSample<GFX11>(const perf_sample_snapshot_v1& sample)
|
||||
// {
|
||||
// rocprofiler_pc_sampling_record_t ret = copySampleHeader<perf_sample_snapshot_v1>(sample);
|
||||
// ret.flags.valid = sample.perf_snapshot_data & (~sample.perf_snapshot_data >> 23) & 0x1;
|
||||
// // Check wave_id matches snapshot_wave_id
|
||||
|
||||
// ret.flags.has_stall_reason = true;
|
||||
|
||||
// ret.wave_issued = sample.perf_snapshot_data >> 1;
|
||||
// ret.snapshot.inst_type = sample.perf_snapshot_data >> 2;
|
||||
// ret.snapshot.reason_not_issued = (sample.perf_snapshot_data >> 6) & 0x7;
|
||||
// ret.snapshot.arb_state_issue = (sample.perf_snapshot_data >> 9) & 0x7F;
|
||||
// ret.snapshot.arb_state_stall = (sample.perf_snapshot_data >> 16) & 0x7F;
|
||||
// ret.snapshot.dual_issue_valu = false;
|
||||
// ret.reserved = 0;
|
||||
// return ret;
|
||||
// }
|
||||
|
||||
// #define BITSHIFT(sname) out |= ((in >> GFX::sname) & 1) << PCSAMPLE::sname
|
||||
|
||||
// template <typename GFX>
|
||||
// inline int
|
||||
// translate_arb(int in)
|
||||
// {
|
||||
// size_t out = 0;
|
||||
// BITSHIFT(ISSUE_VALU);
|
||||
// BITSHIFT(ISSUE_MATRIX);
|
||||
// BITSHIFT(ISSUE_LDS);
|
||||
// BITSHIFT(ISSUE_LDS_DIRECT);
|
||||
// BITSHIFT(ISSUE_SCALAR);
|
||||
// BITSHIFT(ISSUE_VMEM_TEX);
|
||||
// BITSHIFT(ISSUE_FLAT);
|
||||
// BITSHIFT(ISSUE_EXP);
|
||||
// BITSHIFT(ISSUE_MISC);
|
||||
// BITSHIFT(ISSUE_BRMSG);
|
||||
// return out & 0x3FF;
|
||||
// }
|
||||
|
||||
// #undef BITSHIFT
|
||||
|
||||
// #define LUTOVERLOAD(sname) this->operator[](GFX::sname) = PCSAMPLE::sname
|
||||
|
||||
// template <typename GFX>
|
||||
// class GFX_REASON_LUT : public std::array<int, 32>
|
||||
// {
|
||||
// public:
|
||||
// GFX_REASON_LUT()
|
||||
// {
|
||||
// std::memset(data(), 0, size() * sizeof(int));
|
||||
// LUTOVERLOAD(REASON_NOT_AVAILABLE);
|
||||
// LUTOVERLOAD(REASON_ALU);
|
||||
// LUTOVERLOAD(REASON_WAITCNT);
|
||||
// LUTOVERLOAD(REASON_INTERNAL);
|
||||
// LUTOVERLOAD(REASON_BARRIER);
|
||||
// LUTOVERLOAD(REASON_ARBITER);
|
||||
// LUTOVERLOAD(REASON_EX_STALL);
|
||||
// LUTOVERLOAD(REASON_OTHER_WAIT);
|
||||
// LUTOVERLOAD(REASON_SLEEP);
|
||||
// }
|
||||
// };
|
||||
|
||||
// template <typename GFX>
|
||||
// class GFX_INST_LUT : public std::array<int, 32>
|
||||
// {
|
||||
// public:
|
||||
// GFX_INST_LUT()
|
||||
// {
|
||||
// std::memset(data(), 0, size() * sizeof(int));
|
||||
// LUTOVERLOAD(TYPE_VALU);
|
||||
// LUTOVERLOAD(TYPE_MATRIX);
|
||||
// LUTOVERLOAD(TYPE_SCALAR);
|
||||
// LUTOVERLOAD(TYPE_TEX);
|
||||
// LUTOVERLOAD(TYPE_LDS);
|
||||
// LUTOVERLOAD(TYPE_LDS_DIRECT);
|
||||
// LUTOVERLOAD(TYPE_FLAT);
|
||||
// LUTOVERLOAD(TYPE_EXP);
|
||||
// LUTOVERLOAD(TYPE_MESSAGE);
|
||||
// LUTOVERLOAD(TYPE_BARRIER);
|
||||
// LUTOVERLOAD(TYPE_BRANCH_NOT_TAKEN);
|
||||
// LUTOVERLOAD(TYPE_BRANCH_TAKEN);
|
||||
// LUTOVERLOAD(TYPE_JUMP);
|
||||
// LUTOVERLOAD(TYPE_OTHER);
|
||||
// LUTOVERLOAD(TYPE_NO_INST);
|
||||
// LUTOVERLOAD(TYPE_DUAL_VALU);
|
||||
// }
|
||||
// };
|
||||
|
||||
// template <typename GFX>
|
||||
// inline int
|
||||
// translate_reason(int in)
|
||||
// {
|
||||
// static GFX_REASON_LUT<GFX> lut;
|
||||
// return lut[in & 0x1F];
|
||||
// }
|
||||
|
||||
// template <typename GFX>
|
||||
// inline int
|
||||
// translate_inst(int in)
|
||||
// {
|
||||
// static GFX_INST_LUT<GFX> lut;
|
||||
// return lut[in & 0x1F];
|
||||
// }
|
||||
|
||||
// #undef LUTOVERLOAD
|
||||
|
||||
// template <bool HostTrap, typename GFX>
|
||||
// inline rocprofiler_pc_sampling_record_t
|
||||
// copySample(const void* sample)
|
||||
// {
|
||||
// if(HostTrap) return copyHostTrapSample(*(const perf_sample_host_trap_v1*) sample);
|
||||
|
||||
// rocprofiler_pc_sampling_record_t ret =
|
||||
// copyStochasticSample<GFX>(*(const perf_sample_snapshot_v1*) sample);
|
||||
|
||||
// ret.snapshot.inst_type = translate_inst<GFX>(ret.snapshot.inst_type);
|
||||
// ret.snapshot.arb_state_issue = translate_arb<GFX>(ret.snapshot.arb_state_issue);
|
||||
// ret.snapshot.arb_state_stall = translate_arb<GFX>(ret.snapshot.arb_state_stall);
|
||||
// ret.snapshot.reason_not_issued = translate_reason<GFX>(ret.snapshot.reason_not_issued);
|
||||
|
||||
// return ret;
|
||||
// }
|
||||
|
||||
// Extracts the inclusive bit range [bit_end:bit_start] of `val`, shifted down to bit 0.
// Every argument and the whole expansion are parenthesized so the macro composes safely
// with surrounding operators and with compound argument expressions.
// The field width (bit_end - bit_start + 1) must be smaller than the width of
// `unsigned int`; a wider field would make the mask shift undefined.
#define EXTRACT_BITS(val, bit_end, bit_start)                                                      \
    (((val) >> (bit_start)) & ((1U << ((bit_end) - (bit_start) + 1)) - 1))
|
||||
|
||||
/**
 * @brief Stores the chiplet index of a raw sample into `record.hw_id.chiplet`.
 *
 * The value written is `sample.chiplet_and_wave_id` with its low eight bits shifted out.
 * `GFX` is not used by this generic implementation.
 *
 * @param[out] record Record whose `hw_id.chiplet` field is written.
 * @param[in]  sample Raw sample providing `chiplet_and_wave_id`.
 */
template <typename GFX, typename PcSamplingRecordT, typename SType>
inline void
copyChipletId(PcSamplingRecordT& record, const SType& sample)
{
    constexpr unsigned chiplet_shift = 8;
    const auto         chiplet_bits  = sample.chiplet_and_wave_id >> chiplet_shift;
    record.hw_id.chiplet             = chiplet_bits;
}
|
||||
|
||||
/**
 * @brief Decodes a raw hardware-id register value into the fields of `hw_id`.
 *
 * Declaration only: no generic definition accompanies it, the implementation comes from
 * explicit specializations (such as the GFX9 one for rocprofiler_pc_sampling_hw_id_v0_t).
 *
 * @tparam GFX   Architecture tag selecting the register layout.
 * @tparam HwIdT Destination hw-id structure type.
 * @param[out] hw_id     Structure receiving the decoded fields.
 * @param[in]  hw_id_reg Raw hardware-id register value.
 */
template <typename GFX, typename HwIdT>
inline void
copyHwId(HwIdT& hw_id, const uint32_t hw_id_reg);
|
||||
|
||||
template <>
|
||||
inline void
|
||||
copyHwId<GFX9, rocprofiler_pc_sampling_hw_id_v0_t>(rocprofiler_pc_sampling_hw_id_v0_t& hw_id,
|
||||
const uint32_t hw_id_reg)
|
||||
{
|
||||
// 3:0 -> wave_id
|
||||
hw_id.wave_id = EXTRACT_BITS(hw_id_reg, 3, 0);
|
||||
// 5:4 -> simd_id
|
||||
hw_id.simd_id = EXTRACT_BITS(hw_id_reg, 5, 4);
|
||||
// 7:6 -> pipe_id;
|
||||
hw_id.pipe_id = EXTRACT_BITS(hw_id_reg, 7, 6);
|
||||
// 11:8 -> cu_id
|
||||
hw_id.cu_or_wgp_id = EXTRACT_BITS(hw_id_reg, 11, 8);
|
||||
// 12 -> sa_id
|
||||
hw_id.shader_array_id = EXTRACT_BITS(hw_id_reg, 12, 12);
|
||||
// 15:13 -> se_id
|
||||
hw_id.shader_engine_id = EXTRACT_BITS(hw_id_reg, 15, 13);
|
||||
// 19:16 -> tg_id
|
||||
hw_id.workgroup_id = EXTRACT_BITS(hw_id_reg, 19, 16);
|
||||
// 23:20 -> vm_id
|
||||
hw_id.vm_id = EXTRACT_BITS(hw_id_reg, 23, 20);
|
||||
// 26:24 -> queue_id
|
||||
hw_id.queue_id = EXTRACT_BITS(hw_id_reg, 26, 24);
|
||||
// 29:27 -> state_id (ignored)
|
||||
// 31:30 -> me_id
|
||||
hw_id.microengine_id = EXTRACT_BITS(hw_id_reg, 31, 30);
|
||||
}
|
||||
|
||||
#undef EXTRACT_BITS
|
||||
|
||||
template <typename PcSamplingRecordT, typename SType>
|
||||
inline PcSamplingRecordT
|
||||
copySampleHeader(const SType& sample)
|
||||
{
|
||||
rocprofiler_pc_sampling_record_t ret;
|
||||
ret.flags = pcsample_header_v1_t{.raw = 0}.flags;
|
||||
ret.flags.type = AMD_SNAPSHOT_V1;
|
||||
PcSamplingRecordT ret;
|
||||
// zero out all record fields
|
||||
std::memset(&ret, 0, sizeof(PcSamplingRecordT));
|
||||
|
||||
// Decode fields common for all host-trap and stochastic on all architectures.
|
||||
ret.size = sizeof(PcSamplingRecordT);
|
||||
ret.wave_in_group = sample.chiplet_and_wave_id & 0x3F;
|
||||
|
||||
ret.exec_mask = sample.exec_mask;
|
||||
ret.workgroup_id.x = sample.workgroup_id_x;
|
||||
ret.workgroup_id.y = sample.workgroup_id_y;
|
||||
ret.workgroup_id.z = sample.workgroup_id_z;
|
||||
|
||||
ret.chiplet = sample.chiplet_and_wave_id >> 8;
|
||||
ret.wave_id = sample.chiplet_and_wave_id & 0x3F;
|
||||
ret.hw_id = sample.hw_id;
|
||||
ret.timestamp = sample.timestamp;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline rocprofiler_pc_sampling_record_t
|
||||
copyHostTrapSample(const perf_sample_host_trap_v1& sample)
|
||||
{
|
||||
rocprofiler_pc_sampling_record_t ret = copySampleHeader<perf_sample_host_trap_v1>(sample);
|
||||
ret.flags.type = AMD_HOST_TRAP_V1;
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename gfx>
|
||||
inline rocprofiler_pc_sampling_record_t
|
||||
copyStochasticSample(const perf_sample_snapshot_v1& sample);
|
||||
/**
 * @brief Converts one raw sample into a `PcSamplingRecordT` record.
 *
 * Declaration only; the behavior comes from the explicit specializations that follow,
 * which reinterpret `sample` as the raw sample structure matching the record type.
 *
 * @tparam GFX               Architecture tag (GFX9 and GFX11 specializations follow).
 * @tparam PcSamplingRecordT Record type to produce.
 * @param[in] sample Untyped pointer to the raw sample.
 * @return The decoded record.
 */
template <typename GFX, typename PcSamplingRecordT>
|
||||
inline PcSamplingRecordT
|
||||
copySample(const void* sample);
|
||||
|
||||
/**
|
||||
* @brief Host trap V0 sample for GFX9
|
||||
*/
|
||||
template <>
|
||||
inline rocprofiler_pc_sampling_record_t
|
||||
copyStochasticSample<GFX9>(const perf_sample_snapshot_v1& sample)
|
||||
inline rocprofiler_pc_sampling_record_host_trap_v0_t
|
||||
copySample<GFX9, rocprofiler_pc_sampling_record_host_trap_v0_t>(const void* sample)
|
||||
{
|
||||
rocprofiler_pc_sampling_record_t ret = copySampleHeader<perf_sample_snapshot_v1>(sample);
|
||||
ret.flags.valid = sample.perf_snapshot_data & (~sample.perf_snapshot_data >> 26) & 0x1;
|
||||
// Check wave_id matches snapshot_wave_id
|
||||
|
||||
ret.flags.has_wave_cnt = true;
|
||||
ret.flags.has_stall_reason = true;
|
||||
|
||||
ret.wave_count = sample.perf_snapshot_data1 & 0x3F;
|
||||
|
||||
ret.wave_issued = sample.perf_snapshot_data >> 1;
|
||||
ret.snapshot.dual_issue_valu = sample.perf_snapshot_data >> 2;
|
||||
ret.snapshot.inst_type = sample.perf_snapshot_data >> 3;
|
||||
ret.snapshot.reason_not_issued = (sample.perf_snapshot_data >> 7) & 0x7;
|
||||
ret.snapshot.arb_state_issue = (sample.perf_snapshot_data >> 10) & 0xFF;
|
||||
ret.snapshot.arb_state_stall = (sample.perf_snapshot_data >> 18) & 0xFF;
|
||||
ret.reserved = 0;
|
||||
const auto& sample_ = *static_cast<const perf_sample_host_trap_v1*>(sample);
|
||||
auto ret = copySampleHeader<rocprofiler_pc_sampling_record_host_trap_v0_t>(sample_);
|
||||
copyChipletId<GFX9>(ret, sample_);
|
||||
copyHwId<GFX9>(ret.hw_id, sample_.hw_id);
|
||||
// copyHwId<GFX9>(&ret, sample);
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline rocprofiler_pc_sampling_record_t
|
||||
copyStochasticSample<GFX11>(const perf_sample_snapshot_v1& sample)
|
||||
inline rocprofiler_pc_sampling_record_stochastic_v0_t
|
||||
copySample<GFX9, rocprofiler_pc_sampling_record_stochastic_v0_t>(const void* sample)
|
||||
{
|
||||
rocprofiler_pc_sampling_record_t ret = copySampleHeader<perf_sample_snapshot_v1>(sample);
|
||||
ret.flags.valid = sample.perf_snapshot_data & (~sample.perf_snapshot_data >> 23) & 0x1;
|
||||
// Check wave_id matches snapshot_wave_id
|
||||
|
||||
ret.flags.has_stall_reason = true;
|
||||
|
||||
ret.wave_issued = sample.perf_snapshot_data >> 1;
|
||||
ret.snapshot.inst_type = sample.perf_snapshot_data >> 2;
|
||||
ret.snapshot.reason_not_issued = (sample.perf_snapshot_data >> 6) & 0x7;
|
||||
ret.snapshot.arb_state_issue = (sample.perf_snapshot_data >> 9) & 0x7F;
|
||||
ret.snapshot.arb_state_stall = (sample.perf_snapshot_data >> 16) & 0x7F;
|
||||
ret.snapshot.dual_issue_valu = false;
|
||||
ret.reserved = 0;
|
||||
const auto& sample_ = *static_cast<const perf_sample_snapshot_v1*>(sample);
|
||||
auto ret = copySampleHeader<rocprofiler_pc_sampling_record_stochastic_v0_t>(sample_);
|
||||
copyChipletId<GFX9>(ret, sample_);
|
||||
copyHwId<GFX9>(ret.hw_id, sample_.hw_id);
|
||||
ret.wave_count = sample_.perf_snapshot_data1 & 0x3F;
|
||||
// TODO: implement logic for manipulating stochastic related fields
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define BITSHIFT(sname) out |= ((in >> GFX::sname) & 1) << PCSAMPLE::sname
|
||||
|
||||
template <typename GFX>
|
||||
inline int
|
||||
translate_arb(int in)
|
||||
/**
|
||||
* @brief Host trap V0 sample for GFX11
|
||||
*/
|
||||
template <>
|
||||
inline rocprofiler_pc_sampling_record_host_trap_v0_t
|
||||
copySample<GFX11, rocprofiler_pc_sampling_record_host_trap_v0_t>(const void* sample)
|
||||
{
|
||||
size_t out = 0;
|
||||
BITSHIFT(ISSUE_VALU);
|
||||
BITSHIFT(ISSUE_MATRIX);
|
||||
BITSHIFT(ISSUE_LDS);
|
||||
BITSHIFT(ISSUE_LDS_DIRECT);
|
||||
BITSHIFT(ISSUE_SCALAR);
|
||||
BITSHIFT(ISSUE_VMEM_TEX);
|
||||
BITSHIFT(ISSUE_FLAT);
|
||||
BITSHIFT(ISSUE_EXP);
|
||||
BITSHIFT(ISSUE_MISC);
|
||||
BITSHIFT(ISSUE_BRMSG);
|
||||
return out & 0x3FF;
|
||||
}
|
||||
|
||||
#undef BITSHIFT
|
||||
|
||||
#define LUTOVERLOAD(sname) this->operator[](GFX::sname) = PCSAMPLE::sname
|
||||
|
||||
template <typename GFX>
|
||||
class GFX_REASON_LUT : public std::array<int, 32>
|
||||
{
|
||||
public:
|
||||
GFX_REASON_LUT()
|
||||
{
|
||||
std::memset(data(), 0, size() * sizeof(int));
|
||||
LUTOVERLOAD(REASON_NOT_AVAILABLE);
|
||||
LUTOVERLOAD(REASON_ALU);
|
||||
LUTOVERLOAD(REASON_WAITCNT);
|
||||
LUTOVERLOAD(REASON_INTERNAL);
|
||||
LUTOVERLOAD(REASON_BARRIER);
|
||||
LUTOVERLOAD(REASON_ARBITER);
|
||||
LUTOVERLOAD(REASON_EX_STALL);
|
||||
LUTOVERLOAD(REASON_OTHER_WAIT);
|
||||
LUTOVERLOAD(REASON_SLEEP);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename GFX>
|
||||
class GFX_INST_LUT : public std::array<int, 32>
|
||||
{
|
||||
public:
|
||||
GFX_INST_LUT()
|
||||
{
|
||||
std::memset(data(), 0, size() * sizeof(int));
|
||||
LUTOVERLOAD(TYPE_VALU);
|
||||
LUTOVERLOAD(TYPE_MATRIX);
|
||||
LUTOVERLOAD(TYPE_SCALAR);
|
||||
LUTOVERLOAD(TYPE_TEX);
|
||||
LUTOVERLOAD(TYPE_LDS);
|
||||
LUTOVERLOAD(TYPE_LDS_DIRECT);
|
||||
LUTOVERLOAD(TYPE_FLAT);
|
||||
LUTOVERLOAD(TYPE_EXP);
|
||||
LUTOVERLOAD(TYPE_MESSAGE);
|
||||
LUTOVERLOAD(TYPE_BARRIER);
|
||||
LUTOVERLOAD(TYPE_BRANCH_NOT_TAKEN);
|
||||
LUTOVERLOAD(TYPE_BRANCH_TAKEN);
|
||||
LUTOVERLOAD(TYPE_JUMP);
|
||||
LUTOVERLOAD(TYPE_OTHER);
|
||||
LUTOVERLOAD(TYPE_NO_INST);
|
||||
LUTOVERLOAD(TYPE_DUAL_VALU);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename GFX>
|
||||
inline int
|
||||
translate_reason(int in)
|
||||
{
|
||||
static GFX_REASON_LUT<GFX> lut;
|
||||
return lut[in & 0x1F];
|
||||
}
|
||||
|
||||
template <typename GFX>
|
||||
inline int
|
||||
translate_inst(int in)
|
||||
{
|
||||
static GFX_INST_LUT<GFX> lut;
|
||||
return lut[in & 0x1F];
|
||||
}
|
||||
|
||||
#undef LUTOVERLOAD
|
||||
|
||||
template <bool HostTrap, typename GFX>
|
||||
inline rocprofiler_pc_sampling_record_t
|
||||
copySample(const void* sample)
|
||||
{
|
||||
if(HostTrap) return copyHostTrapSample(*(const perf_sample_host_trap_v1*) sample);
|
||||
|
||||
rocprofiler_pc_sampling_record_t ret =
|
||||
copyStochasticSample<GFX>(*(const perf_sample_snapshot_v1*) sample);
|
||||
|
||||
ret.snapshot.inst_type = translate_inst<GFX>(ret.snapshot.inst_type);
|
||||
ret.snapshot.arb_state_issue = translate_arb<GFX>(ret.snapshot.arb_state_issue);
|
||||
ret.snapshot.arb_state_stall = translate_arb<GFX>(ret.snapshot.arb_state_stall);
|
||||
ret.snapshot.reason_not_issued = translate_reason<GFX>(ret.snapshot.reason_not_issued);
|
||||
|
||||
const auto& sample_ = *static_cast<const perf_sample_host_trap_v1*>(sample);
|
||||
auto ret = copySampleHeader<rocprofiler_pc_sampling_record_host_trap_v0_t>(sample_);
|
||||
// TODO: decode other fields.
|
||||
return ret;
|
||||
}
|
||||
|
||||
// TODO: implement stochastic for GFX11
|
||||
template <>
|
||||
inline rocprofiler_pc_sampling_record_stochastic_v0_t
|
||||
copySample<GFX11, rocprofiler_pc_sampling_record_stochastic_v0_t>(const void* sample)
|
||||
{
|
||||
const auto& sample_ = *static_cast<const perf_sample_snapshot_v1*>(sample);
|
||||
auto ret = copySampleHeader<rocprofiler_pc_sampling_record_stochastic_v0_t>(sample_);
|
||||
// TODO: decode other fields
|
||||
// TODO: implement logic for manipulating stochastic related fields
|
||||
// ret.wave_count = sample_.perf_snapshot_data1 & 0x3F;
|
||||
return ret;
|
||||
}
|
||||
|
||||
+22
-11
@@ -202,7 +202,8 @@ test_fail_because_of_wrong_agent(const callback_data*
|
||||
pcs_config->method,
|
||||
pcs_config->unit,
|
||||
pcs_config->min_interval,
|
||||
cb_data->client_buffer),
|
||||
cb_data->client_buffer,
|
||||
0),
|
||||
ROCPROFILER_STATUS_ERROR_AGENT_NOT_FOUND);
|
||||
}
|
||||
|
||||
@@ -218,7 +219,8 @@ test_fail_because_of_wrong_context(const callback_data*
|
||||
pcs_config->method,
|
||||
pcs_config->unit,
|
||||
pcs_config->min_interval,
|
||||
cb_data->client_buffer),
|
||||
cb_data->client_buffer,
|
||||
0),
|
||||
ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_FOUND);
|
||||
}
|
||||
|
||||
@@ -234,7 +236,8 @@ test_fail_because_of_wrong_buffer(const callback_data*
|
||||
pcs_config->method,
|
||||
pcs_config->unit,
|
||||
pcs_config->min_interval,
|
||||
not_existing_buffer_id),
|
||||
not_existing_buffer_id,
|
||||
0),
|
||||
ROCPROFILER_STATUS_ERROR_BUFFER_NOT_FOUND);
|
||||
}
|
||||
|
||||
@@ -254,7 +257,8 @@ test_fail_because_of_unsupported_configuration(
|
||||
pcs_config->method,
|
||||
pcs_config->unit,
|
||||
less_than_min_interval,
|
||||
cb_data->client_buffer),
|
||||
cb_data->client_buffer,
|
||||
0),
|
||||
ROCPROFILER_STATUS_SUCCESS);
|
||||
|
||||
EXPECT_NE(rocprofiler_configure_pc_sampling_service(cb_data->client_ctx,
|
||||
@@ -262,7 +266,8 @@ test_fail_because_of_unsupported_configuration(
|
||||
pcs_config->method,
|
||||
pcs_config->unit,
|
||||
greater_than_max_interval,
|
||||
cb_data->client_buffer),
|
||||
cb_data->client_buffer,
|
||||
0),
|
||||
ROCPROFILER_STATUS_SUCCESS);
|
||||
|
||||
EXPECT_NE(rocprofiler_configure_pc_sampling_service(cb_data->client_ctx,
|
||||
@@ -270,7 +275,8 @@ test_fail_because_of_unsupported_configuration(
|
||||
wrong_method,
|
||||
pcs_config->unit,
|
||||
pcs_config->max_interval,
|
||||
cb_data->client_buffer),
|
||||
cb_data->client_buffer,
|
||||
0),
|
||||
ROCPROFILER_STATUS_SUCCESS);
|
||||
|
||||
EXPECT_NE(rocprofiler_configure_pc_sampling_service(cb_data->client_ctx,
|
||||
@@ -278,7 +284,8 @@ test_fail_because_of_unsupported_configuration(
|
||||
pcs_config->method,
|
||||
wrong_unit,
|
||||
pcs_config->max_interval,
|
||||
cb_data->client_buffer),
|
||||
cb_data->client_buffer,
|
||||
0),
|
||||
ROCPROFILER_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
@@ -293,7 +300,8 @@ test_fail_because_service_is_already_configured(
|
||||
pcs_config->method,
|
||||
pcs_config->unit,
|
||||
pcs_config->min_interval,
|
||||
cb_data->client_buffer),
|
||||
cb_data->client_buffer,
|
||||
0),
|
||||
ROCPROFILER_STATUS_ERROR_SERVICE_ALREADY_CONFIGURED);
|
||||
}
|
||||
|
||||
@@ -374,7 +382,8 @@ TEST(pc_sampling, rocprofiler_configure_pc_sampling_service)
|
||||
pcs_config.method,
|
||||
pcs_config.unit,
|
||||
interval,
|
||||
cb_data->client_buffer),
|
||||
cb_data->client_buffer,
|
||||
0),
|
||||
"Failed to configure PC sampling service");
|
||||
|
||||
test_fail_because_service_is_already_configured(cb_data, agent_id, &pcs_config);
|
||||
@@ -385,7 +394,8 @@ TEST(pc_sampling, rocprofiler_configure_pc_sampling_service)
|
||||
pcs_config.method,
|
||||
pcs_config.unit,
|
||||
interval,
|
||||
another_buff),
|
||||
another_buff,
|
||||
0),
|
||||
ROCPROFILER_STATUS_ERROR);
|
||||
}
|
||||
|
||||
@@ -452,6 +462,7 @@ TEST(pc_sampling, rocprofiler_configure_pc_sampling_service)
|
||||
ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP,
|
||||
ROCPROFILER_PC_SAMPLING_UNIT_TIME,
|
||||
1,
|
||||
cb_data.client_buffer),
|
||||
cb_data.client_buffer,
|
||||
0),
|
||||
ROCPROFILER_STATUS_ERROR_CONFIGURATION_LOCKED);
|
||||
}
|
||||
|
||||
+4
-2
@@ -292,7 +292,8 @@ pc_sampling_vs_counter_collection(cc_setup_fn_t cc_setup_fn)
|
||||
pcs_config.method,
|
||||
pcs_config.unit,
|
||||
interval,
|
||||
cb_data->client_buffer),
|
||||
cb_data->client_buffer,
|
||||
0),
|
||||
ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT);
|
||||
}
|
||||
|
||||
@@ -393,7 +394,8 @@ counter_collection_vs_pc_sampling(cc_setup_fn_t cc_setup_fn)
|
||||
pcs_config.method,
|
||||
pcs_config.unit,
|
||||
interval,
|
||||
cb_data->client_buffer),
|
||||
cb_data->client_buffer,
|
||||
0),
|
||||
ROCPROFILER_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
+2
-1
@@ -285,7 +285,8 @@ TEST(pc_sampling, query_configs_after_service_setup)
|
||||
pcs_config.method,
|
||||
pcs_config.unit,
|
||||
interval,
|
||||
cb_data->client_buffer),
|
||||
cb_data->client_buffer,
|
||||
0),
|
||||
"Failed to configure PC sampling service");
|
||||
|
||||
// query configuration and expect to see `pcs_config->max_interval` as the `interval`
|
||||
|
||||
+2
-1
@@ -251,7 +251,8 @@ TEST(pc_sampling, processing_pc_samples)
|
||||
pcs_config.method,
|
||||
pcs_config.unit,
|
||||
interval,
|
||||
cb_data->client_buffer),
|
||||
cb_data->client_buffer,
|
||||
0),
|
||||
"Failed to configure PC sampling service");
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_create_callback_thread(&cb_data->client_thread),
|
||||
|
||||
@@ -266,7 +266,8 @@ configure_pc_sampling_prefer_stochastic(tool_agent_info* agent_info,
|
||||
picked_cfg->method,
|
||||
picked_cfg->unit,
|
||||
interval,
|
||||
buffer_id);
|
||||
buffer_id,
|
||||
0);
|
||||
if(status == ROCPROFILER_STATUS_SUCCESS)
|
||||
{
|
||||
*utils::get_output_stream()
|
||||
@@ -334,24 +335,25 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/,
|
||||
}
|
||||
else if(cur_header->category == ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING)
|
||||
{
|
||||
if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_SAMPLE)
|
||||
if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE)
|
||||
{
|
||||
auto* pc_sample =
|
||||
static_cast<rocprofiler_pc_sampling_record_t*>(cur_header->payload);
|
||||
auto* pc_sample = static_cast<rocprofiler_pc_sampling_record_host_trap_v0_t*>(
|
||||
cur_header->payload);
|
||||
|
||||
ss << "(code_obj_id, offset): (" << pc_sample->pc.loaded_code_object_id
|
||||
<< ", 0x" << std::hex << pc_sample->pc.loaded_code_object_offset << "), "
|
||||
ss << "(code_obj_id, offset): (" << pc_sample->pc.code_object_id << ", 0x"
|
||||
<< std::hex << pc_sample->pc.code_object_offset << "), "
|
||||
<< "timestamp: " << std::dec << pc_sample->timestamp << ", "
|
||||
<< "exec: " << std::hex << std::setw(16) << pc_sample->exec_mask << ", "
|
||||
<< "workgroup_id_(x=" << std::dec << std::setw(5)
|
||||
<< pc_sample->workgroup_id.x << ", "
|
||||
<< "y=" << std::setw(5) << pc_sample->workgroup_id.y << ", "
|
||||
<< "z=" << std::setw(5) << pc_sample->workgroup_id.z << "), "
|
||||
<< "wave_id: " << std::setw(2)
|
||||
<< static_cast<unsigned int>(pc_sample->wave_id) << ", "
|
||||
<< "wave_in_group: " << std::setw(2)
|
||||
<< static_cast<unsigned int>(pc_sample->wave_in_group) << ", "
|
||||
<< "chiplet: " << std::setw(2)
|
||||
<< static_cast<unsigned int>(pc_sample->chiplet) << ", "
|
||||
<< "cu_id: " << pc_sample->hw_id << ", "
|
||||
<< static_cast<unsigned int>(pc_sample->hw_id.chiplet)
|
||||
<< ", "
|
||||
// << "cu_id: " << pc_sample->hw_id << ", "
|
||||
<< "correlation: {internal=" << std::setw(7)
|
||||
<< pc_sample->correlation_id.internal << ", "
|
||||
<< "external=" << std::setw(5) << pc_sample->correlation_id.external.value
|
||||
@@ -373,8 +375,8 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/,
|
||||
assert(corr_id.external.value > 0);
|
||||
|
||||
// Decoding the PC
|
||||
auto inst = translator.get(pc_sample->pc.loaded_code_object_id,
|
||||
pc_sample->pc.loaded_code_object_offset);
|
||||
auto inst = translator.get(pc_sample->pc.code_object_id,
|
||||
pc_sample->pc.code_object_offset);
|
||||
flat_profile.add_sample(std::move(inst), pc_sample->exec_mask);
|
||||
}
|
||||
else
|
||||
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur