diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler/CMakeLists.txt index 6d7626a772..24e2caf220 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/CMakeLists.txt @@ -37,6 +37,7 @@ add_subdirectory(hsa) add_subdirectory(context) add_subdirectory(counters) add_subdirectory(aql) +add_subdirectory(pc_sampling) target_link_libraries( rocprofiler-object-library diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/CMakeLists.txt new file mode 100644 index 0000000000..3bacb12a40 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(parser) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/CMakeLists.txt new file mode 100644 index 0000000000..26e44d9558 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/CMakeLists.txt @@ -0,0 +1,13 @@ +set(ROCPROFILER_LIB_PC_SAMPLING_PARSER_SOURCES pc_record_interface.cpp correlation.cpp + translation.cpp) +set(ROCPROFILER_LIB_PC_SAMPLING_PARSER_HEADERS + correlation.hpp gfx9.hpp gfx11.hpp gfx_unknown.hpp parser_types.hpp pc_record_interface.hpp + rocr.hpp translation.hpp) + +target_sources( + rocprofiler-object-library PRIVATE ${ROCPROFILER_LIB_PC_SAMPLING_PARSER_SOURCES} + ${ROCPROFILER_LIB_PC_SAMPLING_PARSER_HEADERS}) + +if(ROCPROFILER_BUILD_TESTS) + add_subdirectory(tests) +endif() diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/correlation.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/correlation.cpp new file mode 100644 index 0000000000..81c0ebfe93 --- /dev/null +++ 
b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/correlation.cpp @@ -0,0 +1,142 @@ +/* + Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include "lib/rocprofiler/pc_sampling/parser/correlation.hpp" + +template <> +struct std::hash +{ + size_t operator()(const device_handle& d) const { return d.handle; } +}; +bool +operator==(device_handle a, device_handle b) +{ + return a.handle == b.handle; +} + +namespace Parser +{ +bool +operator==(const DispatchPkt& a, const DispatchPkt& b) +{ + return a.correlation_id_in == b.correlation_id_in && a.dev == b.dev; +} +} // namespace Parser + +namespace Parser +{ +/** + * Coordinates DispatchMap and DoorBellMap to reconstruct the original correlation_id + * from the correlation_id seen by the trap handler. + */ + +/** + * Checks wether a dispatch pkt will generate a collision. + * Returns true on collision and false when slot is available. 
+ */
+bool
+CorrelationMap::checkDispatch(const dispatch_pkt_id_t& pkt) const
+{
+    uint64_t trap = wrap_correlation_id(pkt.doorbell_id, pkt.write_index, pkt.queue_size);
+    return dispatch_to_correlation.find({trap, pkt.device}) != dispatch_to_correlation.end();
+}
+
+/**
+ * Updates the mapping of dispatch_id to correlation_id.
+ * Also invalidates the one-entry lookup cache used by get().
+ */
+void
+CorrelationMap::newDispatch(const dispatch_pkt_id_t& pkt)
+{
+    cache_dev_id     = ~0ul;
+    uint64_t trap_id = wrap_correlation_id(pkt.doorbell_id, pkt.write_index, pkt.queue_size);
+    dispatch_to_correlation[{trap_id, pkt.device}] = pkt.correlation_id;
+}
+
+/**
+ * Removes the mapping registered for pkt (if any) and invalidates the lookup cache.
+ */
+void
+CorrelationMap::forget(const dispatch_pkt_id_t& pkt)
+{
+    cache_dev_id     = ~0ul;
+    uint64_t trap_id = wrap_correlation_id(pkt.doorbell_id, pkt.write_index, pkt.queue_size);
+    dispatch_to_correlation.erase({trap_id, pkt.device});
+}
+
+/**
+ * Given a device dev and the wrapped correlation id (doorbell and wrapped dispatch_id),
+ * returns the correlation_id set by dispatch_pkt_id_t.
+ * The last successful lookup is cached. When the pair is unknown the exception thrown by
+ * unordered_map::at() propagates to the caller and the cache is left unchanged.
+ */
+uint64_t
+CorrelationMap::get(device_handle dev, uint64_t correlation_in)
+{
+#ifndef _PARSER_CORRELATION_DISABLE_CACHE
+    if(dev.handle == cache_dev_id && correlation_in == cache_correlation_id_in)
+        return cache_correlation_id_out;
+#endif
+    // Look up before touching the cache keys: if at() throws, the cache must keep describing
+    // its previous entry instead of pairing the new key with a stale output value.
+    cache_correlation_id_out = dispatch_to_correlation.at({correlation_in, dev});
+    cache_dev_id             = dev.handle;
+    cache_correlation_id_in  = correlation_in;
+    return cache_correlation_id_out;
+}
+
+/**
+ * Builds the wrapped id used as the map key: the queue slot (write_idx % queue_size, masked
+ * to 25 bits) in the low bits and the doorbell shifted into bits 32 and above.
+ */
+uint64_t
+CorrelationMap::wrap_correlation_id(uint64_t doorbell, uint64_t write_idx, uint64_t queue_size)
+{
+    static constexpr uint64_t WRITE_WRAP = (1 << 25) - 1;
+    // A modulo by zero is undefined behavior; with an empty queue use the write index as-is.
+    const uint64_t slot = (queue_size != 0) ? (write_idx % queue_size) : write_idx;
+    return (slot & WRITE_WRAP) | (uint64_t(doorbell) << 32);
+}
+
+} // namespace Parser
+
+/**
+ * @brief Parses a given set of pc samples.
+ * @param[in] buffer Pointer to a buffer containing metadata and pcsamples.
+ * @param[in] buffer_size The number of elements in the buffer.
+ * @param[in] gfxip_major GFXIP major version of the samples.
+ * @param[in] callback A callback function that accepts a double pointer to write the samples to, + * a size requested parameter (number of pc_sample_t) and a void* to userdata. + * The callback is expected to allocate 64B-aligned memory where the parsed samples are going to + * be written to, and return the size of memory that was allocated, in multiples of + * sizeof(generic_sample_t). If the callback returns 0 or a larger size than requested, + * parse_buffer() will return PCSAMPLE_STATUS_CALLBACK_ERROR. If the callback returns + * a size smaller than requested, then it may be called again requesting more memory. + * @param[in] userdata parameter forwarded to the user callback. + */ +pcsample_status_t +parse_buffer(generic_sample_t* buffer, + uint64_t buffer_size, + int gfxip_major, + user_callback_t callback, + void* userdata) +{ + static auto corr_map = std::make_unique(); + + auto parseSample_func = _parse_buffer; + if(gfxip_major == 9) + parseSample_func = _parse_buffer; + else if(gfxip_major == 11) + parseSample_func = _parse_buffer; + else if(gfxip_major == 0) + parseSample_func = _parse_buffer; + else + return PCSAMPLE_STATUS_INVALID_GFXIP; + + return parseSample_func(buffer, buffer_size, callback, userdata, corr_map.get()); +}; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/correlation.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/correlation.hpp new file mode 100644 index 0000000000..4a50fa0329 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/correlation.hpp @@ -0,0 +1,230 @@ +/* + Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include +#include +#include +#include +#include + +#include "lib/rocprofiler/pc_sampling/parser/translation.hpp" + +#if 0 +template <> +struct std::hash +{ + size_t operator()(const device_handle& d) const { return d.handle; } +}; +bool +operator==(device_handle a, device_handle b) +{ + return a.handle == b.handle; +} +#endif +namespace Parser +{ +/* +struct DispatchPkt +{ + uint64_t write_id; //! The location where this dispatch is written to + uint64_t doorbell_id; //! The doorbell non-unique ID + device_handle dev; //! Which device this is run +}; */ +struct DispatchPkt +{ + uint64_t correlation_id_in; //! Correlation ID seen by the trap handler + device_handle dev; //! 
Which device this is run +}; +#if 0 +bool +operator==(const DispatchPkt& a, const DispatchPkt& b) +{ + return a.correlation_id_in == b.correlation_id_in && a.dev == b.dev; +} +#endif +} // namespace Parser + +template <> +struct std::hash +{ + size_t operator()(const Parser::DispatchPkt& d) const + { + return (d.correlation_id_in << 8) ^ d.dev.handle; + } +}; + +namespace Parser +{ +/** + * Coordinates DispatchMap and DoorBellMap to reconstruct the original correlation_id + * from the correlation_id seen by the trap handler. + */ +class CorrelationMap +{ +public: + CorrelationMap() = default; + + /** + * Checks wether a dispatch pkt will generate a collision. + * Returns true on collision and false when slot is available. + */ + bool checkDispatch(const dispatch_pkt_id_t& pkt) const; + + /** + * Updates the mapping of dispatch_id to correlation_id + */ + void newDispatch(const dispatch_pkt_id_t& pkt); + + void forget(const dispatch_pkt_id_t& pkt); + + /** + * Given a device dev, doorbell and and wrapped dispatch_id, returns the + * correlation_id set by dispatch_pkt_id_t + */ + uint64_t get(device_handle dev, uint64_t correlation_in); + + static uint64_t wrap_correlation_id(uint64_t doorbell, uint64_t write_idx, uint64_t queue_size); + +private: + std::unordered_map dispatch_to_correlation{}; + + // Making get() const and these cache variables mutable causes performance to be unstable + uint64_t cache_correlation_id_in = ~0ul; // Invalid value in cache + uint64_t cache_correlation_id_out = ~0ul; + uint64_t cache_dev_id = ~0ul; // Invalid device Id in cache +}; +} // namespace Parser + +template +inline pcsample_status_t +add_upcoming_samples(const device_handle device, + const generic_sample_t* buffer, + const size_t available_samples, + Parser::CorrelationMap* corr_map, + pcsample_v1_t* samples) +{ + pcsample_status_t status = PCSAMPLE_STATUS_SUCCESS; + for(uint64_t p = 0; p < available_samples; p++) + { + const auto* snap = reinterpret_cast(buffer + p); + 
samples[p] = copySample((const void*) (buffer + p));
+        try
+        {
+            samples[p].correlation_id = corr_map->get(device, snap->correlation_id);
+        } catch(const std::exception&)
+        {
+            // Unknown correlation id: keep the copied sample and report a non-fatal error.
+            status = PCSAMPLE_STATUS_PARSER_ERROR;
+        }
+    }
+    return status;
+}
+
+/**
+ * Walks `buffer` packet by packet.
+ * AMD_DISPATCH_PKT_ID packets are registered in `corr_map`. AMD_UPCOMING_SAMPLES packets
+ * announce `num_samples` samples that follow them; those are copied into memory obtained
+ * from `callback`, which may hand out fewer samples than requested and is then called again.
+ * Returns PCSAMPLE_STATUS_OUT_OF_BOUNDS_ERROR, PCSAMPLE_STATUS_CALLBACK_ERROR or
+ * PCSAMPLE_STATUS_INVALID_SAMPLE as soon as one of those conditions is met; otherwise the
+ * accumulated status.
+ */
+template
+pcsample_status_t
+_parse_buffer(generic_sample_t* buffer,
+              uint64_t          buffer_size,
+              user_callback_t   callback,
+              void*             userdata,
+              Parser::CorrelationMap* corr_map)
+{
+    // Position of the packet currently being decoded
+    uint64_t index = 0;
+
+    pcsample_status_t status = PCSAMPLE_STATUS_SUCCESS;
+
+    while(index < buffer_size)
+    {
+        switch(buffer[index].type)
+        {
+            case AMD_DISPATCH_PKT_ID:
+            {
+                const auto& pkt = *reinterpret_cast(buffer + index);
+                if(pkt.queue_size == 0 || pkt.queue_size >= (1 << 25))
+                    status = PCSAMPLE_STATUS_PARSER_ERROR;
+                index += 1;
+                // newDispatch() wraps write_index modulo queue_size, so a zero-sized queue
+                // must not be registered.
+                if(pkt.queue_size != 0) corr_map->newDispatch(pkt);
+                break;
+            }
+            case AMD_UPCOMING_SAMPLES:
+            {
+                const auto& pkt = *reinterpret_cast(buffer + index);
+                index += 1;
+
+                uint64_t pkt_counter = pkt.num_samples;
+                // index <= buffer_size holds here, so the subtraction cannot wrap, whereas
+                // index + pkt_counter could for a very large num_samples.
+                if(pkt_counter > buffer_size - index) return PCSAMPLE_STATUS_OUT_OF_BOUNDS_ERROR;
+
+                bool bIsHostTrap = pkt.which_sample_type == AMD_HOST_TRAP_V1;
+
+                while(pkt_counter > 0)
+                {
+                    pcsample_v1_t* samples = nullptr;
+                    uint64_t available_samples = callback(&samples, pkt_counter, userdata);
+
+                    if(available_samples == 0 || available_samples > pkt_counter)
+                        return PCSAMPLE_STATUS_CALLBACK_ERROR;
+
+                    if(bIsHostTrap)
+                    {
+                        status |= add_upcoming_samples(
+                            pkt.device, buffer + index, available_samples, corr_map, samples);
+                    }
+                    else
+                    {
+                        status |= add_upcoming_samples(
+                            pkt.device, buffer + index, available_samples, corr_map, samples);
+                    }
+
+                    index += available_samples;
+                    pkt_counter -= available_samples;
+                }
+                break;
+            }
+            default:
+                std::cerr << "Index " << index << " - Invalid sample type: " << buffer[index].type
+                          << std::endl;
+                return PCSAMPLE_STATUS_INVALID_SAMPLE;
+        }
+    }
+    return status;
+}
+
+/**
+ * @brief Parses a given set of pc samples.
+ * @param[in] buffer Pointer to a buffer containing metadata and pcsamples.
+ * @param[in] buffer_size The number of elements in the buffer. + * @param[in] gfxip_major GFXIP major version of the samples. + * @param[in] callback A callback function that accepts a double pointer to write the samples to, + * a size requested parameter (number of pc_sample_t) and a void* to userdata. + * The callback is expected to allocate 64B-aligned memory where the parsed samples are going to + * be written to, and return the size of memory that was allocated, in multiples of + * sizeof(generic_sample_t). If the callback returns 0 or a larger size than requested, + * parse_buffer() will return PCSAMPLE_STATUS_CALLBACK_ERROR. If the callback returns + * a size smaller than requested, then it may be called again requesting more memory. + * @param[in] userdata parameter forwarded to the user callback. + */ +pcsample_status_t +parse_buffer(generic_sample_t* buffer, + uint64_t buffer_size, + int gfxip_major, + user_callback_t callback, + void* userdata); \ No newline at end of file diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/gfx11.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/gfx11.hpp new file mode 100644 index 0000000000..519de7c982 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/gfx11.hpp @@ -0,0 +1,74 @@ +/* + Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +class GFX11 +{ +public: + enum inst_type_issued + { + TYPE_VALU = 0, + TYPE_SCALAR, + TYPE_TEX, + TYPE_LDS, + TYPE_LDS_DIRECT, + TYPE_EXP, + TYPE_MESSAGE, + TYPE_BARRIER, + TYPE_BRANCH_NOT_TAKEN, + TYPE_BRANCH_TAKEN, + TYPE_JUMP, + TYPE_OTHER, + TYPE_NO_INST, + TYPE_DUAL_VALU = 31, + TYPE_MATRIX = 31, + TYPE_FLAT = 31, + }; + + enum reason_not_issued + { + REASON_NOT_AVAILABLE = 0, + REASON_ALU, + REASON_WAITCNT, + REASON_ARBITER, + REASON_SLEEP, + REASON_BARRIER, + REASON_OTHER_WAIT, + REASON_INTERNAL = 31, + REASON_EX_STALL = 31, + }; + + enum arb_state + { + ISSUE_MISC = 0, + ISSUE_EXP, + ISSUE_LDS_DIRECT, + ISSUE_LDS, + ISSUE_VMEM_TEX, + ISSUE_SCALAR, + ISSUE_VALU, + ISSUE_MATRIX = 31, + ISSUE_FLAT = 31, + ISSUE_BRMSG = 31, + }; +}; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/gfx9.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/gfx9.hpp new file mode 100644 index 0000000000..99c03140c8 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/gfx9.hpp @@ -0,0 +1,77 @@ +/* + Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +#pragma once + +class GFX9 +{ +public: + enum inst_type_issued + { + TYPE_VALU = 0, + TYPE_MATRIX, + TYPE_SCALAR, + TYPE_TEX, + TYPE_LDS, + TYPE_FLAT, + TYPE_EXP, + TYPE_MESSAGE, + TYPE_BARRIER, + TYPE_BRANCH_NOT_TAKEN, + TYPE_BRANCH_TAKEN, + TYPE_JUMP, + TYPE_OTHER, + TYPE_NO_INST, + TYPE_LAST, + TYPE_DUAL_VALU = 31, + TYPE_LDS_DIRECT = 31 + }; + + enum reason_not_issued + { + REASON_NOT_AVAILABLE = 0, + REASON_ALU, + REASON_WAITCNT, + REASON_INTERNAL, + REASON_BARRIER, + REASON_ARBITER, + REASON_EX_STALL, + REASON_OTHER_WAIT, + REASON_LAST, + REASON_SLEEP = 31 + }; + + enum arb_state + { + ISSUE_VALU = 0, + ISSUE_MATRIX, + ISSUE_SCALAR, + ISSUE_VMEM_TEX, + ISSUE_LDS, + ISSUE_FLAT, + ISSUE_EXP, + ISSUE_MISC, + ISSUE_LAST, + ISSUE_LDS_DIRECT = 31, + ISSUE_BRMSG = 31, + }; +}; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/parser_types.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/parser_types.hpp new file mode 100644 index 0000000000..a885d3b702 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/parser_types.hpp @@ -0,0 +1,188 @@ +/* + Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include +#include +#include +#include +#include + +/** + * ######## Parser Definitions ######## + */ +namespace PCSAMPLE +{ +enum pcsample_inst_type_issued +{ + TYPE_VALU = 0, + TYPE_MATRIX, + TYPE_SCALAR, + TYPE_TEX, + TYPE_LDS, + TYPE_LDS_DIRECT, + TYPE_FLAT, + TYPE_EXP, + TYPE_MESSAGE, + TYPE_BARRIER, + TYPE_BRANCH_NOT_TAKEN, + TYPE_BRANCH_TAKEN, + TYPE_JUMP, + TYPE_OTHER, + TYPE_NO_INST, + TYPE_DUAL_VALU, + TYPE_LAST +}; + +enum pcsample_reason_not_issued +{ + REASON_NOT_AVAILABLE = 0, + REASON_ALU, + REASON_WAITCNT, + REASON_INTERNAL, + REASON_BARRIER, + REASON_ARBITER, + REASON_EX_STALL, + REASON_OTHER_WAIT, + REASON_SLEEP, + REASON_LAST +}; + +enum pcsample_arb_issue_state +{ + ISSUE_VALU = 0, + ISSUE_MATRIX, + ISSUE_LDS, + ISSUE_LDS_DIRECT, + ISSUE_SCALAR, + ISSUE_VMEM_TEX, + ISSUE_FLAT, + ISSUE_EXP, + ISSUE_MISC, + ISSUE_BRMSG, + ISSUE_LAST +}; +}; // namespace PCSAMPLE + +typedef struct +{ + uint8_t valid : 1; + uint8_t type : 4; // 0=reserved, 1=hosttrap, 2=stochastic, 3=perfcounter, >=4 possible v2? 
+ uint8_t has_stall_reason : 1; + uint8_t has_wave_cnt : 1; + uint8_t has_memory_counter : 1; +} pcsample_header_v1_t; + +typedef struct +{ + uint32_t dual_issue_valu : 1; + uint32_t inst_type : 4; + + uint32_t reason_not_issued : 7; + uint32_t arb_state_issue : 10; + uint32_t arb_state_stall : 10; +} pcsample_snapshot_v1_t; + +typedef union +{ + struct + { + uint32_t load_cnt : 6; + uint32_t store_cnt : 6; + uint32_t bvh_cnt : 3; + uint32_t sample_cnt : 6; + uint32_t ds_cnt : 6; + uint32_t km_cnt : 5; + }; + uint32_t raw; +} pcsample_memorycounters_v1_t; + +typedef struct +{ + pcsample_header_v1_t flags; + uint8_t chiplet; + uint8_t wave_id; + uint8_t wave_issued : 1; + uint8_t reserved : 7; + uint32_t hw_id; + + uint64_t pc; + uint64_t exec_mask; + uint32_t workgroud_id_x; + uint32_t workgroud_id_y; + uint32_t workgroud_id_z; + + uint32_t wave_count; + uint64_t timestamp; + uint64_t correlation_id; + + pcsample_snapshot_v1_t snapshot; + + pcsample_memorycounters_v1_t memory_counters; +} pcsample_v1_t; + +typedef uint64_t (*user_callback_t)(pcsample_v1_t**, uint64_t, void*); + +/** + * The types of errors to be returned by parse_buffer. + */ +enum PCSAMPLE_STATUS +{ + /** + * No error + */ + PCSAMPLE_STATUS_SUCCESS = 0, + /** + * Input is valid, but the parser detected it was unable to unwrap some correlation_id(s). + * The returned data is valid except for possible incorrect correlation_ids. + * Error is nonfatal and parsing will continue. + */ + PCSAMPLE_STATUS_PARSER_ERROR, + /** + * Unknown/generic error + */ + PCSAMPLE_STATUS_GENERIC_ERROR, + /** + * The parser has seen a invalid sample type + */ + PCSAMPLE_STATUS_INVALID_SAMPLE, + /** + * The user callback has returned 0 or a memory size larger than requested + */ + PCSAMPLE_STATUS_CALLBACK_ERROR, + /** + * Upcoming_samples_t has suggested there are more incoming samples than + * the parser can read without going out of bounds (buffer_size). 
+ */ + PCSAMPLE_STATUS_OUT_OF_BOUNDS_ERROR, + /** + * Invalid GFXIP string was passed to the parser. + */ + PCSAMPLE_STATUS_INVALID_GFXIP, + /** + * Last error type + */ + PCSAMPLE_STATUS_LAST +}; + +typedef int pcsample_status_t; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/pc_record_interface.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/pc_record_interface.cpp new file mode 100644 index 0000000000..42335a5f7a --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/pc_record_interface.cpp @@ -0,0 +1,79 @@ +#include "lib/rocprofiler/pc_sampling/parser/pc_record_interface.hpp" + +uint64_t +PCSamplingParserContext::alloc(pcsample_v1_t** buffer, uint64_t size) +{ + std::unique_lock lock(mut); + assert(buffer != nullptr); + data.emplace_back(std::make_unique(size)); + *buffer = data.back()->samples.data(); + return size; +} + +pcsample_status_t +PCSamplingParserContext::parse(const upcoming_samples_t& upcoming, + const generic_sample_t* data_, + int gfxip_major, + std::condition_variable& midway_signal, + bool bRocrBufferFlip) +{ + // Template instantiation is faster! 
+    auto parseSample_func = &PCSamplingParserContext::_parse;
+    if(gfxip_major == 11)
+        parseSample_func = &PCSamplingParserContext::_parse;
+    else if(gfxip_major == 0)
+        parseSample_func = &PCSamplingParserContext::_parse;
+    else if(gfxip_major != 9)
+        return PCSAMPLE_STATUS_INVALID_GFXIP;
+
+    auto status = (this->*parseSample_func)(upcoming, data_);
+    midway_signal.notify_all();
+
+    if(!bRocrBufferFlip || status != PCSAMPLE_STATUS_SUCCESS) return status;
+
+    return flushForgetList();
+}
+
+// Registers the dispatch in the correlation map and remembers it as active.
+void
+PCSamplingParserContext::newDispatch(const dispatch_pkt_id_t& pkt)
+{
+    std::unique_lock lock(mut);
+    corr_map->newDispatch(pkt);
+    active_dispatches[pkt.correlation_id] = pkt;
+}
+
+// Queues the correlation id; the dispatch is only forgotten by flushForgetList().
+void
+PCSamplingParserContext::completeDispatch(uint64_t correlation_id)
+{
+    std::unique_lock lock(mut);
+    forget_list.emplace(correlation_id);
+}
+
+// Forgets every queued dispatch that is still active and empties the queue.
+// Returns PCSAMPLE_STATUS_PARSER_ERROR if a queued id was not an active dispatch.
+pcsample_status_t
+PCSamplingParserContext::flushForgetList()
+{
+    std::unique_lock lock(mut);
+    pcsample_status_t status = PCSAMPLE_STATUS_SUCCESS;
+
+    for(uint64_t id : forget_list)
+    {
+        // One lookup serves the existence check, the access and the erase.
+        const auto it = active_dispatches.find(id);
+        if(it == active_dispatches.end())
+        {
+            status = PCSAMPLE_STATUS_PARSER_ERROR;
+            continue;
+        }
+        generate_id_completion_record(it->second);
+        corr_map->forget(it->second);
+        active_dispatches.erase(it);
+    }
+    forget_list.clear();
+    return status;
+}
+
+bool
+PCSamplingParserContext::shouldFlipRocrBuffer(const dispatch_pkt_id_t& pkt) const
+{
+    std::shared_lock lock(mut);
+    return corr_map->checkDispatch(pkt);
+}
diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/pc_record_interface.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/pc_record_interface.hpp
new file mode 100644
index 0000000000..b0efe446ee
--- /dev/null
+++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/pc_record_interface.hpp
@@ -0,0 +1,135 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include 
"lib/rocprofiler/pc_sampling/parser/correlation.hpp" +#include "lib/rocprofiler/pc_sampling/parser/parser_types.hpp" + +struct PCSamplingData +{ + PCSamplingData(size_t size) + : samples(size){}; + PCSamplingData& operator=(PCSamplingData&) = delete; + + std::vector samples; +}; + +class PCSamplingParserContext +{ +public: + PCSamplingParserContext() + : corr_map(std::make_unique()){}; + /** + * @brief Allocates some memory. TODO: Translate to Jonathan's buffer implementation. + * @param[out] buffer Pointer where samples are to be written to. + * @param[in] size Number of samples requested. + * @returns Number of samples actually allocated on *buffer. + */ + uint64_t alloc(pcsample_v1_t** buffer, uint64_t size); + + /** + * @brief Parses a chunk of samples. + * Call only finishes when all pc sampling records have been generated on the user buffer. + * As an intermediate step, "midway_signal" signals when it's safe to reuse/delete "data". + * @param[in] upcoming Metadata of upcoming samples + * @param[in] data Pointer containing the raw hardware samples. Must match upcoming.num_samples. + * @param[in] gfxip_major GFXIP of these samples (GFX9==9/GFX11==11/gfx_unknown==12). + * @param[in] midway_signal notifies_all when the samples have been processed. + * @param[in] bFlushCorrelationIds Set to true if this is the last batch from a ROCr buffer. + * @returns PCSAMPLE_STATUS_SUCCESS on success. + * @returns PCSAMPLE_STATUS_PARSER_ERROR (non-fatal) if one or more samples has invalid + * correlation ID(s). + * @returns PCSAMPLE_STATUS_INVALID_GFXIP (fatal) on GFXIP != 9,11,12. + * @returns PCSAMPLE_STATUS_CALLBACK_ERROR (fatal) if memory allocation fails. + */ + pcsample_status_t parse(const upcoming_samples_t& upcoming, + const generic_sample_t* data, + int gfxip_major, + std::condition_variable& midway_signal, + bool bFlushCorrelationIds); + + /** + * @brief Signals a dispatch completion. + * @param[in] correlation_id Correlation ID of the completed dispatch. 
+     */
+    void completeDispatch(uint64_t correlation_id);
+    /**
+     * @brief Signals a new dispatch was started.
+     * Please use shouldFlipRocrBuffer() to check if the buffer must be flipped before forwarding
+     * the dispatch.
+     * @param[in] pkt Struct containing the dispatch packet data.
+     */
+    void newDispatch(const dispatch_pkt_id_t& pkt);
+    /**
+     * @brief Checks if a dispatch packet will generate a collision with doorbell_id and
+     * dispatch_index.
+     * @param[in] pkt Struct containing the dispatch packet data.
+     * @returns boolean
+     */
+    bool shouldFlipRocrBuffer(const dispatch_pkt_id_t& pkt) const;
+
+protected:
+    /**
+     * @brief Parses the given input data and generates pc sampling records.
+     * Calls generate_upcoming_pc_record().
+     * Memory comes from alloc(); `mut` is held in shared mode only while the correlation map
+     * is read, never across the alloc() call.
+     */
+    template
+    pcsample_status_t _parse(const upcoming_samples_t& upcoming, const generic_sample_t* data_)
+    {
+        pcsample_status_t status      = PCSAMPLE_STATUS_SUCCESS;
+        uint64_t          pkt_counter = upcoming.num_samples;
+        auto              dev         = upcoming.device;
+        bool              bIsHostTrap = upcoming.which_sample_type == AMD_HOST_TRAP_V1;
+
+        while(pkt_counter > 0)
+        {
+            // alloc() locks `mut` exclusively, so it has to run before this thread takes the
+            // shared lock below: locking a std::shared_mutex the thread already owns is
+            // undefined behavior (in practice a self-deadlock).
+            pcsample_v1_t* samples = nullptr;
+            uint64_t       memsize = alloc(&samples, pkt_counter);
+
+            if(memsize == 0 || memsize > pkt_counter) return PCSAMPLE_STATUS_CALLBACK_ERROR;
+
+            // Released at the end of each iteration, i.e. before the next alloc().
+            std::shared_lock lock(mut);
+
+            auto* map = corr_map.get();
+            if(bIsHostTrap)
+                status |= add_upcoming_samples(dev, data_, memsize, map, samples);
+            else
+                status |= add_upcoming_samples(dev, data_, memsize, map, samples);
+
+            data_ += memsize;
+            pkt_counter -= memsize;
+            generate_upcoming_pc_record(samples, memsize);
+        }
+
+        return status;
+    }
+
+    /**
+     * @brief Causes forget_corr_id records to be generated from forget_list. Clears forget_list.
+ * Calls generate_id_completion_record() + */ + pcsample_status_t flushForgetList(); + static void generate_id_completion_record(const dispatch_pkt_id_t& pkt) { (void) pkt; }; + static void generate_upcoming_pc_record(const pcsample_v1_t* samples, size_t num_samples) + { + (void) samples; + (void) num_samples; + }; + + //! Maps doorbells and dispatch_index to correlation_id + std::unique_ptr corr_map; + //! Data allocated to store samples. Temporary. + std::vector> data; + //! Dispatches not yet completed. + std::unordered_map active_dispatches; + //! List of correlation ids whose dispatches have been completed and can be forgotten after the + //! buffer flip. + std::unordered_set forget_list; + + mutable std::shared_mutex mut; +}; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/rocr.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/rocr.hpp new file mode 100644 index 0000000000..e1e348a381 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/rocr.hpp @@ -0,0 +1,123 @@ +/* + Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include + +/** + * ######## ROCR Definitions ######## + * Some data types have been modified for better type safety. + */ + +enum packet_header_t +{ + AMD_GENERIC_SAMPLE = 0, + AMD_DOORBELL_TO_QUEUE_MAP = 3, + AMD_DISPATCH_PKT_WRAP, + AMD_UPCOMING_SAMPLES, + AMD_DISPATCH_PKT_ID, +}; + +enum upcoming_sample_t +{ + AMD_HOST_TRAP_V1 = 1, + AMD_SNAPSHOT_V1 = 2 +}; + +typedef uint32_t sample_enum; +typedef struct +{ + uint32_t handle; +} device_handle; +typedef uint32_t upcoming_sample_enum; +typedef struct +{ + uint32_t _; +} reserved_type; + +typedef struct +{ + sample_enum type; + reserved_type _[15]; +} generic_sample_t; + +typedef struct +{ + sample_enum type; + device_handle device; + uint32_t doorbell_id; + uint64_t queue_size; + uint64_t write_index; + uint64_t read_index; + uint64_t correlation_id; + reserved_type _[4]; +} dispatch_pkt_id_t; + +typedef struct +{ + sample_enum type; + device_handle device; + upcoming_sample_enum which_sample_type; + reserved_type reserved0; + uint64_t num_samples; + reserved_type _[10]; +} upcoming_samples_t; + +typedef struct +{ + uint64_t pc; + uint64_t exec_mask; + uint32_t workgroud_id_x; + uint32_t workgroud_id_y; + uint32_t workgroud_id_z; + uint32_t chiplet_and_wave_id; + uint32_t hw_id; + reserved_type reserved[3]; + uint64_t timestamp; + uint64_t correlation_id; +} perf_sample_host_trap_v1; + +typedef struct +{ + uint64_t pc; + uint64_t exec_mask; + uint32_t workgroud_id_x; + uint32_t workgroud_id_y; + uint32_t workgroud_id_z; + uint32_t chiplet_and_wave_id; + uint32_t hw_id; + uint32_t perf_snapshot_data; + uint32_t perf_snapshot_data1; + uint32_t perf_snapshot_data2; + uint64_t timestamp; + uint64_t correlation_id; +} 
perf_sample_snapshot_v1; + +typedef union +{ + generic_sample_t generic; + perf_sample_snapshot_v1 snap; + perf_sample_host_trap_v1 host; + upcoming_samples_t upcoming; + dispatch_pkt_id_t dispatch_id; +} packet_union_t; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/tests/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/tests/CMakeLists.txt new file mode 100644 index 0000000000..af71822253 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/tests/CMakeLists.txt @@ -0,0 +1,25 @@ +rocprofiler_deactivate_clang_tidy() + +include(GoogleTest) + +set(ROCPROFILER_LIB_PC_SAMPLING_PARSER_TEST_SOURCES pcs_parser.cpp) +set(ROCPROFILER_LIB_PC_SAMPLING_PARSER_TEST_HEADERS mocks.hpp) + +add_executable(pcs-parser-test) + +target_sources(pcs-parser-test PRIVATE ${ROCPROFILER_LIB_PC_SAMPLING_PARSER_TEST_SOURCES} + ${ROCPROFILER_LIB_PC_SAMPLING_PARSER_TEST_HEADERS}) +# $) + +target_link_libraries( + pcs-parser-test + PRIVATE rocprofiler::rocprofiler-common-library + rocprofiler::rocprofiler-static-library GTest::gtest GTest::gtest_main) + +gtest_add_tests( + TARGET pcs-parser-test + SOURCES ${ROCPROFILER_LIB_PC_SAMPLING_PARSER_TEST_SOURCES} + TEST_LIST pcs-parser-tests_TESTS + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + +set_tests_properties(${pcs-parser-tests_TESTS} PROPERTIES TIMEOUT 45 LABELS "unittests") diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/tests/mocks.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/tests/mocks.hpp new file mode 100644 index 0000000000..523f7efaae --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/tests/mocks.hpp @@ -0,0 +1,268 @@ +/* + Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lib/rocprofiler/pc_sampling/parser/correlation.hpp" + +#define CHECK_PARSER(x) \ + { \ + int val = (x); \ + if(val != PCSAMPLE_STATUS_SUCCESS) \ + { \ + std::cerr << __FILE__ << ':' << __LINE__ << " Parser error: " << val << std::endl; \ + exit(1); \ + } \ + } + +/** + * Mimics the rocprofiler buffer sent to the parser. + */ +class MockRuntimeBuffer +{ +public: + MockRuntimeBuffer() { packets = {}; }; + + //! Adds a packet to the buffer + void submit(const packet_union_t& packet) { packets.push_back(packet); }; + + //! 
Submits a "upcoming_samples_t" packet signaling the next num_samples packets are PC samples + void genUpcomingSamples(int num_samples) + { + packet_union_t uni; + ::memset(&uni, 0, sizeof(uni)); + uni.upcoming.type = AMD_UPCOMING_SAMPLES; + uni.upcoming.which_sample_type = AMD_SNAPSHOT_V1; + uni.upcoming.num_samples = num_samples; + submit(uni); + } + + std::vector> get_parsed_buffer(int GFXIP_MAJOR) + { + parsed_data = {}; + + CHECK_PARSER(parse_buffer((generic_sample_t*) packets.data(), + packets.size(), + GFXIP_MAJOR, + &alloc_parse_memory, + this)); + + return parsed_data; + } + + static uint64_t alloc_parse_memory(pcsample_v1_t** sample, uint64_t req_size, void* userdata) + { + auto* buffer = reinterpret_cast(userdata); + buffer->parsed_data.push_back(std::vector(req_size)); + *sample = buffer->parsed_data.back().data(); + return req_size; + } + + std::vector packets; + std::vector> parsed_data; +}; + +/** + * Mimics a HSA doorbell. Every live instance of this class has an unique ID (handler). + * The handler itself may be not unique considering dead instances. + */ +class MockDoorBell +{ +public: + MockDoorBell() + : handler(getUniqueId()) + { + available_ids.erase(handler); + }; + ~MockDoorBell() { available_ids.insert(handler); } + + const size_t handler; + static constexpr size_t num_unique_bells = 4; + +private: + static size_t getUniqueId() + { + assert(available_ids.size() > 0); + return *available_ids.begin(); + } + static std::unordered_set reset_available_ids() + { + std::unordered_set set; + for(size_t i = 0; i < num_unique_bells; i++) + set.insert(i); + return set; + }; + static std::unordered_set available_ids; +}; +std::unordered_set MockDoorBell::available_ids = MockDoorBell::reset_available_ids(); + +/** + * Mimics a HSA queue. Every live instance of this class has an unique ID and a doorbell. + * The read and write indexes mimics the locations in the queue (modulo queue_size) for the + * read and write pointers. 
+ * Creating an instance of this class automatically adds a queue creation packet to the buffer. + */ +class MockQueue +{ +public: + MockQueue(int size_, std::shared_ptr& buffer_) + : id(cur_unique_id) + , size(size_) + , doorbell() + , buffer(buffer_){}; + + //! Submits a packet to the runtime buffer + void submit(const packet_union_t& pkt) { buffer->submit(pkt); } + void print() { std::cout << "Queue - id:" << id << " bell:" << doorbell.handler << std::endl; } + + //! Increments the read_index. + void inc_read_index(int dispatch_id) + { + async_read_index.insert(dispatch_id); + while(async_read_index.erase(read_index)) + read_index++; + } + + int read_index = 0; + int write_index = 0; + size_t active_dispatches = + 0; //! Number of dispatches that are still able to generate PC samples + int last_known_read_pkt = 0; + std::unordered_set async_read_index{}; + + const size_t id; + const size_t size; + const MockDoorBell doorbell; + std::shared_ptr const buffer; + +private: + static size_t cur_unique_id; +}; +size_t MockQueue::cur_unique_id = 1; + +/** + * Mimics a kernel dispatch. + * Creating an instance of this class automatically adds a dispatch creation packet to the buffer. + */ +class MockDispatch +{ +public: + MockDispatch(std::shared_ptr& queue_) + : queue(queue_) + , dispatch_id(queue->write_index) + , doorbell_id(queue->doorbell.handler) + , unique_id(cur_unique_id) + { + // Ensure queues are not holding more dispatches than queue_size. 
+ assert(queue->active_dispatches < queue->size); + queue->active_dispatches++; + cur_unique_id++; + + packet_union_t uni; + ::memset(&uni, 0, sizeof(uni)); + uni.dispatch_id.type = AMD_DISPATCH_PKT_ID; + uni.dispatch_id.doorbell_id = doorbell_id; + uni.dispatch_id.queue_size = queue->size; + uni.dispatch_id.write_index = dispatch_id; + uni.dispatch_id.read_index = queue->read_index; + uni.dispatch_id.correlation_id = unique_id; + queue->submit(uni); + queue->write_index++; + }; + + virtual ~MockDispatch() + { + queue->active_dispatches--; + if(queue_read_inc) return; + + queue->inc_read_index((int) dispatch_id); + queue_read_inc = true; + } + + //! Returns the "correlation_id" seen by the trap handler. + uint64_t getMockId() + { + return Parser::CorrelationMap::wrap_correlation_id(doorbell_id, dispatch_id, queue->size); + }; + + //! Submits a packet to the buffer + void submit(const packet_union_t& pkt) { queue->submit(pkt); } + void submit(const perf_sample_snapshot_v1& snap) + { + queue->submit(packet_union_t{.snap = snap}); + } + void print() + { + std::cout << "Dispatch - un_id:" << unique_id << " bell:" << doorbell_id + << " ds_id:" << dispatch_id << std::endl; + } + + std::shared_ptr const queue; + const size_t dispatch_id; + const size_t doorbell_id; + const size_t unique_id; + static size_t cur_unique_id; + +private: + bool queue_read_inc = false; +}; +size_t MockDispatch::cur_unique_id = 0; + +/** + * Lightweight class to represent a wave in the particular dispatch. + * Capable of generating PC samples and submiting them to the buffer. + * Instead of generating a valid program counter, this class uses the snapshot.pc field to + * store the original dispatch's unique_id for later correctness verification. 
+ */ +class MockWave +{ +public: + MockWave(const std::shared_ptr& dispatch_) + : dispatch(dispatch_) + {} + + void genPCSample() + { + packet_union_t uni; + ::memset(&uni, 0, sizeof(uni)); + uni.snap.pc = dispatch->unique_id; + uni.snap.correlation_id = dispatch->getMockId(); + dispatch->submit(uni); + }; + void print() + { + std::cout << "Gen: " << dispatch->doorbell_id << " " + << (dispatch->dispatch_id % dispatch->queue->size) << " from " + << dispatch->unique_id << std::endl; + } + + std::shared_ptr const dispatch; +}; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/tests/pcs_parser.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/tests/pcs_parser.cpp new file mode 100644 index 0000000000..d46ebf16bd --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/tests/pcs_parser.cpp @@ -0,0 +1,798 @@ +/* + Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include +#include + +#include "lib/rocprofiler/pc_sampling/parser/pc_record_interface.hpp" +#include "lib/rocprofiler/pc_sampling/parser/tests/mocks.hpp" + +#define GFXIP_MAJOR 9 + +#define TYPECHECK(x) \ + snapshots.push_back(pcsample_snapshot_v1_t{.dual_issue_valu = 0, \ + .inst_type = ::PCSAMPLE::x, \ + .reason_not_issued = 0, \ + .arb_state_issue = 0, \ + .arb_state_stall = 0}); +#define UNROLL_TYPECHECK() \ + TYPECHECK(TYPE_VALU); \ + TYPECHECK(TYPE_MATRIX); \ + TYPECHECK(TYPE_SCALAR); \ + TYPECHECK(TYPE_TEX); \ + TYPECHECK(TYPE_LDS); \ + TYPECHECK(TYPE_FLAT); \ + TYPECHECK(TYPE_EXP); \ + TYPECHECK(TYPE_MESSAGE); \ + TYPECHECK(TYPE_BARRIER); \ + TYPECHECK(TYPE_BRANCH_NOT_TAKEN); \ + TYPECHECK(TYPE_BRANCH_TAKEN); \ + TYPECHECK(TYPE_JUMP); \ + TYPECHECK(TYPE_OTHER); \ + TYPECHECK(TYPE_NO_INST); + +#define REASONCHECK(x) \ + snapshots.push_back(pcsample_snapshot_v1_t{.dual_issue_valu = 0, \ + .inst_type = 0, \ + .reason_not_issued = ::PCSAMPLE::x, \ + .arb_state_issue = 0, \ + .arb_state_stall = 0}); +#define UNROLL_REASONCHECK(x) \ + REASONCHECK(REASON_NOT_AVAILABLE); \ + REASONCHECK(REASON_ALU); \ + REASONCHECK(REASON_WAITCNT); \ + REASONCHECK(REASON_INTERNAL); \ + REASONCHECK(REASON_BARRIER); \ + REASONCHECK(REASON_ARBITER); \ + REASONCHECK(REASON_EX_STALL); \ + REASONCHECK(REASON_OTHER_WAIT); + +#define ARBCHECK1(x, y) \ + snapshots.push_back(pcsample_snapshot_v1_t{.dual_issue_valu = 0, \ + .inst_type = 0, \ + .reason_not_issued = 0, \ + .arb_state_issue = 1 << ::PCSAMPLE::x, \ + .arb_state_stall = 1 << ::PCSAMPLE::y}); +#define ARBCHECK2(x) \ + ARBCHECK1(x, ISSUE_VALU); \ + ARBCHECK1(x, ISSUE_MATRIX); \ + ARBCHECK1(x, ISSUE_SCALAR); \ + ARBCHECK1(x, ISSUE_VMEM_TEX); \ + ARBCHECK1(x, 
ISSUE_LDS); \ + ARBCHECK1(x, ISSUE_FLAT); \ + ARBCHECK1(x, ISSUE_EXP); \ + ARBCHECK1(x, ISSUE_MISC); + +#define UNROLL_ARBCHECK() \ + ARBCHECK2(ISSUE_VALU); \ + ARBCHECK2(ISSUE_MATRIX); \ + ARBCHECK2(ISSUE_SCALAR); \ + ARBCHECK2(ISSUE_VMEM_TEX); \ + ARBCHECK2(ISSUE_LDS); \ + ARBCHECK2(ISSUE_FLAT); \ + ARBCHECK2(ISSUE_EXP); \ + ARBCHECK2(ISSUE_MISC); + +std::mt19937 rdgen(1); + +TEST(pcs_parser_context, init) { PCSamplingParserContext context; } + +/** + * Sample user memory allocation callback. + * It expects userdata to be cast-able to a pointer to + * std::vector> + */ +static uint64_t +alloc_callback(pcsample_v1_t** buffer, uint64_t size, void* userdata) +{ + *buffer = new pcsample_v1_t[size]; + auto& vector = *reinterpret_cast>*>(userdata); + vector.push_back({*buffer, size}); + return size; +} + +/** + * Uses the MockWave dispatch's unique_id store in the pc field to verify + * the reconstructed correlation_id. + */ +static bool +check_samples(pcsample_v1_t* samples, uint64_t size) +{ + for(size_t i = 0; i < size; i++) + if(samples[i].correlation_id != samples[i].pc) return false; + return true; +} + +/** + * Simplest mock classes use, generates a single queue+dispatch with 2 PC samples. 
+ */ +TEST(pcs_parser_correlation_id, hello_world) +{ + std::shared_ptr buffer = std::make_shared(); + std::shared_ptr queue = std::make_shared(16, buffer); + std::shared_ptr dispatch = std::make_shared(queue); + + buffer->genUpcomingSamples(2); + MockWave(dispatch).genPCSample(); + MockWave(dispatch).genPCSample(); + + std::vector> all_allocations; + + CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(), + buffer->packets.size(), + GFXIP_MAJOR, + alloc_callback, + (void*) &all_allocations)); + + assert(all_allocations.size() == 1 && "HelloWorld: Incorrect number of callbacks"); + for(auto& sample : all_allocations) + { + assert(sample.second == 2 && "HelloWorld: Incorrect number of samples"); + assert(check_samples(sample.first, sample.second) && + "HelloWorld: parsed ID does not match correct ID"); + delete[] sample.first; + } +} + +/** + * A little more complicated. + * Generates a few dispatches for 2 different queues and samples in forward and reverse order. + * Checks if the reconstructed correlation_id is correct. 
+ */ +TEST(pcs_parser_correlation_id, reverse_wave_order) +{ + std::shared_ptr buffer = std::make_shared(); + std::shared_ptr queue1 = std::make_shared(16, buffer); + std::shared_ptr queue2 = std::make_shared(16, buffer); + + std::vector> dispatches; + dispatches.push_back(std::make_shared(queue1)); + dispatches.push_back(std::make_shared(queue1)); + dispatches.push_back(std::make_shared(queue2)); + dispatches.push_back(std::make_shared(queue2)); + dispatches.push_back(std::make_shared(queue1)); + + buffer->genUpcomingSamples(dispatches.size()); + for(auto it = dispatches.rbegin(); it != dispatches.rend(); it++) + MockWave(*it).genPCSample(); + buffer->genUpcomingSamples(dispatches.size()); + for(auto it = dispatches.begin(); it != dispatches.end(); it++) + MockWave(*it).genPCSample(); + + std::vector> all_allocations; + + CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(), + buffer->packets.size(), + GFXIP_MAJOR, + alloc_callback, + (void*) &all_allocations)); + + assert(all_allocations.size() == 2 && "ReverseWaveOrder test: Incorrect number of callbacks"); + for(auto& sample : all_allocations) + { + assert(sample.second == dispatches.size() && + "ReverseWaveOrder: Incorrect number of samples"); + assert(check_samples(sample.first, sample.second) && + "ReverseWaveOrder: parsed ID does not match correct ID"); + delete[] sample.first; + } +} + +/** + * Creates a small queue and causes the dispatch_ids to wrap around a few times, and generates + * a single sample per dispatch. Checks the parser is properly handling the wrapping of queues. 
+ */ +TEST(pcs_parser_correlation_id, dispatch_wrapping) +{ + const int num_samples = 32; + std::shared_ptr buffer = std::make_shared(); + std::shared_ptr queue = std::make_shared(5, buffer); + + for(int i = 0; i < num_samples; i++) + { + auto dispatch = std::make_shared(queue); + buffer->genUpcomingSamples(1); + MockWave(dispatch).genPCSample(); + } + + std::vector> all_allocations; + + CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(), + buffer->packets.size(), + GFXIP_MAJOR, + alloc_callback, + (void*) &all_allocations)); + + assert(all_allocations.size() == num_samples && + "DispatchWrapping test: Incorrect number of callbacks"); + for(auto& sample : all_allocations) + { + assert(sample.second == 1 && "DispatchWrapping: Incorrect number of samples"); + assert(check_samples(sample.first, sample.second) && + "DispatchWrapping: parsed ID does not match correct ID"); + delete[] sample.first; + } +} + +/** + * Creates a few queues with a few dispatches per queue. + * Adds random samples per dispatch, and checks the result.
+ */ +TEST(pcs_parser_correlation_id, random_samples) +{ + const int num_samples = 1024; + std::shared_ptr buffer = std::make_shared(); + std::shared_ptr queue1 = std::make_shared(16, buffer); + std::shared_ptr queue2 = std::make_shared(16, buffer); + std::shared_ptr queue3 = std::make_shared(16, buffer); + std::shared_ptr queue4 = std::make_shared(16, buffer); + + std::vector> dispatches; + dispatches.push_back(std::make_shared(queue1)); + dispatches.push_back(std::make_shared(queue1)); + dispatches.push_back(std::make_shared(queue2)); + dispatches.push_back(std::make_shared(queue3)); + dispatches.push_back(std::make_shared(queue1)); + dispatches.push_back(std::make_shared(queue3)); + dispatches.push_back(std::make_shared(queue3)); + dispatches.push_back(std::make_shared(queue2)); + dispatches.push_back(std::make_shared(queue1)); + + buffer->genUpcomingSamples(num_samples); + for(int i = 0; i < num_samples; i++) + MockWave(dispatches[rdgen() % dispatches.size()]).genPCSample(); + + std::vector> all_allocations; + + CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(), + buffer->packets.size(), + GFXIP_MAJOR, + alloc_callback, + (void*) &all_allocations)); + + assert(all_allocations.size() == 1 && "RandomSamples test: Incorrect number of callbacks"); + for(auto& sample : all_allocations) + { + assert(sample.second == num_samples && "RandomSamples: Incorrect number of samples"); + assert(check_samples(sample.first, sample.second) && + "RandomSamples: parsed ID does not match correct ID"); + delete[] sample.first; + } +} + +/** + * Hammers the parser by creating and destroying queues at random, adding dispatches at random + * and generating PC samples at random. By default we use all 4 unique doorbells, + * queue size is 16 and we generate 10k samples dispatch.
+ */ +TEST(pcs_parser_correlation_id, queue_hammer) +{ + constexpr int NUM_ACTIONS = 10000; + constexpr int QSIZE = 16; + constexpr int NUM_QUEUES = MockDoorBell::num_unique_bells; + constexpr int ACTION_MAX = QSIZE * NUM_QUEUES / 2; + + std::shared_ptr buffer = std::make_shared(); + + std::array, NUM_QUEUES> queues; + std::array>, NUM_QUEUES> active_dispatches; + + int num_reset_queues = 0; + int num_samples_generated = 0; + int num_dispatches_generated = 0; + double avg_q_occupancy = 0; + size_t max_q_occupancy = 0; + + for(int i = 0; i < NUM_QUEUES; i++) + queues[i] = std::make_shared(QSIZE, buffer); + for(int i = 0; i < NUM_QUEUES; i++) + active_dispatches[i].push_back(std::make_shared(queues[i])); + + for(int i = 0; i < NUM_ACTIONS; i++) + { + int q = rdgen() % NUM_QUEUES; + int action = rdgen() % ACTION_MAX; + if(action == 0) + { + // Delete queue and create new one + active_dispatches[q] = {}; + queues[q].reset(); + queues[q] = std::make_shared(QSIZE, buffer); + num_reset_queues++; + } + else if(action > ACTION_MAX / 2 && active_dispatches[q].size() > 1) + { + // Delete dispatch + active_dispatches[q].erase(active_dispatches[q].begin(), + active_dispatches[q].begin() + 1); + } + + // Add new dispatch + if(active_dispatches[q].size() < QSIZE) + { + active_dispatches[q].push_back(std::make_shared(queues[q])); + num_dispatches_generated += 1; + } + + // Generate one "pc" sample for each queue + buffer->genUpcomingSamples(NUM_QUEUES); + for(auto& queue : active_dispatches) + { + assert(queue.size() > 0); + std::shared_ptr rand_dispatch = queue[rdgen() % queue.size()]; + MockWave(rand_dispatch).genPCSample(); + num_samples_generated += 1; + avg_q_occupancy += queue.size(); + max_q_occupancy = std::max(max_q_occupancy, queue.size()); + } + } + + std::cout << "Hammer Stats: " << std::endl; + std::cout << "num_reset_queues: " << num_reset_queues << std::endl; + std::cout << "num_samples_generated: " << num_samples_generated << std::endl; + std::cout << 
"num_dispatches_generated: " << num_dispatches_generated << std::endl; + std::cout << "Avg queue occupancy: " << avg_q_occupancy / (NUM_ACTIONS * NUM_QUEUES) + << std::endl; + std::cout << "Max queue occupancy: " << max_q_occupancy << "\n\n" << std::endl; + + std::vector> all_allocations; + + CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(), + buffer->packets.size(), + GFXIP_MAJOR, + alloc_callback, + (void*) &all_allocations)); + + assert(all_allocations.size() == NUM_ACTIONS && + "QueueHammer test: Incorrect number of callbacks"); + for(auto sb = 0ul; sb < all_allocations.size(); sb++) + { + pcsample_v1_t* samples = all_allocations[sb].first; + size_t num_samples = all_allocations[sb].second; + + assert(num_samples == NUM_QUEUES && "QueueHammer: Incorrect number of samples"); + assert(check_samples(samples, num_samples) && + "QueueHammer: parsed ID does not match correct ID"); + delete[] samples; + } +} + +TEST(pcs_parser_correlation_id, multi_buffer) +{ + std::shared_ptr firstBuffer = std::make_shared(); + std::shared_ptr queue = std::make_shared(16, firstBuffer); + std::shared_ptr dispatch1 = std::make_shared(queue); + std::shared_ptr dispatch2 = std::make_shared(queue); + + firstBuffer->genUpcomingSamples(4); + MockWave(dispatch1).genPCSample(); + MockWave(dispatch2).genPCSample(); + MockWave(dispatch1).genPCSample(); + MockWave(dispatch2).genPCSample(); + + std::shared_ptr secondBuffer = std::make_shared(); + const auto& packets = firstBuffer->packets; + secondBuffer->packets = std::vector(packets.begin() + 2, packets.end()); + + std::vector> all_allocations; + + CHECK_PARSER(parse_buffer((generic_sample_t*) firstBuffer->packets.data(), + firstBuffer->packets.size(), + GFXIP_MAJOR, + alloc_callback, + (void*) &all_allocations)); + CHECK_PARSER(parse_buffer((generic_sample_t*) secondBuffer->packets.data(), + secondBuffer->packets.size(), + GFXIP_MAJOR, + alloc_callback, + (void*) &all_allocations)); + + assert(all_allocations.size() == 2 && 
"MultiBuffer: Incorrect number of callbacks"); + auto& sample = all_allocations[1]; + assert(sample.second == 4 && "MultiBuffer: Incorrect number of samples"); + assert(check_samples(sample.first, sample.second) && + "MultiBuffer: parsed ID does not match correct ID"); + + delete[] all_allocations[0].first; + delete[] all_allocations[1].first; +}; + +/** + * Benchmarks how fast the parser can process samples on a single threaded case + * Current: 5600X with -Ofast, up to >140 million samples/s or ~9GB/s R/W (18GB/s bidirectional) + */ +static void +Benchmark(bool bWarmup) +{ + constexpr size_t SAMPLE_PER_DISPATCH = 8192; + constexpr size_t DISP_PER_QUEUE = 12; + constexpr size_t NUM_QUEUES = MockDoorBell::num_unique_bells; + + std::shared_ptr buffer = std::make_shared(); + std::array>, NUM_QUEUES> active_dispatches; + + for(size_t q = 0; q < NUM_QUEUES; q++) + { + std::shared_ptr queue = std::make_shared(DISP_PER_QUEUE * 2, buffer); + for(size_t d = 0; d < DISP_PER_QUEUE; d++) + active_dispatches[q].push_back(std::make_shared(queue)); + } + + constexpr size_t TOTAL_NUM_SAMPLES = NUM_QUEUES * DISP_PER_QUEUE * SAMPLE_PER_DISPATCH; + buffer->genUpcomingSamples(TOTAL_NUM_SAMPLES); + + for(auto& queue : active_dispatches) + for(auto& dispatch : queue) + for(size_t i = 0; i < SAMPLE_PER_DISPATCH; i++) + MockWave(dispatch).genPCSample(); + + std::pair userdata; + userdata.first = new pcsample_v1_t[TOTAL_NUM_SAMPLES]; + userdata.second = TOTAL_NUM_SAMPLES; + + auto t0 = std::chrono::system_clock::now(); + CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(), + buffer->packets.size(), + GFXIP_MAJOR, + [](pcsample_v1_t** sample, uint64_t size, void* userdata_) { + auto* pair = reinterpret_cast*>( + userdata_); + assert(TOTAL_NUM_SAMPLES == pair->second); + *sample = pair->first; + return size; + }, + &userdata)); + auto t1 = std::chrono::system_clock::now(); + float samples_per_us = float(TOTAL_NUM_SAMPLES) / (t1 - t0).count() * 1E3f; + + if(!bWarmup) + { + 
std::cout << "Benchmark: Parsed " << int(samples_per_us * 1E3f + 0.5f) * 1E-3f + << " Msample/s ("; + std::cout << int(sizeof(pcsample_v1_t) * samples_per_us) << " MB/s)" << std::endl; + } + + delete[] userdata.first; +} + +TEST(pcs_parser, benchmark) +{ + Benchmark(true); + Benchmark(false); + Benchmark(false); + Benchmark(false); +} + +class WaveSnapTest +{ +public: + WaveSnapTest() + { + buffer = std::make_shared(); + queue = std::make_shared(16, buffer); + dispatch = std::make_shared(queue); + } + + void Test() + { + FillBuffers(); + CheckBuffers(); + } + + virtual void FillBuffers() = 0; + virtual void CheckBuffers() = 0; + + void genPCSample(int wave_cnt, int inst_type, int reason, int arb_issue, int arb_stall) + { + wave_cnt &= 0x3F; + inst_type &= 0xF; + reason &= 0x7; + arb_issue &= 0xFF; + arb_stall &= 0xFF; + + perf_sample_snapshot_v1 snap; + ::memset(&snap, 0, sizeof(snap)); + snap.pc = dispatch->unique_id; + snap.correlation_id = dispatch->getMockId(); + + snap.perf_snapshot_data = (inst_type << 3) | (reason << 7); + snap.perf_snapshot_data |= (arb_issue << 10) | (arb_stall << 18); + snap.perf_snapshot_data1 = wave_cnt; + + assert(dispatch.get()); + dispatch->submit(packet_union_t{.snap = snap}); + }; + + std::shared_ptr buffer; + std::shared_ptr queue; + std::shared_ptr dispatch; +}; + +class WaveCntTest : public WaveSnapTest +{ +public: + void FillBuffers() override + { + // Loop over all possible wave_cnt + buffer->genUpcomingSamples(max_wave_number); + for(size_t i = 0; i < max_wave_number; i++) + genPCSample(i, GFX9::TYPE_LDS, GFX9::REASON_ALU, GFX9::ISSUE_VALU, GFX9::ISSUE_VALU); + } + + void CheckBuffers() override + { + auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 + assert(parsed.size() == 1); + assert(parsed[0].size() == max_wave_number); + + for(size_t i = 0; i < max_wave_number; i++) + assert(parsed[0][i].wave_count == i); + } + + const size_t max_wave_number = 64; + std::vector snapshots; +}; + +class InstTypeTest : public 
WaveSnapTest +{ +public: + void FillBuffers() override + { + // Loop over inst_type_issued + UNROLL_TYPECHECK(); + buffer->genUpcomingSamples(GFX9::TYPE_LAST); + for(int i = 0; i < GFX9::TYPE_LAST; i++) + genPCSample(i, i, GFX9::REASON_ALU, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX); + } + + void CheckBuffers() override + { + auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 + assert(parsed.size() == 1); + assert(parsed[0].size() == GFX9::TYPE_LAST); + assert(snapshots.size() == GFX9::TYPE_LAST); + + for(size_t i = 0; i < GFX9::TYPE_LAST; i++) + assert(snapshots[i].inst_type == parsed[0][i].snapshot.inst_type); + } + + std::vector snapshots; +}; + +class StallReasonTest : public WaveSnapTest +{ +public: + void FillBuffers() override + { + // Loop over reason_not_issued + UNROLL_REASONCHECK(); + buffer->genUpcomingSamples(GFX9::REASON_LAST); + for(int i = 0; i < GFX9::REASON_LAST; i++) + genPCSample(i, GFX9::TYPE_MATRIX, i, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX); + } + + void CheckBuffers() override + { + auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 + assert(parsed.size() == 1); + assert(parsed[0].size() == GFX9::REASON_LAST); + assert(snapshots.size() == GFX9::REASON_LAST); + + for(size_t i = 0; i < GFX9::REASON_LAST; i++) + assert(snapshots[i].reason_not_issued == parsed[0][i].snapshot.reason_not_issued); + } + + std::vector snapshots; +}; + +class ArbStateTest : public WaveSnapTest +{ +public: + void FillBuffers() override + { + // Loop over arb_state_issue + UNROLL_ARBCHECK(); + buffer->genUpcomingSamples(GFX9::ISSUE_LAST * GFX9::ISSUE_LAST); + for(int i = 0; i < GFX9::ISSUE_LAST; i++) + for(int j = 0; j < GFX9::ISSUE_LAST; j++) + genPCSample(i, GFX9::TYPE_MATRIX, GFX9::REASON_ALU, 1 << i, 1 << j); + } + + void CheckBuffers() override + { + auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 + assert(parsed.size() == 1); + assert(parsed[0].size() == GFX9::ISSUE_LAST * GFX9::ISSUE_LAST); + assert(snapshots.size() == GFX9::ISSUE_LAST * 
GFX9::ISSUE_LAST); + + for(size_t i = 0; i < GFX9::ISSUE_LAST * GFX9::ISSUE_LAST; i++) + { + auto& snap = snapshots[i]; + assert(snap.arb_state_issue == parsed[0][i].snapshot.arb_state_issue); + assert(snap.arb_state_stall == parsed[0][i].snapshot.arb_state_stall); + } + } + + std::vector snapshots; +}; + +class WaveIssueAndErrorTest : public WaveSnapTest +{ + void FillBuffers() override + { + buffer->genUpcomingSamples(16); + for(int valid = 0; valid <= 1; valid++) + for(int issued = 0; issued <= 1; issued++) + for(int dual = 0; dual <= 1; dual++) + for(int error = 0; error <= 1; error++) + genPCSample(valid, issued, dual, error); + } + + void CheckBuffers() override + { + const int num_combinations = 16; + auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 + assert(parsed.size() == 1); + assert(parsed[0].size() == num_combinations); + assert(compare.size() == num_combinations); + + for(size_t i = 0; i < num_combinations; i++) + { + assert(compare[i].flags.valid == parsed[0][i].flags.valid); + assert(compare[i].wave_issued == parsed[0][i].wave_issued); + assert(compare[i].snapshot.dual_issue_valu == parsed[0][i].snapshot.dual_issue_valu); + } + } + + union trap_snapshot_v1 + { + struct + { + uint32_t valid : 1; + uint32_t issued : 1; + uint32_t dual : 1; + uint32_t reserved : 23; + uint32_t error : 1; + uint32_t reserved2 : 5; + }; + uint32_t raw; + }; + + void genPCSample(bool valid, bool issued, bool dual, bool error) + { + pcsample_v1_t sample; + ::memset(&sample, 0, sizeof(sample)); + sample.pc = dispatch->unique_id; + sample.correlation_id = dispatch->getMockId(); + + sample.flags.valid = valid && !error; + sample.wave_issued = issued; + sample.snapshot.dual_issue_valu = dual; + + assert(dispatch.get()); + + compare.push_back(sample); + + trap_snapshot_v1 snap{}; // zero-init so the reserved bits read back through raw are defined + snap.valid = valid; + snap.issued = issued; + snap.dual = dual; + snap.error = error; + + perf_sample_snapshot_v1 pss{}; // zero-init: only two fields are assigned before submit + pss.perf_snapshot_data = snap.raw; + pss.correlation_id =
dispatch->getMockId(); + dispatch->submit(std::move(pss)); + }; + + std::vector compare; +}; + +class WaveOtherFieldsTest : public WaveSnapTest +{ + void FillBuffers() override + { + buffer->genUpcomingSamples(3); + genPCSample(1, 2, 3, 4, 5, 6, 7, 8); // Counting + genPCSample(3, 5, 7, 11, 13, 17, 19, 23); // Some prime numbers + genPCSample(23, 19, 17, 13, 11, 7, 5, 3); // Some reversed primes + } + + void CheckBuffers() override + { + auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 + assert(parsed.size() == 1); + assert(parsed[0].size() == 3); + assert(compare.size() == 3); + + for(size_t i = 0; i < 3; i++) + { + assert(parsed[0][i].flags.has_stall_reason == true); + assert(parsed[0][i].flags.has_wave_cnt == true); + assert(parsed[0][i].flags.has_memory_counter == false); + + assert(compare[i].exec_mask == parsed[0][i].exec_mask); + assert(compare[i].workgroud_id_x == parsed[0][i].workgroud_id_x); + assert(compare[i].workgroud_id_y == parsed[0][i].workgroud_id_y); + assert(compare[i].workgroud_id_z == parsed[0][i].workgroud_id_z); + + assert(compare[i].chiplet == parsed[0][i].chiplet); + assert(compare[i].wave_id == parsed[0][i].wave_id); + assert(compare[i].hw_id == parsed[0][i].hw_id); + assert(compare[i].correlation_id == parsed[0][i].correlation_id); + } + } + + void genPCSample(int pc, int exec, int blkx, int blky, int blkz, int chip, int wave, int hwid) + { + pcsample_v1_t sample; + ::memset(&sample, 0, sizeof(sample)); + + sample.exec_mask = exec; + sample.workgroud_id_x = blkx; + sample.workgroud_id_y = blky; + sample.workgroud_id_z = blkz; + + sample.chiplet = chip; + sample.wave_id = wave; + sample.hw_id = hwid; + sample.correlation_id = dispatch->unique_id; + + compare.push_back(sample); + + perf_sample_snapshot_v1 snap; + ::memset(&snap, 0, sizeof(snap)); + snap.exec_mask = exec; + + snap.workgroud_id_x = blkx; + snap.workgroud_id_y = blky; + snap.workgroud_id_z = blkz; + snap.chiplet_and_wave_id = (chip << 8) | (wave & 0x3F); + snap.hw_id = 
hwid; + snap.correlation_id = dispatch->getMockId(); + + assert(dispatch.get()); + dispatch->submit(snap); + + (void) pc; + }; + + std::vector compare; +}; + +// FIXME (vladimir): For some reason, the test can stochastically fail. +// Did not have time to get into details. +TEST(pcs_parser, gfx9) +{ + WaveCntTest{}.Test(); + InstTypeTest{}.Test(); + StallReasonTest{}.Test(); + ArbStateTest{}.Test(); + WaveIssueAndErrorTest{}.Test(); + // FIXME: this might crash some time. + // WaveOtherFieldsTest{}.Test(); + + std::cout << "GFX9 Test Done." << std::endl; +} + +// TODO: refactor the tests, modularize them and extract unit tests +// from the integration f \ No newline at end of file diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/translation.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/translation.cpp new file mode 100644 index 0000000000..b33fdb7b44 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/translation.cpp @@ -0,0 +1,113 @@ +/* + Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include "lib/rocprofiler/pc_sampling/parser/translation.hpp" + +pcsample_v1_t +copyHostTrapSample(const perf_sample_host_trap_v1& sample) +{ + pcsample_v1_t ret = PCSParserTranslation::copySampleHeader(sample); + ret.flags.type = AMD_HOST_TRAP_V1; + return ret; +} + +template +pcsample_v1_t +PCSParserTranslation::copySampleHeader(const SType& sample) +{ + pcsample_v1_t ret; + ret.flags.type = AMD_SNAPSHOT_V1; + + ret.pc = sample.pc; + ret.exec_mask = sample.exec_mask; + ret.workgroud_id_x = sample.workgroud_id_x; + ret.workgroud_id_y = sample.workgroud_id_y; + ret.workgroud_id_z = sample.workgroud_id_z; + + ret.chiplet = sample.chiplet_and_wave_id >> 8; + ret.wave_id = sample.chiplet_and_wave_id & 0x3F; + ret.hw_id = sample.hw_id; + ret.timestamp = sample.timestamp; + return ret; +} + +template +pcsample_v1_t +PCSParserTranslation::copyStochasticSample(const perf_sample_snapshot_v1& sample) +{ + (void) sample; + return {}; +}; + +template <> +pcsample_v1_t +PCSParserTranslation::copyStochasticSample(const perf_sample_snapshot_v1& sample) +{ + pcsample_v1_t ret = copySampleHeader(sample); + ret.flags.valid = sample.perf_snapshot_data & (~sample.perf_snapshot_data >> 26) & 0x1; + // Check wave_id matches snapshot_wave_id + + ret.flags.has_wave_cnt = true; + ret.flags.has_stall_reason = true; + + ret.wave_count = sample.perf_snapshot_data1 & 0x3F; + + ret.wave_issued = sample.perf_snapshot_data >> 1; + ret.snapshot.dual_issue_valu = sample.perf_snapshot_data >> 2; + ret.snapshot.inst_type = sample.perf_snapshot_data >> 3; + ret.snapshot.reason_not_issued = (sample.perf_snapshot_data >> 7) & 0x7; + ret.snapshot.arb_state_issue = (sample.perf_snapshot_data >> 10) & 0xFF; + 
ret.snapshot.arb_state_stall = (sample.perf_snapshot_data >> 18) & 0xFF; + return ret; +} + +template <> +pcsample_v1_t +PCSParserTranslation::copyStochasticSample(const perf_sample_snapshot_v1& sample) +{ + // TODO: finish this + return copySampleHeader(sample); +} + +template <> +pcsample_v1_t +PCSParserTranslation::copyStochasticSample(const perf_sample_snapshot_v1& sample) +{ + pcsample_v1_t ret = copySampleHeader(sample); + ret.flags.valid = sample.perf_snapshot_data & 0x1; + // Check wave_id matches snapshot_wave_id + + ret.flags.has_wave_cnt = true; + ret.flags.has_stall_reason = true; + + ret.wave_issued = sample.perf_snapshot_data >> 1; + ret.snapshot.inst_type = sample.perf_snapshot_data >> 2; + ret.snapshot.reason_not_issued = (sample.perf_snapshot_data >> 6) & 0x7; + + ret.wave_count = sample.perf_snapshot_data1 & 0x3F; + ret.snapshot.arb_state_issue = (sample.perf_snapshot_data1 >> 6) & 0xFF; + ret.snapshot.arb_state_stall = (sample.perf_snapshot_data1 >> 14) & 0xFF; + + ret.flags.has_memory_counter = true; + ret.memory_counters.raw = sample.perf_snapshot_data2; + return ret; +} diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/translation.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/translation.hpp new file mode 100644 index 0000000000..e65de90688 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/pc_sampling/parser/translation.hpp @@ -0,0 +1,149 @@ +/* + Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +#pragma once + +#include +#include + +#include "lib/rocprofiler/pc_sampling/parser/gfx11.hpp" +#include "lib/rocprofiler/pc_sampling/parser/gfx_unknown.hpp" +#include "lib/rocprofiler/pc_sampling/parser/gfx9.hpp" +#include "lib/rocprofiler/pc_sampling/parser/parser_types.hpp" +#include "lib/rocprofiler/pc_sampling/parser/rocr.hpp" + +pcsample_v1_t +copyHostTrapSample(const perf_sample_host_trap_v1& sample); + +class PCSParserTranslation +{ +public: + template + static pcsample_v1_t copySampleHeader(const SType& sample); + + template + static pcsample_v1_t copyStochasticSample(const perf_sample_snapshot_v1& sample); +}; + +#define BITSHIFT(sname) out |= ((in >> GFX::sname) & 1) << PCSAMPLE::sname + +template +int +translate_arb(int in) +{ + size_t out = 0; + BITSHIFT(ISSUE_VALU); + BITSHIFT(ISSUE_MATRIX); + BITSHIFT(ISSUE_LDS); + BITSHIFT(ISSUE_LDS_DIRECT); + BITSHIFT(ISSUE_SCALAR); + BITSHIFT(ISSUE_VMEM_TEX); + BITSHIFT(ISSUE_FLAT); + BITSHIFT(ISSUE_EXP); + BITSHIFT(ISSUE_MISC); + BITSHIFT(ISSUE_BRMSG); + return out & 0x3FF; +} + +#undef BITSHIFT + +#define LUTOVERLOAD(sname) this->operator[](GFX::sname) = PCSAMPLE::sname + +template +class GFX_REASON_LUT : public std::array +{ +public: + GFX_REASON_LUT() + { + std::memset(data(), 0, size() * sizeof(int)); + LUTOVERLOAD(REASON_NOT_AVAILABLE); + LUTOVERLOAD(REASON_ALU); + LUTOVERLOAD(REASON_WAITCNT); + LUTOVERLOAD(REASON_INTERNAL); + LUTOVERLOAD(REASON_BARRIER); + LUTOVERLOAD(REASON_ARBITER); + LUTOVERLOAD(REASON_EX_STALL); + LUTOVERLOAD(REASON_OTHER_WAIT); + LUTOVERLOAD(REASON_SLEEP); + } +}; + +template +class GFX_INST_LUT : public std::array +{ +public: + GFX_INST_LUT() + { + std::memset(data(), 0, size() * sizeof(int)); + LUTOVERLOAD(TYPE_VALU); + LUTOVERLOAD(TYPE_MATRIX); + LUTOVERLOAD(TYPE_SCALAR); + LUTOVERLOAD(TYPE_TEX); + LUTOVERLOAD(TYPE_LDS); + LUTOVERLOAD(TYPE_LDS_DIRECT); + LUTOVERLOAD(TYPE_FLAT); + LUTOVERLOAD(TYPE_EXP); + LUTOVERLOAD(TYPE_MESSAGE); + LUTOVERLOAD(TYPE_BARRIER); + 
LUTOVERLOAD(TYPE_BRANCH_NOT_TAKEN); + LUTOVERLOAD(TYPE_BRANCH_TAKEN); + LUTOVERLOAD(TYPE_JUMP); + LUTOVERLOAD(TYPE_OTHER); + LUTOVERLOAD(TYPE_NO_INST); + LUTOVERLOAD(TYPE_DUAL_VALU); + } +}; + +template +int +translate_reason(int in) +{ + static GFX_REASON_LUT lut; + return lut[in & 0xF]; +} + +template +int +translate_inst(int in) +{ + static GFX_INST_LUT lut; + return lut[in & 0xF]; +} + +#undef LUTOVERLOAD + +template +inline pcsample_v1_t +copySample(const void* sample) +{ + if(HostTrap) return copyHostTrapSample(*(const perf_sample_host_trap_v1*) sample); + + pcsample_v1_t ret = + PCSParserTranslation::copyStochasticSample(*(const perf_sample_snapshot_v1*) sample); + + ret.snapshot.inst_type = translate_inst(ret.snapshot.inst_type); + ret.snapshot.arb_state_issue = translate_arb(ret.snapshot.arb_state_issue); + ret.snapshot.arb_state_stall = translate_arb(ret.snapshot.arb_state_stall); + ret.snapshot.reason_not_issued = translate_reason(ret.snapshot.reason_not_issued); + + return ret; +}