From 3a639543e725f840a376726510d3f5fccbb51ca6 Mon Sep 17 00:00:00 2001 From: Saurabh Verma Date: Fri, 10 Feb 2023 22:06:07 +0000 Subject: [PATCH] SWDEV-297574: Support for pcie counters Change-Id: I4a662c43a9d0cf883f336574baa09fc33b78b9af --- inc/rocprofiler.h | 79 ++++- samples/CMakeLists.txt | 10 + samples/common/common.h | 19 +- .../pcie_counters_example.cpp | 103 ++++++ src/api/CMakeLists.txt | 4 + src/core/counters/mmio/mmio.cpp | 189 ++++++++++ src/core/counters/mmio/mmio.h | 122 +++++++ .../counters/mmio/pcie_counters_mi200.cpp | 116 +++++++ src/core/counters/mmio/pcie_counters_mi200.h | 43 +++ .../mmio/pcie_perfmon_registers_mi200.h | 327 ++++++++++++++++++ src/core/counters/mmio/perfmon.h | 47 +++ src/core/memory/generic_buffer.cpp | 7 + src/core/session/counters_sampler.cpp | 168 +++++++++ src/core/session/counters_sampler.h | 62 ++++ src/core/session/filter.cpp | 15 + src/core/session/filter.h | 2 + src/core/session/session.cpp | 19 + src/core/session/session.h | 5 + tests/unittests/core/CMakeLists.txt | 4 + tests/unittests/profiler/CMakeLists.txt | 4 + 20 files changed, 1342 insertions(+), 3 deletions(-) create mode 100644 samples/counters_sampler/pcie_counters_example.cpp create mode 100644 src/core/counters/mmio/mmio.cpp create mode 100644 src/core/counters/mmio/mmio.h create mode 100644 src/core/counters/mmio/pcie_counters_mi200.cpp create mode 100644 src/core/counters/mmio/pcie_counters_mi200.h create mode 100644 src/core/counters/mmio/pcie_perfmon_registers_mi200.h create mode 100644 src/core/counters/mmio/perfmon.h create mode 100644 src/core/session/counters_sampler.cpp create mode 100644 src/core/session/counters_sampler.h diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 0e50f95ebe..82bc4252b3 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -479,7 +479,11 @@ typedef enum { /** * Represents SPM records */ - ROCPROFILER_SPM_RECORD = 4 + ROCPROFILER_SPM_RECORD = 4, + /** + * Represents Counters sampler records + */ + ROCPROFILER_COUNTERS_SAMPLER_RECORD = 5 } rocprofiler_record_kind_t; /** @@ -1789,7 +1793,11 @@ typedef enum { /** * HIP/HSA/ROCTX/SYS Trace. */ - ROCPROFILER_API_TRACE = 6 + ROCPROFILER_API_TRACE = 6, + /** + * Sampled Counters + */ + ROCPROFILER_COUNTERS_SAMPLER = 7 } rocprofiler_filter_kind_t; /** @@ -1895,6 +1903,69 @@ typedef struct { } rocprofiler_spm_parameter_t; +typedef enum{ + ROCPROFILER_COUNTERS_SAMPLER_PCIE_COUNTERS = 0 +} rocprofiler_counters_sampler_counter_type_t; + +typedef struct{ + char* name; + rocprofiler_counters_sampler_counter_type_t type; +} rocprofiler_counters_sampler_counter_input_t; + +typedef struct{ + rocprofiler_counters_sampler_counter_type_t type; + rocprofiler_record_counter_value_t value; +} rocprofiler_counters_sampler_counter_output_t; + +typedef struct{ + /** + * Counters to profile + */ + rocprofiler_counters_sampler_counter_input_t* counters; + /** + * Counters count + */ + int counters_num; + /** + * Sampling rate (ms) + */ + uint32_t sampling_rate; + /** + * Total sampling duration (ms); time between sampling start/stop + */ + uint32_t sampling_duration; + /** + * Initial delay (ms) + */ + uint32_t initial_delay; + /** + * Preferred agents to collect counters from + */ + int gpu_agent_index; +}rocprofiler_counters_sampler_parameters_t; + +typedef struct{ + /** + * ROCMtool General Record base header to identify the id and kind of every + * record + */ + rocprofiler_record_header_t header; + /** + * Agent Identifier to be used by the user to get the Agent Information using + * ::rocprofiler_query_agent_info + */ + rocprofiler_agent_id_t gpu_id; + /** + * Counters, including identifiers to get counter information and Counters + * values + */ + rocprofiler_counters_sampler_counter_output_t* counters; + /** + * Number of counter values + */ + uint32_t num_counters; +}rocprofiler_record_counters_sampler_t; + /** * Filter Kind Data */ @@ -1915,6 +1986,10 @@ typedef union { * spm counters parameters */ rocprofiler_spm_parameter_t* spm_parameters; + /** + * sampled counters parameters + */ + rocprofiler_counters_sampler_parameters_t counters_sampler_parameters; } rocprofiler_filter_data_t; /** diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index f3ab20ff6c..3e939f6e4d 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -84,6 +84,16 @@ target_link_options(profiler_device_profiling PRIVATE "-Wl,--build-id=md5") add_dependencies(samples profiler_device_profiling) install(TARGETS profiler_device_profiling RUNTIME DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/samples COMPONENT samples) +## Build Counters Sampling example +set_source_files_properties(counters_sampler/pcie_counters_example.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) +hip_add_executable(pcie_counters_sampler counters_sampler/pcie_counters_example.cpp ${ROCPROFILER_UTIL_SRC_FILES}) +target_include_directories(pcie_counters_sampler PRIVATE ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/inc ${CMAKE_CURRENT_SOURCE_DIR}/common) +target_link_libraries(pcie_counters_sampler PRIVATE ${ROCPROFILER_TARGET} systemd amd_comgr) +add_dependencies(samples pcie_counters_sampler) +install(TARGETS pcie_counters_sampler RUNTIME DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/samples COMPONENT samples) + +# ################################################################################################################ + # ############################################################################################################################################ # Tracer Samples # ############################################################################################################################################ diff --git a/samples/common/common.h b/samples/common/common.h index b7fcaba061..c4fdb9ad9a 100644 --- a/samples/common/common.h +++ b/samples/common/common.h @@ -285,6 +285,16 @@ void FlushPCSamplingRecord( << std::endl; } +void FlushCountersSamplerRecord( + const rocprofiler_record_counters_sampler_t *counters_sampler_record) { + for (uint32_t i = 0; i < counters_sampler_record->num_counters; i++) { + output_file << ",Counter_" << i << "(" + << std::to_string(counters_sampler_record->counters[i].value.value) << ")" + << std::endl; + } + output_file << std::endl; +} + int WriteBufferRecords(const rocprofiler_record_header_t* begin, const rocprofiler_record_header_t* end, rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id) { while (begin < end) { @@ -308,9 +318,16 @@ int WriteBufferRecords(const rocprofiler_record_header_t* begin, const rocprofil FlushPCSamplingRecord(pc_sampling_record); break; } - default: { + case ROCPROFILER_COUNTERS_SAMPLER_RECORD: { + const rocprofiler_record_counters_sampler_t *counters_sampler_record = + reinterpret_cast(begin); + FlushCountersSamplerRecord(counters_sampler_record); break; } + default: { + std::cout <<"unknown record\n"; + break; + } } rocprofiler_next_record(begin, &begin, session_id, buffer_id); } diff --git a/samples/counters_sampler/pcie_counters_example.cpp b/samples/counters_sampler/pcie_counters_example.cpp new file mode 100644 index 0000000000..da070d277c --- /dev/null +++ b/samples/counters_sampler/pcie_counters_example.cpp @@ -0,0 +1,103 @@ +#include "../common/common.h" + +int main(int argc, char** argv) { + int* gpuMem; + int counter_option = 0; + + std::vector pcie_counters = { + "CI_PERF_slv_MemRd_Bandwidth0", "CI_PERF_slv_MemWr_Bandwidth0", "CI_PERF_slv_totalMemRdTx", + "CI_PERF_slv_totalMemWrTx", "CI_PERF_slv_totalTx"}; + + if(argc > 1) { + counter_option = atoi(argv[1]); + } + else{ + std::cout<< "Please provide one of the counter index options as argument:\n"; + for(int i = 0; i < pcie_counters.size(); i++){ + std::cout<< "[" << i << "]: " << pcie_counters[i] << std::endl; + } + std::cout<< "Example:\n ./pcie_counters_sampler 1\n"; + exit(0); + } + + prepare(); + // Initialize the tools + CHECK_ROCPROFILER(rocprofiler_initialize()); + + // Creating the session with given replay mode + rocprofiler_session_id_t session_id; + CHECK_ROCPROFILER(rocprofiler_create_session(ROCPROFILER_KERNEL_REPLAY_MODE, &session_id)); + + // Creating Output Buffer for the data + rocprofiler_buffer_id_t buffer_id; + CHECK_ROCPROFILER(rocprofiler_create_buffer( + session_id, + [](const rocprofiler_record_header_t* record, const rocprofiler_record_header_t* end_record, + rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id) { + WriteBufferRecords(record, end_record, session_id, buffer_id); + }, + 0x999999, &buffer_id)); + + // Counters Sampler Filter + rocprofiler_filter_id_t filter_id; + [[maybe_unused]] rocprofiler_filter_property_t property = {}; + + + rocprofiler_counters_sampler_counter_input_t counters_input[2] = { + {.name = const_cast(pcie_counters[counter_option].c_str()), + .type = ROCPROFILER_COUNTERS_SAMPLER_PCIE_COUNTERS}}; + + uint32_t rate = 1000; + uint32_t duration = 5000; + + rocprofiler_counters_sampler_parameters_t cs_parameters = {.counters = counters_input, + .counters_num = 1, + .sampling_rate = rate, + .sampling_duration = duration, + .gpu_agent_index = 0}; + CHECK_ROCPROFILER( + rocprofiler_create_filter(session_id, ROCPROFILER_COUNTERS_SAMPLER, + rocprofiler_filter_data_t{.counters_sampler_parameters = cs_parameters}, + 0, &filter_id, property)); + CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id)); + + // Normal HIP Calls + hipDeviceProp_t devProp; + HIP_CALL(hipGetDeviceProperties(&devProp, 0)); + HIP_CALL(hipMalloc((void**)&gpuMem, 1 * sizeof(int))); + + // KernelA and KernelB won't be profiled + kernelCalls('A'); + kernelCalls('B'); + + std::cout << "Collecting samples for: " << pcie_counters[counter_option] + << " ; sampling rate: " << rate << " ms; duration: " << duration << " ms" << std::endl; + // Activating the session + CHECK_ROCPROFILER(rocprofiler_start_session(session_id)); + + // KernelC, KernelD, KernelE and KernelF to be profiled as part of the session + kernelCalls('C'); + kernelCalls('D'); + kernelCalls('E'); + kernelCalls('F'); + // Normal HIP Calls + HIP_CALL(hipFree(gpuMem)); + + // allow sampler to run for 10 secs + sleep(6); + + // Deactivating session + CHECK_ROCPROFILER(rocprofiler_terminate_session(session_id)); + + // Manual Flush user buffer request + CHECK_ROCPROFILER(rocprofiler_flush_data(session_id, buffer_id)); + + // Destroy sessions + CHECK_ROCPROFILER(rocprofiler_destroy_session(session_id)); + + // Destroy all profiling related objects(User buffer, sessions, filters, + // etc..) + CHECK_ROCPROFILER(rocprofiler_finalize()); + + return 0; +} \ No newline at end of file diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt index 822ee4121e..d2c5130fd1 100644 --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@ -174,10 +174,12 @@ set(CORE_SESSION_DIR ${PROJECT_SOURCE_DIR}/src/core/session) file(GLOB CORE_SESSION_SRC_FILES ${CORE_SESSION_DIR}/session.cpp) file(GLOB CORE_FILTER_SRC_FILES ${CORE_SESSION_DIR}/filter.cpp) file(GLOB CORE_DEVICE_PROFILING_SRC_FILES ${CORE_SESSION_DIR}/device_profiling.cpp) +file(GLOB CORE_COUNTERS_SAMPLER_SRC_FILES ${CORE_SESSION_DIR}/counters_sampler.cpp) file(GLOB CORE_COUNTERS_SRC_FILES ${PROJECT_BINARY_DIR}/src/api/*_counter.cpp) file(GLOB CORE_COUNTERS_PARENT_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/counters/*.cpp) file(GLOB CORE_COUNTERS_METRICS_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/counters/metrics/*.cpp) +file(GLOB CORE_COUNTERS_MMIO_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/counters/mmio/*.cpp) set(CORE_UTILS_DIR ${PROJECT_SOURCE_DIR}/src/utils) file(GLOB CORE_UTILS_SRC_FILES ${CORE_UTILS_DIR}/*.cpp) @@ -198,8 +200,10 @@ add_library(${ROCPROFILER_TARGET} SHARED ${CORE_SESSION_SRC_FILES} ${CORE_FILTER_SRC_FILES} ${CORE_DEVICE_PROFILING_SRC_FILES} + ${CORE_COUNTERS_SAMPLER_SRC_FILES} ${CORE_COUNTERS_PARENT_SRC_FILES} ${CORE_COUNTERS_METRICS_SRC_FILES} + ${CORE_COUNTERS_MMIO_SRC_FILES} ${CORE_UTILS_SRC_FILES} ${CORE_HSA_PACKETS_SRC_FILES} ${CORE_HSA_QUEUES_SRC_FILES} diff --git a/src/core/counters/mmio/mmio.cpp b/src/core/counters/mmio/mmio.cpp new file mode 100644 index 0000000000..c336630568 --- /dev/null +++ b/src/core/counters/mmio/mmio.cpp @@ -0,0 +1,189 @@ +/* Copyright (c) 2023 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "mmio.h" +#include +#include "../../../utils/helper.h" +#include "pcie_perfmon_registers_mi200.h" + +namespace rocmtools { + +namespace mmio { + +void PrintFunctionPhase(const char* function_name, int phase){ +#if defined(DEBUG_TRACE) + if (phase == 0) + std::clog << function_name << "() START" << std::endl; + else + std::clog << function_name << "() END" << std::endl; +#endif +} + +void PrintRegisterData(uint32_t& index_value, uint32_t& data_value, const char* function_name, int phase){ +#if defined(DEBUG_TRACE) + if (phase == 0) { + PrintFunctionPhase(function_name, phase); + std::clog << "Old (index, data) : "<< std::hex << index_value << " " << data_value << std::endl; + } + else { + std::clog << "New (index, data) : "<< std::hex << index_value << " " << data_value << std::endl; + PrintFunctionPhase(function_name, phase); + } +#endif +} + +MMIO::MMIO(const Agent::AgentInfo& info) : agent_info_(&info), pci_memory_(nullptr), type_(DEFAULT_MMAP) { + const auto pci_domain = agent_info_->getPCIDomain(); + const auto pci_location_id = agent_info_->getPCILocationID(); + + pci_device_ = + pci_device_find_by_slot(pci_domain, pci_location_id >> 8, pci_location_id & 0xFF, 0); + if (!pci_device_ || pci_device_probe(pci_device_)) fatal("failed to probe the GPU device\n"); + + // Look for a region between 256KB and 4096KB, 32-bit, non IO, and non prefetchable. + for (size_t region = 0; region < sizeof(pci_device::regions) / sizeof(pci_device::regions[0]); + ++region) + if (pci_device_->regions[region].is_64 == 0 && + pci_device_->regions[region].is_prefetchable == 0 && + pci_device_->regions[region].is_IO == 0 && + pci_device_->regions[region].size >= (256UL * 1024) && + pci_device_->regions[region].size <= (4096UL * 1024)) { + pci_memory_size_ = pci_device_->regions[region].size; + int err = pci_device_map_range(pci_device_, pci_device_->regions[region].base_addr, + pci_device_->regions[region].size, PCI_DEV_MAP_FLAG_WRITABLE, + (void**)&pci_memory_); + if(err) + fatal("failed to map the registers. Error code: %d\n", err); + } + + if (pci_memory_ == nullptr) fatal("could not find the pci memory address\n"); + + SetIndexDataRegisters(INDIRECT_REG_INDEX, INDIRECT_REG_DATA); +} + +MMIO::~MMIO() { + if (pci_memory_) + { + int err = pci_device_unmap_range(pci_device_, pci_memory_, pci_memory_size_); + if(err) warning("failed to unmap the pci memory. Error code: %d\n", err); + } +} + +bool MMIO::RegisterWriteAPI(uint32_t reg_offset, uint32_t value){ + // access the mmap + // write register offset to index register 0x38 of index/data pair (indirect addressing) + // write register bits to data register 0x3c of index/data pair (indirect addressing) + + // std::lock_guard lock(mutex_); + PrintRegisterData(*index_reg_addr, *data_reg_addr, __FUNCTION__, 0); + + // TODO: should work only if map is created + + *index_reg_addr = reg_offset; + *data_reg_addr = value; + + PrintRegisterData(*index_reg_addr, *data_reg_addr, __FUNCTION__, 1); + return true; +} + +bool MMIO::RegisterReadAPI(uint32_t reg_offset, uint32_t& value){ + // access the mmap + // write register offset to index register 0x38 of index/data pair (indirect addressing) + // read register bits to data register 0x3c of index/data pair (indirect addressing) + + // std::lock_guard lock(mutex_); + PrintRegisterData(*index_reg_addr, *data_reg_addr, __FUNCTION__, 0); + + // TODO: should work only if map is created + + *index_reg_addr = reg_offset; + // TODO: add delay here?? + value = *data_reg_addr; + + PrintRegisterData(*index_reg_addr, *data_reg_addr, __FUNCTION__, 1); + return true; +} + + +MMIO* MMIOManager::CreateMMIO(mmap_type_t type, const Agent::AgentInfo& info) { + MMIO* mmio = nullptr; + switch (type) { + case PCIE_PERFMON: { + mmio = GetMMIOInstance(type, info); + if(mmio == nullptr){ + mmio = dynamic_cast(new PciePerfmonMMIO(info)); + AddInstance(mmio); + } + break; + } + case DF_PERFMON: { + break; + } + case UMC_PERFMON: { + break; + } + case DEFAULT_MMAP: { + break; + } + } + return mmio; +} + +MMIO* MMIOManager::GetMMIOInstance(mmap_type_t type, const Agent::AgentInfo& info) { + MMIO* mmio = nullptr; + auto it = mmio_instances_.find(info.getHandle()); + if(it != mmio_instances_.end()){ + for(auto& mmio_instance: it->second){ + if(mmio_instance->Type() == type){ + mmio = mmio_instance; + } + } + } + return mmio; +} + +void MMIOManager::AddInstance(MMIO* in_mmio_instance) { + uint64_t handle = in_mmio_instance->GetAgentInfo().getHandle(); + mmio_instances_[handle].push_back(in_mmio_instance); +} + +void MMIOManager::DestroyMMIOInstance(MMIO* in_mmio_instance) { + if(in_mmio_instance == nullptr) + return; + + uint64_t handle = in_mmio_instance->GetAgentInfo().getHandle(); + auto it = mmio_instances_.find(handle); + if(it != mmio_instances_.end()){ + auto& mmio_array = it->second; + // find instance in the array and remove it from the array + mmio_array.erase(std::remove(mmio_array.begin(), mmio_array.end(), in_mmio_instance), mmio_array.end()); + } + delete in_mmio_instance; +} + + +std::map> MMIOManager::mmio_instances_; + + +} // namespace mmio + +} // namespace rocmtools + + \ No newline at end of file diff --git a/src/core/counters/mmio/mmio.h b/src/core/counters/mmio/mmio.h new file mode 100644 index 0000000000..eb1e6dd69c --- /dev/null +++ b/src/core/counters/mmio/mmio.h @@ -0,0 +1,122 @@ +/* Copyright (c) 2023 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + + #ifndef SRC_CORE_NON_GFXIP_COUNTERS_MMIO_H + #define SRC_CORE_NON_GFXIP_COUNTERS_MMIO_H + +#include +#include "src/core/hardware/hsa_info.h" + +#include +#include + +#include +#include +#include + +namespace rocmtools { + +namespace mmio { + +#define FUNCTION_START() PrintFunctionPhase(__FUNCTION__, 0) +#define FUNCTION_END() PrintFunctionPhase(__FUNCTION__, 1) + +// uncomment below to see register write sequences +// #define DEBUG_TRACE = 1 + +void PrintFunctionPhase(const char* function_name, int phase); +void PrintRegisterData(uint32_t& index_value, uint32_t& data_value, const char* function_name, int phase); + +// Index/Data registers +const static uint32_t INDIRECT_REG_INDEX = 0x38; +const static uint32_t INDIRECT_REG_DATA = 0x3c; + +typedef enum { DEFAULT_MMAP, DF_PERFMON, UMC_PERFMON, PCIE_PERFMON } mmap_type_t; + +class MMIOManager; + +class MMIO { + public: + virtual bool RegisterWriteAPI(uint32_t reg_offset, uint32_t value); + virtual bool RegisterReadAPI(uint32_t reg_offset, uint32_t& value); + virtual void SetIndexDataRegisters(const uint32_t index_reg, const uint32_t data_reg) { + index_reg_addr = (uint32_t*)((char*)pci_memory_ + index_reg); + data_reg_addr = (uint32_t*)((char*)pci_memory_ + data_reg); + } + + MMIO(MMIO& other) = delete; + void operator=(const MMIO&) = delete; + virtual ~MMIO(); + friend class MMIOManager; + + const Agent::AgentInfo& GetAgentInfo() { return *agent_info_; } + mmap_type_t Type() { return type_; } + + protected: + MMIO(const Agent::AgentInfo& info); + + // default constructor; helpful for derived classes + // which want to setup mmio construction differently + MMIO(){ type_ = DEFAULT_MMAP; }; + + const Agent::AgentInfo* agent_info_; + struct pci_device* pci_device_; + size_t pci_memory_size_; + uint32_t* pci_memory_; + mmap_type_t type_; + + uint32_t* index_reg_addr; + uint32_t* data_reg_addr; +}; + +// PciePerfmonMMIO has same mmio setup approach as +// done in MMIO class +class PciePerfmonMMIO : public MMIO{ + public: + friend class MMIOManager; + protected: + PciePerfmonMMIO(const Agent::AgentInfo& info): MMIO(info) { + type_ = PCIE_PERFMON; + }; +}; + +/* + Class to manage mmio for UMC/DF/PCIe etc. + The mmio approach for the different IPs may + be same or different. For eg: UMC and PCIe share + the same mmio and index/data registers +*/ +class MMIOManager { + public: + static MMIO* CreateMMIO(mmap_type_t type, const Agent::AgentInfo& info); + static MMIO* GetMMIOInstance(mmap_type_t type, const Agent::AgentInfo& info); + static void DestroyMMIOInstance(MMIO* instance); + private: + static void AddInstance(MMIO* instance); + static std::map> mmio_instances_; +}; + + +} // namespace mmio + +} // namespace rocmtools + + +#endif \ No newline at end of file diff --git a/src/core/counters/mmio/pcie_counters_mi200.cpp b/src/core/counters/mmio/pcie_counters_mi200.cpp new file mode 100644 index 0000000000..f59c9250cf --- /dev/null +++ b/src/core/counters/mmio/pcie_counters_mi200.cpp @@ -0,0 +1,116 @@ +#include "pcie_counters_mi200.h" +#include "pcie_perfmon_registers_mi200.h" +#include "perfmon.h" + +namespace rocmtools { + +PciePerfMonMI200::PciePerfMonMI200(const Agent::AgentInfo& info) : PerfMon(), mmio_(nullptr) { + mmio_ = + dynamic_cast(mmio::MMIOManager::CreateMMIO(mmio::PCIE_PERFMON, info)); +} + +PciePerfMonMI200::~PciePerfMonMI200() { + mmio::MMIOManager::DestroyMMIOInstance(dynamic_cast(mmio_)); +} + +void PciePerfMonMI200::writeRegister(uint32_t reg_offset, uint32_t value){ + // mmio or ioctl approaches + mmio_->RegisterWriteAPI(reg_offset, value); +} + +void PciePerfMonMI200::readRegister(uint32_t reg_offset, uint32_t& value){ + // mmio or ioctl approaches + mmio_->RegisterReadAPI(reg_offset, value); +} + +void PciePerfMonMI200::SetCounterNames(std::vector& counter_names) { + counter_names_ = counter_names; + // TODO: only one event at a time is supported at the moment + auto it = PCIE_MI200::pcie_events_table.find(counter_names[0]); + if (it != PCIE_MI200::pcie_events_table.end()) { + PCIE_MI200::pcie_event_t event_desc = it->second; + if (event_desc.event_category == PCIE_MI200::RX_TILE_SCLK) { + event_id_ = event_desc.event_id; + } + } +} + +void PciePerfMonMI200::Start(){ + // TODO: make sure values stored in table + // in registers header are dec and not hex + + Start_RX_TILE_SCLK(event_id_); +} + +void PciePerfMonMI200::Stop(){ + // TODO: revisit correct value to stop + writeRegister(PCIE_MI200::PCIE_PERF_COUNT_CNTL, 0x2); // stop +} + +void PciePerfMonMI200::Read(std::vector& values){ + uint64_t val=0; + Read_RX_TILE_SCLK(val); + rocprofiler_counters_sampler_counter_output_t value = { + ROCPROFILER_COUNTERS_SAMPLER_PCIE_COUNTERS, + static_cast(val) + }; + values.push_back(value); +} + +void PciePerfMonMI200::Start_RX_TILE_TXCLK(uint32_t event){ + + // Step 1: PORT SEL update + writeRegister(PCIE_MI200::PCIE_PERF_CNTL_EVENT_CI_PORT_SEL, 0x0); + + // Step 2: EVENT SEL update + uint32_t value = event; // last 8 bits for event + writeRegister(PCIE_MI200::PCIE_PERF_CNTL_TXCLK3, value); + + // Steps 3 & 4: Performance counters initialization, enable: + // TODO: revisit. Just a single write with 0x3 might be enough (check with pcie team) + writeRegister(PCIE_MI200::PCIE_PERF_COUNT_CNTL, 0x5); + +} + +void PciePerfMonMI200::Read_RX_TILE_TXCLK(uint64_t& result){ + // Step 5: Performance counters read: + uint32_t lo_val, hi_val; + readRegister(PCIE_MI200::PCIE_PERF_COUNT0_TXCLK3, lo_val); + readRegister(PCIE_MI200::PCIE_PERF_COUNT0_UPVAL_TXCLK3, hi_val); + + // Combine the lo and hi values and put them in result + uint64_t val = (hi_val & 0xFFFFUL); + val = val << 32; + result = val | lo_val; +} + +void PciePerfMonMI200::Start_RX_TILE_SCLK(uint32_t event){ + + // Step 1: PORT SEL update + writeRegister(PCIE_MI200::PCIE_PERF_CNTL_EVENT_CI_PORT_SEL, 0x0); + + // Step 2: EVENT SEL update + uint32_t value = event; // last 8 bits for event + writeRegister(PCIE_MI200::PCIE_PERF_CNTL_LCLK1, value); + + // Steps 3 & 4: Performance counters initialization, enable: + // TODO: revisit. Just a single write with 0x3 might be enough (check with pcie team) + writeRegister(PCIE_MI200::PCIE_PERF_COUNT_CNTL, 0x5); + +} + +void PciePerfMonMI200::Read_RX_TILE_SCLK(uint64_t& result){ + // Step 5: Performance counters read: + uint32_t lo_val, hi_val; + readRegister(PCIE_MI200::PCIE_PERF_COUNT0_LCLK1, lo_val); + readRegister(PCIE_MI200::PCIE_PERF_COUNT0_UPVAL_LCLK1, hi_val); + + // Combine the lo and hi values and put them in result + uint64_t val = (hi_val & 0xFFFFUL); + val = val << 32; + result = val | lo_val; +} + +} // namespace rocmtools + + diff --git a/src/core/counters/mmio/pcie_counters_mi200.h b/src/core/counters/mmio/pcie_counters_mi200.h new file mode 100644 index 0000000000..94cbc58dec --- /dev/null +++ b/src/core/counters/mmio/pcie_counters_mi200.h @@ -0,0 +1,43 @@ +#ifndef PCIE_COUNTERS_MI200_H +#define PCIE_COUNTERS_MI200_H + +#include "mmio.h" +#include "perfmon.h" + +namespace rocmtools { + +/* + One perfmon per GPU. + Only one instance per GPU, per process +*/ + +class PciePerfMonMI200 : public PerfMon { + public: + PciePerfMonMI200(const Agent::AgentInfo& info); + ~PciePerfMonMI200(); + void SetCounterNames(std::vector& counter_names) override; + void Start() override; + void Stop() override; + void Read(std::vector& values) override; + mmio::mmap_type_t Type() override { return mmio::mmap_type_t::PCIE_PERFMON; } + + private: + // TODO : check google coding std + void writeRegister(uint32_t reg_offset, uint32_t value); + void readRegister(uint32_t reg_offset, uint32_t& value); + + void Start_RX_TILE_TXCLK(uint32_t event); + void Read_RX_TILE_TXCLK(uint64_t& result); + + void Start_RX_TILE_SCLK(uint32_t event); + void Read_RX_TILE_SCLK(uint64_t& result); + + private: + mmio::PciePerfmonMMIO* mmio_; + std::vector counter_names_; + int event_id_; +}; + +} // namespace rocmtools + +#endif \ No newline at end of file diff --git a/src/core/counters/mmio/pcie_perfmon_registers_mi200.h b/src/core/counters/mmio/pcie_perfmon_registers_mi200.h new file mode 100644 index 0000000000..9678bf85eb --- /dev/null +++ b/src/core/counters/mmio/pcie_perfmon_registers_mi200.h @@ -0,0 +1,327 @@ +#ifndef PCIE_PERFMON_REGISTERS_MI200_H +#define PCIE_PERFMON_REGISTERS_MI200_H + +#include + +namespace PCIE_MI200 { + +// -------- RX Tile TXCLK Start -------- + +// Step 1: PORT SEL update +const static uint32_t PCIE_PERF_CNTL_EVENT_CI_PORT_SEL = 0x11180250; + +// Step 2: EVENT SEL update +const static uint32_t PCIE_PERF_CNTL_TXCLK1 = 0x11180204; +const static uint32_t PCIE_PERF_CNTL_TXCLK2 = 0x11180210; +const static uint32_t PCIE_PERF_CNTL_TXCLK3 = 0x1118021C; //# +const static uint32_t PCIE_PERF_CNTL_TXCLK4 = 0x11180228; //# +const static uint32_t PCIE_PERF_CNTL_TXCLK5 = 0x11180258; +const static uint32_t PCIE_PERF_CNTL_TXCLK6 = 0x11180264; +const static uint32_t PCIE_PERF_CNTL_TXCLK7 = 0x11180888; +const static uint32_t PCIE_PERF_CNTL_TXCLK8 = 0x11180894; +const static uint32_t PCIE_PERF_CNTL_TXCLK9 = 0x111808A0; +const static uint32_t PCIE_PERF_CNTL_TXCLK10 = 0x111808AC; + +// Steps 3 & 4: Performance counters initialization, enable: +const static uint32_t PCIE_PERF_COUNT_CNTL = 0x11180200; + +// Step 5: Performance counters read: +const static uint32_t PCIE_PERF_COUNT0_TXCLK1 = 0x11180208; +const static uint32_t PCIE_PERF_COUNT0_TXCLK2 = 0x11180214; +const static uint32_t PCIE_PERF_COUNT0_TXCLK3 = 0x11180220; //# +const static uint32_t PCIE_PERF_COUNT0_TXCLK4 = 0x1118022C; //# +const static uint32_t PCIE_PERF_COUNT0_TXCLK5 = 0x1118025C; +const static uint32_t PCIE_PERF_COUNT0_TXCLK6 = 0x11180268; +const static uint32_t PCIE_PERF_COUNT0_TXCLK7 = 0x1118088C; +const static uint32_t PCIE_PERF_COUNT0_TXCLK8 = 0x11180898; +const static uint32_t PCIE_PERF_COUNT0_TXCLK9 = 0x111808A4; +const static uint32_t PCIE_PERF_COUNT0_TXCLK10 = 0x111808B0; + +const static uint32_t PCIE_PERF_COUNT0_UPVAL_TXCLK1 = 0x111808E8; +const static uint32_t PCIE_PERF_COUNT0_UPVAL_TXCLK2 = 0x111808F0; +const static uint32_t PCIE_PERF_COUNT0_UPVAL_TXCLK3 = 0x111808F8; //# +const static uint32_t PCIE_PERF_COUNT0_UPVAL_TXCLK4 = 0x11180900; //# +const static uint32_t PCIE_PERF_COUNT0_UPVAL_TXCLK5 = 0x11180908; +const static uint32_t PCIE_PERF_COUNT0_UPVAL_TXCLK6 = 0x11180910; +const static uint32_t PCIE_PERF_COUNT0_UPVAL_TXCLK7 = 0x11180918; +const static uint32_t PCIE_PERF_COUNT0_UPVAL_TXCLK8 = 0x11180920; +const static uint32_t PCIE_PERF_COUNT0_UPVAL_TXCLK9 = 0x11180928; +const static uint32_t PCIE_PERF_COUNT0_UPVAL_TXCLK10 = 0x11180930; + +const static uint32_t PCIE_PERF_COUNT1_TXCLK1 = 0x1118020C; +const static uint32_t PCIE_PERF_COUNT1_TXCLK2 = 0x11180218; +const static uint32_t PCIE_PERF_COUNT1_TXCLK3 = 0x11180224; //# +const static uint32_t PCIE_PERF_COUNT1_TXCLK4 = 0x11180230; //# +const static uint32_t PCIE_PERF_COUNT1_TXCLK5 = 0x11180260; +const static uint32_t PCIE_PERF_COUNT1_TXCLK6 = 0x1118026C; +const static uint32_t PCIE_PERF_COUNT1_TXCLK7 = 0x11180890; +const static uint32_t PCIE_PERF_COUNT1_TXCLK8 = 0x1118089C; +const static uint32_t PCIE_PERF_COUNT1_TXCLK9 = 0x111808A8; +const static uint32_t PCIE_PERF_COUNT1_TXCLK10 = 0x111808B4; + +const static uint32_t PCIE_PERF_COUNT1_UPVAL_TXCLK1 = 0x111808EC; +const static uint32_t PCIE_PERF_COUNT1_UPVAL_TXCLK2 = 0x111808F4; +const static uint32_t PCIE_PERF_COUNT1_UPVAL_TXCLK3 = 0x111808FC; //# +const static uint32_t PCIE_PERF_COUNT1_UPVAL_TXCLK4 = 0x11180904; //# +const static uint32_t PCIE_PERF_COUNT1_UPVAL_TXCLK5 = 0x1118090C; +const static uint32_t PCIE_PERF_COUNT1_UPVAL_TXCLK6 = 0x11180914; +const static uint32_t PCIE_PERF_COUNT1_UPVAL_TXCLK7 = 0x1118091C; +const static uint32_t PCIE_PERF_COUNT1_UPVAL_TXCLK8 = 0x11180924; +const static uint32_t PCIE_PERF_COUNT1_UPVAL_TXCLK9 = 0x1118092C; +const static uint32_t PCIE_PERF_COUNT1_UPVAL_TXCLK10 = 0x11180934; + + +// -------- RX Tile TXCLK End -------- + +// -------- RX Tile SCLK Start -------- + +// Step 1: PORT SEL update +// PCIE_PERF_CNTL_EVENT_CI_PORT_SEL + +// Step 2: EVENT SEL update +const static uint32_t PCIE_PERF_CNTL_LCLK1 = 0x11180234; +const static uint32_t PCIE_PERF_CNTL_LCLK2 = 0x11180240; +const static uint32_t PCIE_PERF_CNTL_LCLK3 = 0x11180270; +const static uint32_t PCIE_PERF_CNTL_LCLK4 = 0x1118027C; +const static uint32_t PCIE_PERF_CNTL_LCLK5 = 0x111808B8; +const static uint32_t PCIE_PERF_CNTL_LCLK6 = 0x111808C4; +const static uint32_t PCIE_PERF_CNTL_LCLK7 = 0x111808D0; +const static uint32_t PCIE_PERF_CNTL_LCLK8 = 0x111808DC; + +// Step 5: Performance counters read: +const static uint32_t PCIE_PERF_COUNT0_LCLK1 = 0x11180238; +const static uint32_t PCIE_PERF_COUNT0_LCLK2 = 0x11180244; +const static uint32_t PCIE_PERF_COUNT0_LCLK3 = 0x11180274; +const static uint32_t PCIE_PERF_COUNT0_LCLK4 = 0x11180280; +const static uint32_t PCIE_PERF_COUNT0_LCLK5 = 0x111808BC; +const static uint32_t PCIE_PERF_COUNT0_LCLK6 = 0x111808C8; +const static uint32_t PCIE_PERF_COUNT0_LCLK7 = 0x111808D4; +const static uint32_t PCIE_PERF_COUNT0_LCLK8 = 0x111808E0; + +const static uint32_t PCIE_PERF_COUNT0_UPVAL_LCLK1 = 0x11180938; +const static uint32_t PCIE_PERF_COUNT0_UPVAL_LCLK2 = 0x11180940; +const static uint32_t PCIE_PERF_COUNT0_UPVAL_LCLK3 = 0x11180948; +const static uint32_t PCIE_PERF_COUNT0_UPVAL_LCLK4 = 0x11180950; +const static uint32_t PCIE_PERF_COUNT0_UPVAL_LCLK5 = 0x11180958; +const static uint32_t PCIE_PERF_COUNT0_UPVAL_LCLK6 = 0x11180960; +const static uint32_t PCIE_PERF_COUNT0_UPVAL_LCLK7 = 0x11180968; +const static uint32_t PCIE_PERF_COUNT0_UPVAL_LCLK8 = 0x11180970; + +const static uint32_t PCIE_PERF_COUNT1_LCLK1 = 0x1118023C; +const static uint32_t PCIE_PERF_COUNT1_LCLK2 = 0x11180248; +const static uint32_t PCIE_PERF_COUNT1_LCLK3 = 0x11180278; +const static uint32_t PCIE_PERF_COUNT1_LCLK4 = 0x11180284; +const static uint32_t PCIE_PERF_COUNT1_LCLK5 = 0x111808C0; +const static uint32_t PCIE_PERF_COUNT1_LCLK6 = 0x111808CC; +const static uint32_t PCIE_PERF_COUNT1_LCLK7 = 0x111808D8; +const static uint32_t PCIE_PERF_COUNT1_LCLK8 = 0x111808E4; + +const static uint32_t PCIE_PERF_COUNT1_UPVAL_LCLK1 = 0x1118093C; +const static uint32_t PCIE_PERF_COUNT1_UPVAL_LCLK2 = 0x11180944; +const static uint32_t PCIE_PERF_COUNT1_UPVAL_LCLK3 = 0x1118094C; +const static uint32_t PCIE_PERF_COUNT1_UPVAL_LCLK4 = 0x11180954; +const static uint32_t PCIE_PERF_COUNT1_UPVAL_LCLK5 = 0x1118095C; +const static uint32_t PCIE_PERF_COUNT1_UPVAL_LCLK6 = 0x11180964; +const static uint32_t PCIE_PERF_COUNT1_UPVAL_LCLK7 = 0x1118096C; +const static uint32_t PCIE_PERF_COUNT1_UPVAL_LCLK8 = 0x11180974; + +// -------- RX Tile SCLK End ---------- + +typedef enum{ + TX_TILE_TXCLK = 0, + TX_TILE_SCLK = 1, + RX_TILE_TXCLK = 2, + RX_TILE_SCLK = 3, + LC_TILE_TXCLK = 4 +}pcie_event_category_t; + +struct pcie_event_t{ + pcie_event_t(int id, pcie_event_category_t cat): event_id(id), event_category(cat){} + int event_id; + pcie_event_category_t event_category; +}; + +const static std::map pcie_events_table = { +{"RX_PERF_RXP_RX_TailEdb_A[0]", {2, RX_TILE_TXCLK}}, +{"RX_PERF_RXP_RX_TailEdb_A[1]", {3, RX_TILE_TXCLK}}, +{"RX_PERF_RXP_RX_TailEdb_A[2]", {4, RX_TILE_TXCLK}}, +{"RX_PERF_RXP_RX_TailEdb_A[3]", {5, RX_TILE_TXCLK}}, +{"RX_PERF_RXP_RX_TailEnd_A[0]", {6, RX_TILE_TXCLK}}, +{"RX_PERF_RXP_RX_TailEnd_A[1]", {7, RX_TILE_TXCLK}}, +{"RX_PERF_RXP_RX_TailEnd_A[2]", {8, RX_TILE_TXCLK}}, +{"RX_PERF_RXP_RX_TailEnd_A[3]", {9, RX_TILE_TXCLK}}, +{"RX_PERF_RXP_RX_HeadSdp_A[0]", {10, RX_TILE_TXCLK}}, +{"RX_PERF_RXP_RX_HeadSdp_A[1]", {11, RX_TILE_TXCLK}}, +{"RX_PERF_RXP_RX_HeadSdp_A[2]", {12, RX_TILE_TXCLK}}, +{"RX_PERF_RXP_RX_HeadSdp_A[3]", {13, RX_TILE_TXCLK}}, +{"RX_PERF_RXP_RX_HeadStp_A[0]", {14, RX_TILE_TXCLK}}, +{"RX_PERF_RXP_RX_HeadStp_A[1]", {15, RX_TILE_TXCLK}}, +{"RX_PERF_RXP_RX_HeadStp_A[2]", {16, RX_TILE_TXCLK}}, +{"RX_PERF_RXP_RX_HeadStp_A[3]", {17, RX_TILE_TXCLK}}, +{"RX_PERF_RXCRC_nullified_tlp_A", {18, RX_TILE_TXCLK}}, +{"RX_PERF_RXCRC_valid_crc_A", {19, RX_TILE_TXCLK}}, +{"RX_PERF_RXCRC_invalid_crc_A", {20, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_vendor_type1_A", {21, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_vendor_type0_A", {22, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_set_slot_power_limit_A", {23, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_unlock_A", {24, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_err_fatal_A", {25, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_err_nonfatal_A", {26, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_err_corr_A", {27, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_pme_to_ack_A", {28, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_pme_turn_off_A", {29, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_pm_pme_A", {30, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_pm_active_state_nak_A", {31, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_deassert_intd_A", {32, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_deassert_intc_A", {33, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_deassert_intb_A", {34, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_deassert_inta_A", {35, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_assert_intd_A", {36, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_assert_intc_A", {37, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_assert_intb_A", {38, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_assert_inta_A", {39, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_valid_A", {40, RX_TILE_TXCLK}}, +{"RX_PERF_RMSG_unsupported_A", {41, RX_TILE_TXCLK}}, +{"RX_PERF_RCB_unexpected_cpl_A", {42, RX_TILE_TXCLK}}, +{"RX_PERF_RCB_timeout_cpl_A", {43, RX_TILE_TXCLK}}, +{"RX_PERF_HDS_tlphdrvalid_A", {44, RX_TILE_TXCLK}}, +{"RX_PERF_HDS_tlpdatavalid_A", {45, RX_TILE_TXCLK}}, +{"RX_PERF_GAN_bad_tlp_A", {46, RX_TILE_TXCLK}}, +{"RX_PERF_GAN_nak_A", {47, RX_TILE_TXCLK}}, +{"RX_PERF_GAN_ack_A", {48, RX_TILE_TXCLK}}, +{"RX_PERF_FE_unsupported_req_A", {49, RX_TILE_TXCLK}}, +{"RX_PERF_FE_unsupported_cpl_A", {50, RX_TILE_TXCLK}}, +{"RX_PERF_FE_unexpected_cpl_A", {51, RX_TILE_TXCLK}}, +{"RX_PERF_FE_poisoned_tlp_A", {52, RX_TILE_TXCLK}}, +{"RX_PERF_FE_poisoned_cpl_A", {53, RX_TILE_TXCLK}}, +{"RX_PERF_FE_malformed_tlp_A", {54, RX_TILE_TXCLK}}, +{"RX_PERF_FE_cpl_abort_A", {55, RX_TILE_TXCLK}}, +{"RX_PERF_FE_request_MSG_A", {56, RX_TILE_TXCLK}}, +{"RX_PERF_FE_request_CFG_WR_A", {57, RX_TILE_TXCLK}}, +{"RX_PERF_FE_request_CFG_RD_A", {58, RX_TILE_TXCLK}}, +{"RX_PERF_FE_request_IO_WR_A", {59, RX_TILE_TXCLK}}, +{"RX_PERF_FE_request_IO_RD_A", {60, RX_TILE_TXCLK}}, +{"RX_PERF_FE_request_MEM_WR_A", {61, RX_TILE_TXCLK}}, +{"RX_PERF_FE_request_MEM_RD_A", {62, RX_TILE_TXCLK}}, +{"RX_PERF_FE_length_MST_gt16_A", {63, RX_TILE_TXCLK}}, +{"RX_PERF_FE_length_MST_9to16_A", {64, RX_TILE_TXCLK}}, +{"RX_PERF_FE_length_MST_5to8_A", {65, RX_TILE_TXCLK}}, +{"RX_PERF_FE_length_MST_2to4_A", {66, RX_TILE_TXCLK}}, +{"RX_PERF_FE_length_MST_1_A", {67, RX_TILE_TXCLK}}, +{"RX_PERF_FE_length_SLV_gt32_A", {68, RX_TILE_TXCLK}}, +{"RX_PERF_FE_length_SLV_17to32_A", {69, RX_TILE_TXCLK}}, +{"RX_PERF_FE_length_SLV_9to16_A", {70, RX_TILE_TXCLK}}, +{"RX_PERF_FE_length_SLV_5to8_A", {71, RX_TILE_TXCLK}}, +{"RX_PERF_FE_length_SLV_2to4_A", {72, RX_TILE_TXCLK}}, +{"RX_PERF_FE_length_SLV_1_A", {73, RX_TILE_TXCLK}}, +{"RX_PERF_FE_cpl_status_CA_A", {74, RX_TILE_TXCLK}}, +{"RX_PERF_FE_cpl_status_CRS_A", {75, RX_TILE_TXCLK}}, +{"RX_PERF_FE_cpl_status_UR_A", {76, RX_TILE_TXCLK}}, +{"RX_PERF_FE_cpl_status_SC_A", {77, RX_TILE_TXCLK}}, +{"RX_PERF_DLLP_pm_active_state_request_l1_A", {78, RX_TILE_TXCLK}}, +{"RX_PERF_DLLP_pm_request_ack_A", {79, RX_TILE_TXCLK}}, +{"RX_PERF_DLLP_pm_enter_l23_A", {80, RX_TILE_TXCLK}}, +{"RX_PERF_DLLP_pm_enter_l1_A", {81, RX_TILE_TXCLK}}, +{"RX_PERF_DLLP_error_A", {82, RX_TILE_TXCLK}}, +{"RX_PERF_DLLP_crc_err_A", {83, RX_TILE_TXCLK}}, +{"SB_PERF_FCC_npd_0", {84, RX_TILE_TXCLK}}, +{"SB_PERF_FCC_pd_0", {85, RX_TILE_TXCLK}}, +{"SB_PERF_FCC_nph_0", {86, RX_TILE_TXCLK}}, +{"SB_PERF_FCC_ph_0", {87, RX_TILE_TXCLK}}, +{"SB_PERF_fail_crc_rd_hdr_0", {88, RX_TILE_TXCLK}}, +{"SB_PERF_pass_crc_rd_hdr_0", {89, RX_TILE_TXCLK}}, +{"SB_PERF_fail_crc_wr_hdr_0", {90, RX_TILE_TXCLK}}, +{"SB_PERF_pass_crc_wr_hdr_0", {91, RX_TILE_TXCLK}}, +{"SB_PERF_fail_crc_data_0", {92, RX_TILE_TXCLK}}, +{"SB_PERF_pass_crc_data_0", {93, RX_TILE_TXCLK}}, +{"SB_PERF_invalid_crc_0", {94, RX_TILE_TXCLK}}, +{"SB_PERF_valid_crc_0", {95, RX_TILE_TXCLK}}, +{"SB_PERF_rd_hdr_WEN_0", {96, RX_TILE_TXCLK}}, +{"SB_PERF_wr_hdr_WEN_0", {97, RX_TILE_TXCLK}}, +{"SB_PERF_data_WEN_0", {98, RX_TILE_TXCLK}}, +{"SB_PERF_non_post_rd_from_FE", {99, RX_TILE_TXCLK}}, +{"SB_PERF_non_post_wr_from_FE", {100, RX_TILE_TXCLK}}, +{"SB_PERF_post_req_from_FE", {101, RX_TILE_TXCLK}}, +{"SB_PERF_non_post_rd_from_FE_0", {102, RX_TILE_TXCLK}}, +{"SB_PERF_non_post_wr_from_FE_0", {103, RX_TILE_TXCLK}}, +{"SB_PERF_post_req_from_FE_0", {104, RX_TILE_TXCLK}}, +{"RX_PERF_DLLP_nak_A", {111, RX_TILE_TXCLK}}, +{"RX_PERF_DLLP_ack_A", {112, RX_TILE_TXCLK}}, +{"RX_PERF_allErrors_A", {113, RX_TILE_TXCLK}}, +{"perf_PG_COUNT", {175, RX_TILE_TXCLK}}, +{"perf_NOT_POWER_GATED", {176, RX_TILE_TXCLK}}, +{"perf_POWER_GATED", {177, RX_TILE_TXCLK}}, + +{"SB_PERF_non_post_rd_to_HI", {2, RX_TILE_SCLK}}, +{"SB_PERF_non_post_wr_to_HI", {3, RX_TILE_SCLK}}, +{"SB_PERF_post_req_to_HI", {4, RX_TILE_SCLK}}, +{"SB_PERF_non_post_rd_to_HI_0", {5, RX_TILE_SCLK}}, +{"SB_PERF_non_post_wr_to_HI_0", {6, RX_TILE_SCLK}}, +{"SB_PERF_post_req_to_HI_0", {7, RX_TILE_SCLK}}, +{"SB_PERF_rd_hdr_REN_0", {8, RX_TILE_SCLK}}, +{"SB_PERF_wr_hdr_REN_0", {9, RX_TILE_SCLK}}, +{"SB_PERF_data_REN_0", {10, RX_TILE_SCLK}}, +{"SB_PERF_rd_hdr_empty_0", {11, RX_TILE_SCLK}}, +{"SB_PERF_wr_hdr_empty_0", {12, RX_TILE_SCLK}}, +{"SB_PERF_data_empty_0", {13, RX_TILE_SCLK}}, +{"CI_PERF_slv_total128BRdCpl", {29, RX_TILE_SCLK}}, +{"CI_PERF_slv_total32BMemRdTx", {30, RX_TILE_SCLK}}, +{"CI_PERF_slv_total64BMemRdTx", {31, RX_TILE_SCLK}}, +{"CI_PERF_slv_total16BMemWrTx", {32, RX_TILE_SCLK}}, +{"CI_PERF_slv_total32BMemWrTx", {33, RX_TILE_SCLK}}, +{"CI_PERF_slv_total64BMemWrTx", {34, RX_TILE_SCLK}}, +{"CI_PERF_slv_totalTx", {35, RX_TILE_SCLK}}, +{"CI_PERF_slv_stallGrantGen", {36, RX_TILE_SCLK}}, +{"CI_PERF_slv_totalGrant", {37, RX_TILE_SCLK}}, +{"CI_PERF_slv_txPending", {38, RX_TILE_SCLK}}, +{"CI_PERF_slv_numMemRdLT32B", {39, RX_TILE_SCLK}}, +{"CI_PERF_slv_numMemRdLT16B", {40, RX_TILE_SCLK}}, +{"CI_PERF_slv_totalMemTx", {41, RX_TILE_SCLK}}, +{"CI_PERF_slv_totalMemRdTx", {42, RX_TILE_SCLK}}, +{"CI_PERF_slv_totalMemWrTx", {43, RX_TILE_SCLK}}, +{"CI_PERF_slv_numGrant0", {44, RX_TILE_SCLK}}, +{"CI_PERF_slv_portCntOverFlow_ns0", {45, RX_TILE_SCLK}}, +{"CI_PERF_slv_portCntUnderFlow_ns0", {46, RX_TILE_SCLK}}, +{"CI_PERF_slv_portCntOverFlow_s0", {47, RX_TILE_SCLK}}, +{"CI_PERF_slv_portCntUnderFlow_s0", {48, RX_TILE_SCLK}}, +{"CI_PERF_slv_portCntOverFlow0", {49, RX_TILE_SCLK}}, +{"CI_PERF_slv_portCntUnderFlow0", {50, RX_TILE_SCLK}}, +{"CI_PERF_slv_npNotAccepted_ns0", {51, RX_TILE_SCLK}}, +{"CI_PERF_slv_npNotAccepted_s0", {52, RX_TILE_SCLK}}, +{"CI_PERF_slv_num128BRdCpl0", {53, RX_TILE_SCLK}}, +{"CI_PERF_slv_num32BMemRdTx0", {54, RX_TILE_SCLK}}, +{"CI_PERF_slv_num64BMemRdTx0", {55, RX_TILE_SCLK}}, +{"CI_PERF_slv_num16BMemWrTx0", {56, RX_TILE_SCLK}}, +{"CI_PERF_slv_num32BMemWrTx0", {57, RX_TILE_SCLK}}, +{"CI_PERF_slv_num64BMemWrTx0", {58, RX_TILE_SCLK}}, +{"CI_PERF_slv_MemRd_Bandwidth0", {59, RX_TILE_SCLK}}, +{"CI_PERF_slv_MemWr_Bandwidth0", {60, RX_TILE_SCLK}}, +{"TX_PERF_S_RCLK_s_tag_buf_empty", {61, RX_TILE_SCLK}}, +{"P_request_latency_500ns_or_more", {62, RX_TILE_SCLK}}, +{"P_request_latency_250_to_500ns", {63, RX_TILE_SCLK}}, +{"P_request_latency_100_to_250ns", {64, RX_TILE_SCLK}}, +{"P_request_latency_100ns_or_less", {65, RX_TILE_SCLK}}, +{"NP_request_latency_500ns_or_more", {66, RX_TILE_SCLK}}, +{"NP_request_latency_250_to_500ns", {67, RX_TILE_SCLK}}, +{"NP_request_latency_100_to_250ns", {68, RX_TILE_SCLK}}, +{"NP_request_latency_100ns_or_less", {69, RX_TILE_SCLK}}, +{"CI_PERF_slv_MemRd_wait_for_cpl_slot[0]", {70, RX_TILE_SCLK}}, +{"CI_PERF_slv_MemRd_wait_for_tag[0]", {71, RX_TILE_SCLK}}, +{"CI_PERF_slv_MemRd_wait_for_d_credit[0]", {72, RX_TILE_SCLK}}, +{"CI_PERF_slv_MemRd_wait_for_h_credit[0]", {73, RX_TILE_SCLK}}, +{"CI_PERF_slv_MemWr_wait_for_tag[0]", {74, RX_TILE_SCLK}}, +{"CI_PERF_slv_MemWr_wait_for_d_credit[0]", {75, RX_TILE_SCLK}}, +{"CI_PERF_slv_MemWr_wait_for_h_credit[0]", {76, RX_TILE_SCLK}}, +{"CISLV_PERF_no_VC1_no_tags_q", {77, RX_TILE_SCLK}}, +{"CISLV_PERF_no_VC1_data_credits_q", {78, RX_TILE_SCLK}}, +{"CISLV_PERF_no_VC1_req_credits_q", {79, RX_TILE_SCLK}}, +{"CISLV_PERF_no_cpl_slots_q[0]", {80, RX_TILE_SCLK}}, +{"CISLV_PERF_no_VC0_no_tags_q", {81, RX_TILE_SCLK}}, +{"CISLV_PERF_no_VC0_data_credits_q", {82, RX_TILE_SCLK}}, +{"CISLV_PERF_no_VC0_req_credits_q", {83, RX_TILE_SCLK}} +}; + +} + + +#endif \ No newline at end of file diff --git a/src/core/counters/mmio/perfmon.h b/src/core/counters/mmio/perfmon.h new file mode 100644 index 0000000000..9c90c2347e --- /dev/null +++ b/src/core/counters/mmio/perfmon.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2023 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef SRC_CORE_COUNTERS_PERFMON_H +#define SRC_CORE_COUNTERS_PERFMON_H + +#include "inc/rocprofiler.h" +#include "mmio.h" +#include + +namespace rocmtools { + +class PerfMon { + public: + virtual ~PerfMon(){}; + virtual void Start() = 0; + virtual void Stop() = 0; + virtual void Read(std::vector& values) = 0; + virtual void SetCounterNames(std::vector& counter_names) { + counter_names_ = counter_names; + }; + virtual mmio::mmap_type_t Type() = 0; + + protected: + std::vector counter_names_; +}; + +} // namespace rocmtools + +#endif \ No newline at end of file diff --git a/src/core/memory/generic_buffer.cpp b/src/core/memory/generic_buffer.cpp index 798f64a91d..4ba3ff2a0e 100644 --- a/src/core/memory/generic_buffer.cpp +++ b/src/core/memory/generic_buffer.cpp @@ -220,11 +220,18 @@ bool GetNextRecord(const rocprofiler_record_header_t* record, *next = reinterpret_cast(att_tracer_record + 1); break; } + case ROCPROFILER_COUNTERS_SAMPLER_RECORD: { + const rocprofiler_record_counters_sampler_t* sampler_record = + reinterpret_cast(record); + *next = reinterpret_cast(sampler_record + 1); + break; + } default: const rocprofiler_record_tracer_t* tracer_record = reinterpret_cast(record); *next = reinterpret_cast(tracer_record + 1); // size_to_add = sizeof(rocprofiler_record_header_t); + break; } // const std::byte* ptr = reinterpret_cast(record); // ptr += size_to_add; diff --git a/src/core/session/counters_sampler.cpp b/src/core/session/counters_sampler.cpp new file mode 100644 index 0000000000..18adb675fd --- /dev/null +++ b/src/core/session/counters_sampler.cpp @@ -0,0 +1,168 @@ +/* Copyright (c) 2023 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "counters_sampler.h" +#include "src/core/hsa/hsa_support.h" +#include "src/api/rocmtool.h" +#include "src/core/counters/mmio/pcie_counters_mi200.h" + +namespace rocmtools { + +CountersSampler::CountersSampler( + rocprofiler_buffer_id_t buffer_id, + rocprofiler_filter_id_t filter_id, + rocprofiler_session_id_t session_id) +: buffer_id_(buffer_id) +, filter_id_(filter_id) +, session_id_(session_id) +, pci_system_initialized_(pci_system_init() == 0) + +{ + params_ = rocmtools::GetROCMToolObj() + ->GetSession(session_id_) + ->GetFilter(filter_id_) + ->GetCountersSamplerParameterData(); + + std::vector agents; + rocmtools::hsa_support::GetCoreApiTable().hsa_iterate_agents_fn( + [](hsa_agent_t agent, void* arg) { + auto& agents = *reinterpret_cast*>(arg); + const auto& ai = rocmtools::hsa_support::GetAgentInfo(agent.handle); + if (ai.getType() == HSA_DEVICE_TYPE_GPU) { + agents.emplace_back(agent); + } + return HSA_STATUS_SUCCESS; + }, + &agents); + + // create perfmon instances for the counter types specfied + + // PCIE counters + std::vector pcie_counter_names; + for (int i = 0; i < params_.counters_num; i++) { + if(params_.counters[i].type == ROCPROFILER_COUNTERS_SAMPLER_PCIE_COUNTERS) + pcie_counter_names.push_back(params_.counters[i].name); + } + + if (pcie_counter_names.size() > 0) { + auto agentInfo = rocmtools::hsa_support::GetAgentInfo(agents[params_.gpu_agent_index].handle); + if(agentInfo.getName() == "gfx90a"){ + PciePerfMonMI200* perfmon = new PciePerfMonMI200(agentInfo); + perfmon->SetCounterNames(pcie_counter_names); + perfmon_instances_.push_back(perfmon); + } + } +} + +CountersSampler::~CountersSampler() { + // cleanup perfmon instancess + for (auto &perfmon : perfmon_instances_) { + if(perfmon != nullptr) + delete perfmon; + } + // clean up libpcieaccess resources + // TODO: should be part of mmio class in future + if (pci_system_initialized_) { + pci_system_cleanup(); + pci_system_initialized_ = false; + } +} + +void CountersSampler::Start() { + if (sampler_thread_.joinable()) { return; } + + std::cout << "Sampler Start\n"; + // Start all Perfmons + for (auto& perfmon : perfmon_instances_) { + perfmon->Start(); + } + + // Start polling thread + keep_running_ = true; + sampler_thread_ = std::thread([this]() { SamplerLoop(); }); +} + +void CountersSampler::Stop() { + if (!sampler_thread_.joinable()) { return; } + + std::cout << "Sampler Stop\n"; + // Stop all Perfmons + for (auto& perfmon : perfmon_instances_) { + perfmon->Stop(); + } + + // Stop polling thread + keep_running_ = false; + sampler_thread_.join(); +} + +void CountersSampler::AddRecord(rocprofiler_record_counters_sampler_t &record) { + const auto tool = rocmtools::GetROCMToolObj(); + const auto session = tool->GetSession(session_id_); + const auto buffer = session->GetBuffer(buffer_id_); + + std::lock_guard lk(session->GetSessionLock()); + + record.header = { + ROCPROFILER_COUNTERS_SAMPLER_RECORD, + { tool->GetUniqueRecordId() } + }; + + // Add the record to the buffer(a deep-copy operation) along with + // a lambda function to deep-copy the record.counters member to + // the newly created buffer record + buffer->AddRecord( + record, record.counters, + (record.num_counters * (sizeof(rocprofiler_counters_sampler_counter_output_t) + 1)), + [](auto& buff_record, const void* data) { + buff_record.counters = const_cast( + static_cast(data)); + }); +} + +void CountersSampler::SamplerLoop() { + std::this_thread::sleep_until(std::chrono::steady_clock::now() + + std::chrono::milliseconds(params_.initial_delay)); + uint32_t elapsed = 0; + while (keep_running_ && (elapsed <= params_.sampling_duration)) { + auto next_tick = + std::chrono::steady_clock::now() + std::chrono::milliseconds(params_.sampling_rate); + + rocprofiler_record_counters_sampler_t record; + std::vector values; + for (auto& perfmon : perfmon_instances_){ + perfmon->Read(values); + } + record.counters = + static_cast( + malloc(values.size() * sizeof(rocprofiler_counters_sampler_counter_output_t))); + ::memcpy(record.counters, &(values)[0], + values.size() * sizeof(rocprofiler_counters_sampler_counter_output_t)); + record.num_counters = values.size(); + rocprofiler_counters_sampler_counter_output_t* record_counters = record.counters; + AddRecord(record); + free(record_counters); + + std::this_thread::sleep_until(next_tick); + elapsed += params_.sampling_rate; + } +} + +} // namespace rocmtools \ No newline at end of file diff --git a/src/core/session/counters_sampler.h b/src/core/session/counters_sampler.h new file mode 100644 index 0000000000..00e84200d0 --- /dev/null +++ b/src/core/session/counters_sampler.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2023 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef SRC_CORE_SESSION_COUNTERS_SAMPLER_H_ +#define SRC_CORE_SESSION_COUNTERS_SAMPLER_H_ + +#include "rocprofiler.h" +#include "src/core/counters/mmio/perfmon.h" +#include +#include +#include +#include + +namespace rocmtools { + +class CountersSampler { + public: + CountersSampler(rocprofiler_buffer_id_t buffer_id, rocprofiler_filter_id_t filter_id, + rocprofiler_session_id_t session_id); + ~CountersSampler(); + + CountersSampler(const CountersSampler&) = delete; + CountersSampler& operator=(const CountersSampler&) = delete; + + void Start(); + void Stop(); + void AddRecord(rocprofiler_record_counters_sampler_t& record); + + private: + void SamplerLoop(); + + rocprofiler_buffer_id_t buffer_id_; + rocprofiler_filter_id_t filter_id_; + rocprofiler_session_id_t session_id_; + bool pci_system_initialized_{false}; + rocprofiler_counters_sampler_parameters_t params_; + std::vector perfmon_instances_; + + std::atomic keep_running_{false}; + std::thread sampler_thread_; +}; + +} // namespace rocmtools + +#endif \ No newline at end of file diff --git a/src/core/session/filter.cpp b/src/core/session/filter.cpp index 9bb6559a41..919c5b44ba 100644 --- a/src/core/session/filter.cpp +++ b/src/core/session/filter.cpp @@ -67,10 +67,15 @@ Filter::Filter(rocprofiler_filter_id_t id, rocprofiler_filter_kind_t filter_kind } break; } + case ROCPROFILER_COUNTERS_SAMPLER: { + counters_sampler_parameters_ = filter_data.counters_sampler_parameters; + break; + } default: { warning( "Error: ROCProfiler filter specified is not supported for " "profiler mode!\n"); + break; } } } @@ -122,6 +127,16 @@ rocprofiler_spm_parameter_t* Filter::GetSpmParameterData() { "Error: ROCProfiler filter specified is not supported for " "SPM collection mode!\n"); } + +rocprofiler_counters_sampler_parameters_t Filter::GetCountersSamplerParameterData() { + if (kind_ == ROCPROFILER_COUNTERS_SAMPLER) { + return counters_sampler_parameters_; + } + fatal( + "Error: ROCMtools filter specified is not supported for " + "Counters sampler mode!\n"); +} + void Filter::SetProperty(rocprofiler_filter_property_t property) { switch (property.kind) { case ROCPROFILER_FILTER_HSA_TRACER_API_FUNCTIONS: { diff --git a/src/core/session/filter.h b/src/core/session/filter.h index 8f3621ffde..31a4be42cc 100644 --- a/src/core/session/filter.h +++ b/src/core/session/filter.h @@ -56,6 +56,7 @@ class Filter { rocprofiler_filter_property_kind_t kind); size_t GetPropertiesCount(rocprofiler_filter_property_kind_t kind); rocprofiler_spm_parameter_t* GetSpmParameterData(); + rocprofiler_counters_sampler_parameters_t GetCountersSamplerParameterData(); private: rocprofiler_filter_id_t id_; @@ -72,6 +73,7 @@ class Filter { std::vector tracer_apis_; // ROCTX/HIP/HSA API rocprofiler_spm_parameter_t* spm_parameter_; // spm parameter std::vector att_parameters_; // ATT Parameters + rocprofiler_counters_sampler_parameters_t counters_sampler_parameters_; // sampled counters parameters rocprofiler_sync_callback_t callback_; }; diff --git a/src/core/session/session.cpp b/src/core/session/session.cpp index e3d80a25a6..c27701ec61 100644 --- a/src/core/session/session.cpp +++ b/src/core/session/session.cpp @@ -148,6 +148,16 @@ void Session::Start() { pc_sampler_->Start(); } + if (FindFilterWithKind(ROCPROFILER_COUNTERS_SAMPLER)) { + if (!counters_sampler_started_.load(std::memory_order_release)) { + counters_sampler_ = new CountersSampler( + GetFilter(GetFilterIdWithKind(ROCPROFILER_COUNTERS_SAMPLER))->GetBufferId(), + GetFilter(GetFilterIdWithKind(ROCPROFILER_COUNTERS_SAMPLER))->GetId(), session_id_); + counters_sampler_started_.exchange(true, std::memory_order_release); + } + counters_sampler_->Start(); + } + is_active_ = true; if (FindFilterWithKind(ROCPROFILER_SPM_COLLECTION)) startSpm(); } @@ -179,6 +189,14 @@ void Session::Terminate() { } } + if (FindFilterWithKind(ROCPROFILER_COUNTERS_SAMPLER)) { + if (counters_sampler_started_.load(std::memory_order_release)) { + counters_sampler_->Stop(); + delete counters_sampler_; + counters_sampler_started_.exchange(false, std::memory_order_release); + } + } + is_active_ = false; } } @@ -191,6 +209,7 @@ att::AttTracer* Session::GetAttTracer() { return att_tracer_; } tracer::Tracer* Session::GetTracer() { return tracer_; } spm::SpmCounters* Session::GetSpmCounter() { return spmcounter_; } pc_sampler::PCSampler* Session::GetPCSampler() { return pc_sampler_; } +CountersSampler* Session::GetCountersSampler() { return counters_sampler_; } rocprofiler_filter_id_t Session::CreateFilter(rocprofiler_filter_kind_t filter_kind, rocprofiler_filter_data_t filter_data, diff --git a/src/core/session/session.h b/src/core/session/session.h index 561e16a4a2..dcba49b3c8 100644 --- a/src/core/session/session.h +++ b/src/core/session/session.h @@ -40,6 +40,7 @@ #include "att/att.h" #include "spm/spm.h" #include "src/pcsampler/session/pc_sampler.h" +#include "counters_sampler.h" #define ASSERTM(exp, msg) assert(((void)msg, exp)) @@ -62,6 +63,7 @@ class Session { att::AttTracer* GetAttTracer(); spm::SpmCounters* GetSpmCounter(); pc_sampler::PCSampler* GetPCSampler(); + CountersSampler* GetCountersSampler(); // Filter rocprofiler_filter_id_t CreateFilter(rocprofiler_filter_kind_t filter_kind, @@ -117,6 +119,9 @@ class Session { std::atomic pc_sampler_started_{false}; pc_sampler::PCSampler* pc_sampler_; + std::atomic counters_sampler_started_{false}; + CountersSampler* counters_sampler_; + std::atomic buffers_counter_{1}; std::mutex buffers_lock_; std::map buffers_; diff --git a/tests/unittests/core/CMakeLists.txt b/tests/unittests/core/CMakeLists.txt index 5efc625421..22374d5daf 100644 --- a/tests/unittests/core/CMakeLists.txt +++ b/tests/unittests/core/CMakeLists.txt @@ -23,6 +23,7 @@ set(CORE_SESSION_DIR ${PROJECT_SOURCE_DIR}/src/core/session) file(GLOB CORE_SESSION_SRC_FILES ${CORE_SESSION_DIR}/session.cpp) file(GLOB CORE_FILTER_SRC_FILES ${CORE_SESSION_DIR}/filter.cpp) file(GLOB CORE_DEVICE_PROFILING_SRC_FILES ${CORE_SESSION_DIR}/device_profiling.cpp) +file(GLOB CORE_COUNTERS_SAMPLER_SRC_FILES ${CORE_SESSION_DIR}/counters_sampler.cpp) set(CORE_HW_DIR ${PROJECT_SOURCE_DIR}/src/core/hardware) file(GLOB CORE_HW_SRC_FILES ${CORE_HW_DIR}/hsa_info.cpp) @@ -57,6 +58,7 @@ file(GLOB CORE_HSA_QUEUES_SRC_FILES ${CORE_HSA_QUEUES_DIR}/*.cpp) file(GLOB CORE_COUNTERS_PARENT_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/counters/*.cpp) file(GLOB CORE_COUNTERS_METRICS_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/counters/metrics/*.cpp) +file(GLOB CORE_COUNTERS_MMIO_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/counters/mmio/*.cpp) set(CORE_PC_SAMPLING_DIR ${PROJECT_SOURCE_DIR}/src/pcsampler) file(GLOB CORE_PC_SAMPLING_FILES ${CORE_PC_SAMPLING_DIR}/core/*.cpp ${CORE_PC_SAMPLING_DIR}/gfxip/*.cpp ${CORE_PC_SAMPLING_DIR}/session/*.cpp) @@ -70,6 +72,7 @@ add_executable(runCoreUnitTests ${CMAKE_CURRENT_SOURCE_DIR}/gtests_main.cpp ${CORE_SESSION_SRC_FILES} ${CORE_FILTER_SRC_FILES} ${CORE_DEVICE_PROFILING_SRC_FILES} + ${CORE_COUNTERS_SAMPLER_SRC_FILES} ${CORE_HW_SRC_FILES} ${CORE_UTILS_SRC_FILES} ${ROCPROFILER_SRC_FILES} @@ -81,6 +84,7 @@ add_executable(runCoreUnitTests ${CMAKE_CURRENT_SOURCE_DIR}/gtests_main.cpp ${ROCPROFILER_TRACER_SRC_FILES} ${ROCPROFILER_ROCTRACER_SRC_FILES} ${CORE_COUNTERS_METRICS_SRC_FILES} + ${CORE_COUNTERS_MMIO_SRC_FILES} ${CORE_COUNTERS_PARENT_SRC_FILES} ${CORE_PC_SAMPLING_FILES} ${OLD_LIB_SRC}) diff --git a/tests/unittests/profiler/CMakeLists.txt b/tests/unittests/profiler/CMakeLists.txt index 7e08ace63e..909585d3e1 100644 --- a/tests/unittests/profiler/CMakeLists.txt +++ b/tests/unittests/profiler/CMakeLists.txt @@ -13,6 +13,7 @@ set(CORE_SESSION_DIR ${PROJECT_SOURCE_DIR}/src/core/session) file(GLOB CORE_SESSION_SRC_FILES ${CORE_SESSION_DIR}/session.cpp) file(GLOB CORE_FILTER_SRC_FILES ${CORE_SESSION_DIR}/filter.cpp) file(GLOB CORE_DEVICE_PROFILING_SRC_FILES ${CORE_SESSION_DIR}/device_profiling.cpp) +file(GLOB CORE_COUNTERS_SAMPLER_SRC_FILES ${CORE_SESSION_DIR}/counters_sampler.cpp) set(CORE_HW_DIR ${PROJECT_SOURCE_DIR}/src/core/hardware) file(GLOB CORE_HW_SRC_FILES ${CORE_HW_DIR}/hsa_info.cpp) @@ -52,6 +53,7 @@ file(GLOB ROCPROFILER_TOOL_SRC_FILES ${PROJECT_SOURCE_DIR}/src/rocmtools/tools/t file(GLOB CORE_COUNTERS_PARENT_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/counters/*.cpp) file(GLOB CORE_COUNTERS_METRICS_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/counters/metrics/*.cpp) +file(GLOB CORE_COUNTERS_MMIO_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/counters/mmio/*.cpp) add_executable(runProfilerUnitTests ${CMAKE_CURRENT_SOURCE_DIR}/tools/tool_gtest.cpp ${CMAKE_CURRENT_SOURCE_DIR}/api/rocmtool_gtest.cpp @@ -59,6 +61,7 @@ add_executable(runProfilerUnitTests ${CMAKE_CURRENT_SOURCE_DIR}/tools/tool_gtest ${CORE_SESSION_SRC_FILES} ${CORE_FILTER_SRC_FILES} ${CORE_DEVICE_PROFILING_SRC_FILES} + ${CORE_COUNTERS_SAMPLER_SRC_FILES} ${CORE_HW_SRC_FILES} ${CORE_UTILS_SRC_FILES} ${ROCPROFILER_SPM_SRC_FILES} @@ -70,6 +73,7 @@ add_executable(runProfilerUnitTests ${CMAKE_CURRENT_SOURCE_DIR}/tools/tool_gtest ${ROCPROFILER_TRACER_SRC_FILES} ${ROCPROFILER_ROCTRACER_SRC_FILES} ${CORE_COUNTERS_METRICS_SRC_FILES} + ${CORE_COUNTERS_MMIO_SRC_FILES} ${CORE_COUNTERS_PARENT_SRC_FILES} ${CORE_PC_SAMPLING_FILES})