From 59cc2a382c14056c6e8a390193ca7a32100a18d8 Mon Sep 17 00:00:00 2001 From: Benjamin Welton Date: Tue, 26 Sep 2023 11:50:57 -0700 Subject: [PATCH] Migrate XML counter defs and reader from v1/v2 (#25) * Migrate XML counter defs and reader from v1/v2 * working set * more fixes * Update CMakeLists.txt * source formatting (clang-format v11) (#83) Co-authored-by: bwelton * Update source/lib/rocprofiler/counters/CMakeLists.txt --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: bwelton Co-authored-by: Jonathan R. Madsen --- source/lib/common/CMakeLists.txt | 2 +- source/lib/common/xml.hpp | 606 ++++++++++++++ source/lib/rocprofiler/CMakeLists.txt | 5 +- .../lib/rocprofiler/counters/CMakeLists.txt | 11 + source/lib/rocprofiler/counters/metrics.cpp | 118 +++ source/lib/rocprofiler/counters/metrics.hpp | 101 +++ .../rocprofiler/counters/tests/CMakeLists.txt | 24 + .../counters/tests/metrics_test.cpp | 42 + .../rocprofiler/counters/tests/metrics_test.h | 323 ++++++++ .../rocprofiler/counters/xml/CMakeLists.txt | 3 + .../counters/xml/basic_counters.xml | 744 ++++++++++++++++++ .../counters/xml/derived_counters.xml | 585 ++++++++++++++ 12 files changed, 2560 insertions(+), 4 deletions(-) create mode 100644 source/lib/common/xml.hpp create mode 100644 source/lib/rocprofiler/counters/CMakeLists.txt create mode 100644 source/lib/rocprofiler/counters/metrics.cpp create mode 100644 source/lib/rocprofiler/counters/metrics.hpp create mode 100644 source/lib/rocprofiler/counters/tests/CMakeLists.txt create mode 100644 source/lib/rocprofiler/counters/tests/metrics_test.cpp create mode 100644 source/lib/rocprofiler/counters/tests/metrics_test.h create mode 100644 source/lib/rocprofiler/counters/xml/CMakeLists.txt create mode 100755 source/lib/rocprofiler/counters/xml/basic_counters.xml create mode 100755 source/lib/rocprofiler/counters/xml/derived_counters.xml diff --git a/source/lib/common/CMakeLists.txt b/source/lib/common/CMakeLists.txt index f67ca666fb..ac70fa43e2 100644 --- a/source/lib/common/CMakeLists.txt +++ b/source/lib/common/CMakeLists.txt @@ -5,7 +5,7 @@ rocprofiler_activate_clang_tidy() set(common_sources config.cpp environment.cpp demangle.cpp) set(common_headers config.hpp defines.hpp environment.hpp demangle.hpp mpl.hpp - utility.hpp) + utility.hpp xml.hpp) add_library(rocprofiler-common-library STATIC) add_library(rocprofiler::rocprofiler-common-library ALIAS rocprofiler-common-library) diff --git a/source/lib/common/xml.hpp b/source/lib/common/xml.hpp new file mode 100644 index 0000000000..d07e371f70 --- /dev/null +++ b/source/lib/common/xml.hpp @@ -0,0 +1,606 @@ +/****************************************************************************** +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace xml +{ +class Xml +{ +public: + using token_t = std::vector; + + struct level_t; + using node_vect_t = std::vector>; + using node_list_t = std::list>; + + using nodes_t = node_vect_t; + using opts_t = std::map; + struct level_t + { + std::string tag; + nodes_t nodes; + opts_t opts; + std::shared_ptr copy; + }; + using nodes_vec_t = std::vector>; + using map_t = std::map; + + enum + { + DECL_STATE, + BODY_STATE + }; + + static std::shared_ptr Create(const std::string& file_name, const Xml* obj = nullptr) + { + auto xml = std::make_shared(file_name, obj); + if(xml != nullptr) + { + if(xml->Init() != false) + { + const std::size_t pos = file_name.rfind('/'); + const std::string path = + (pos != std::string::npos) ? file_name.substr(0, pos + 1) : ""; + + xml->PreProcess(); + nodes_t incl_nodes; + for(const auto& node : xml->GetNodes("top.include")) + { + if(node->opts.find("touch") == node->opts.end()) + { + node->opts["touch"] = ""; + incl_nodes.push_back(node); + } + } + for(const auto& incl : incl_nodes) + { + const std::string& incl_name = path + incl->opts["file"]; + auto ixml = Create(incl_name, xml.get()); + if(!ixml) + { + xml.reset(); + break; + } + } + if(xml) + { + xml->Process(); + } + } + } + + return xml; + } + + std::string GetName() { return file_name_; } + + // clang-tidy incorrectly marks these functions as being staticable. They are not. + // NOLINTBEGIN + void AddExpr(const std::string& full_tag, const std::string& name, const std::string& expr) + { + const std::size_t pos = full_tag.rfind('.'); + const std::size_t pos1 = (pos == std::string::npos) ? 0 : pos + 1; + const std::string level_tag = full_tag.substr(pos1); + auto level = std::make_shared(); + (*map_)[full_tag].push_back(level); + level->tag = level_tag; + level->opts["name"] = name; + level->opts["expr"] = expr; + } + + void AddConst(const std::string& full_tag, const std::string& name, const uint64_t& val) + { + std::ostringstream oss; + oss << val; + AddExpr(full_tag, name, oss.str()); + } + // NOLINTEND + + nodes_t GetNodes(const std::string& global_tag) { return (*map_)[global_tag]; } + const map_t& GetAllNodes() { return (*map_); } + + template + F ForEach(const F& f_i) + { + F f = f_i; + if(map_) + { + for(auto& entry : *map_) + { + for(auto node : entry.second) + { + if(f.fun(entry.first, node) == false) break; + } + } + } + return f; + } + + template + F ForEach(const F& f_i) const + { + F f = f_i; + if(map_) + { + for(auto& entry : *map_) + { + for(const auto& node : entry.second) + { + if(f.fun(entry.first, node) == false) break; + } + } + } + return f; + } + + struct print_func + { + static bool fun(const std::string& global_tag, const std::shared_ptr& node) + { + std::cout << global_tag << ":" << std::endl; + for(auto& opt : node->opts) + { + std::cout << global_tag << "." << opt.first << " = " << opt.second << std::endl; + } + return true; + } + }; + + void Print() const + { + std::cout << "XML file '" << file_name_ << "':" << std::endl; + ForEach(print_func()); + } + + Xml(std::string file_name, const Xml* obj) + : file_name_(std::move(file_name)) + , state_(BODY_STATE) + { + if(obj != nullptr) + { + map_ = obj->map_; + level_ = obj->level_; + included_ = true; + } + } + + ~Xml() + { + for(auto& x : stack_) + { + x->nodes.clear(); + x->copy.reset(); + } + if(!map_) return; + for(auto& [_, nodes] : *map_) + { + for(auto& node : nodes) + { + node->nodes.clear(); + node->copy.reset(); + } + } + } + +private: + bool Init() + { + fd_ = open(file_name_.c_str(), O_RDONLY); + if(fd_ == -1) + { + // perror((std::string("open XML file ") + file_name_).c_str()); + return false; + } + + if(map_ == nullptr) + { + map_ = std::make_unique(); + AddLevel("top"); + } + + return true; + } + + void PreProcess() + { + uint32_t ind = 0; + char buf[kBufSize]; + bool error = false; + + while(true) + { + const uint32_t pos = lseek(fd_, 0, SEEK_CUR); + uint32_t size = read(fd_, buf, kBufSize); + if(size <= 0) break; + buf[size - 1] = '\0'; + + if(strncmp(buf, "#include \"", 10) == 0) + { + for(ind = 0; (ind < size) && (buf[ind] != '\n'); ++ind) + {} + if(ind < size) + { + buf[ind] = '\0'; + size = ind; + lseek(fd_, pos + ind + 1, SEEK_SET); + } + + for(ind = 10; (ind < size) && (buf[ind] != '"'); ++ind) + {} + if(ind == size) + { + error = true; + break; + } + buf[ind] = '\0'; + + AddLevel("include"); + AddOption("file", &buf[10]); + UpLevel(); + } + } + + if(error) + { + fprintf(stderr, "XML PreProcess failed, line '%s'\n", buf); + abort(); + } + + lseek(fd_, 0, SEEK_SET); + } + + void Process() + { + token_t remainder; + + while(true) + { + token_t token = (!remainder.empty()) ? remainder : NextToken(); + remainder.clear(); + + // token_t token1 = token; + // token1.push_back('\0'); + // std::cout << ">>> " << &token1[0] << std::endl; + + // End of file + if(token.empty()) break; + + switch(state_) + { + case BODY_STATE: + if(token[0] == '<') + { + bool node_begin = true; + unsigned ind = 1; + if(token[1] == '/') + { + node_begin = false; + ++ind; + } + + unsigned i = ind; + while(i < token.size()) + { + if(token[i] == '>') break; + ++i; + } + for(unsigned j = i + 1; j < token.size(); ++j) + remainder.push_back(token[j]); + + if(i == token.size()) + { + if(node_begin) + state_ = DECL_STATE; + else + BadFormat(token); + token.push_back('\0'); + } + else + { + token[i] = '\0'; + } + + const char* tag = &token[ind]; + if(node_begin) + { + AddLevel(tag); + } + else + { + Inherit(GetOption("base")); + + if(strncmp(CurrentLevel().c_str(), tag, strlen(tag)) != 0) + { + token.back() = '>'; + BadFormat(token); + } + UpLevel(); + } + } + else + { + BadFormat(token); + } + break; + case DECL_STATE: + if(token[0] == '>') + { + state_ = BODY_STATE; + for(unsigned j = 1; j < token.size(); ++j) + remainder.push_back(token[j]); + continue; + } + else + { + token.push_back('\0'); + unsigned j = 0; + for(j = 0; j < token.size(); ++j) + if(token[j] == '=') break; + if(j == token.size()) BadFormat(token); + token[j] = '\0'; + const std::string key = token.data(); + const std::string value = &token[j + 1]; + AddOption(key, value); + } + break; + default: + std::cout << "XML parser error: wrong state: " << state_ << std::endl; + abort(); + } + } + } + + bool SpaceCheck() const + { + bool cond = ((buffer_[index_] == ' ') || (buffer_[index_] == '\t')); + return cond; + } + + bool LineEndCheck() + { + bool found = false; + if(buffer_[index_] == '\n') + { + buffer_[index_] = ' '; + ++file_line_; + found = true; + comment_ = false; + } + else if(comment_ || (buffer_[index_] == '#')) + { + found = true; + comment_ = true; + } + return found; + } + + token_t NextToken() + { + token_t token; + bool in_string = false; + bool special_symb = false; + + while(true) + { + if(data_size_ == 0) + { + data_size_ = read(fd_, buffer_, kBufSize); + if(data_size_ <= 0) break; + } + + if(token.empty()) + { + while((index_ < data_size_) && (SpaceCheck() || LineEndCheck())) + { + ++index_; + } + } + while((index_ < data_size_) && (in_string || !(SpaceCheck() || LineEndCheck()))) + { + const char symb = buffer_[index_]; + bool skip_symb = false; + + switch(symb) + { + case '\\': + if(special_symb) + { + special_symb = false; + } + else + { + special_symb = true; + skip_symb = true; + } + break; + case '"': + if(special_symb) + { + special_symb = false; + } + else + { + in_string = !in_string; + if(!in_string) + { + buffer_[index_] = ' '; + --index_; + } + skip_symb = true; + } + break; + } + + if(!skip_symb) token.push_back(symb); + ++index_; + } + + if(index_ == data_size_) + { + index_ = 0; + data_size_ = 0; + } + else + { + if(special_symb || in_string) BadFormat(token); + break; + } + } + + return token; + } + + void BadFormat(token_t token) + { + token.push_back('\0'); + std::cout << "Error: " << file_name_ << ", line " << file_line_ << ", bad XML token '" + << token.data() << "'" << std::endl; + abort(); + } + + void AddLevel(const std::string& tag) + { + auto level = std::make_shared(); + level->tag = tag; + if(level_) + { + level_->nodes.push_back(level); + stack_.push_back(level_); + } + level_ = level; + + std::string global_tag = GlobalTag(tag); + (*map_)[global_tag].push_back(level_); + } + + void UpLevel() + { + level_ = stack_.back(); + stack_.pop_back(); + } + + void Copy(const std::shared_ptr& from, const std::shared_ptr& to) + { + auto level = to; + if(level == nullptr) + { + AddLevel(from->tag); + level = level_; + } + level->copy = from; + level->opts = from->opts; + + for(const auto& node : from->nodes) + { + bool found = false; + const std::string name = GetOption("name", node); + const std::string global_tag = GlobalTag(level->tag) + "." + node->tag; + for(const auto& item : (*map_)[global_tag]) + { + if((name == GetOption("name", item)) || (node == item->copy)) + { + found = true; + break; + } + } + if(found == false) Copy(node, nullptr); + } + + if(to == nullptr) UpLevel(); + } + + void Inherit(const std::string& tag) + { + if(!tag.empty()) + { + const std::string global_tag = GlobalTag(tag); + auto it = map_->find(global_tag); + if(it == map_->end()) + { + fprintf(stderr, + "Node \"%s\": Base not found \"%s\"\n", + level_->tag.c_str(), + tag.c_str()); + abort(); + } + for(const auto& node : it->second) + { + Copy(node, level_); + } + } + } + + std::string CurrentLevel() const { return level_->tag; } + + std::string GlobalTag(const std::string& tag) const + { + std::string global_tag; + for(const auto& level : stack_) + { + global_tag += level->tag + "."; + } + global_tag += tag; + return global_tag; + } + + void AddOption(const std::string& key, const std::string& value) { level_->opts[key] = value; } + std::string GetOption(const std::string& key, std::shared_ptr level = nullptr) + { + level = (level != nullptr) ? level : level_; + auto it = level->opts.find(key); + return (it != level->opts.end()) ? it->second : ""; + } + + const std::string file_name_; + unsigned file_line_{0}; + int fd_; + + static const size_t kBufSize = 256; + char buffer_[kBufSize]; + + unsigned data_size_{0}; + unsigned index_{0}; + unsigned state_{0}; + bool comment_{false}; + std::vector> stack_; + bool included_{false}; + std::shared_ptr level_; + std::shared_ptr map_; +}; + +} // namespace xml diff --git a/source/lib/rocprofiler/CMakeLists.txt b/source/lib/rocprofiler/CMakeLists.txt index cd4411eaf0..e8decabe10 100644 --- a/source/lib/rocprofiler/CMakeLists.txt +++ b/source/lib/rocprofiler/CMakeLists.txt @@ -30,6 +30,7 @@ target_sources(rocprofiler-object-library PRIVATE ${ROCPROFILER_LIB_SOURCES} add_subdirectory(hsa) add_subdirectory(context) +add_subdirectory(counters) target_link_libraries( rocprofiler-object-library @@ -96,9 +97,7 @@ target_link_libraries( rocprofiler-static-library PUBLIC rocprofiler::rocprofiler-headers rocprofiler::rocprofiler-hsa-runtime rocprofiler::rocprofiler-hip - PRIVATE rocprofiler::rocprofiler-build-flags rocprofiler::rocprofiler-memcheck - rocprofiler::rocprofiler-common-library rocprofiler::rocprofiler-stdcxxfs - rocprofiler::rocprofiler-dl rocprofiler::rocprofiler-amd-comgr) + PRIVATE rocprofiler::rocprofiler-common-library) set_target_properties( rocprofiler-static-library PROPERTIES OUTPUT_NAME rocprofiler64 DEFINE_SYMBOL diff --git a/source/lib/rocprofiler/counters/CMakeLists.txt b/source/lib/rocprofiler/counters/CMakeLists.txt new file mode 100644 index 0000000000..6769637755 --- /dev/null +++ b/source/lib/rocprofiler/counters/CMakeLists.txt @@ -0,0 +1,11 @@ +set(ROCPROFILER_LIB_COUNTERS_SOURCES metrics.cpp) +set(ROCPROFILER_LIB_COUNTERS_HEADERS metrics.hpp) + +target_sources(rocprofiler-object-library PRIVATE ${ROCPROFILER_LIB_COUNTERS_SOURCES} + ${ROCPROFILER_LIB_COUNTERS_HEADERS}) + +add_subdirectory(xml) + +if(ROCPROFILER_BUILD_TESTS) + add_subdirectory(tests) +endif() diff --git a/source/lib/rocprofiler/counters/metrics.cpp b/source/lib/rocprofiler/counters/metrics.cpp new file mode 100644 index 0000000000..29ab5f390f --- /dev/null +++ b/source/lib/rocprofiler/counters/metrics.cpp @@ -0,0 +1,118 @@ +/****************************************************************************** +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "metrics.hpp" + +#include // for dladdr +#include +#include +#include + +#include "glog/logging.h" +#include "lib/common/xml.hpp" +#include "rocprofiler/rocprofiler.h" + +namespace counters +{ +namespace +{ +MetricMap +loadXml(const std::string& filename) +{ + MetricMap ret; + DLOG(INFO) << "Loading Counter Config: " << filename; + // todo: return unique_ptr.... + auto xml = xml::Xml::Create(filename); + LOG_IF(FATAL, !xml) + << "Could not open XML Counter Config File (set env ROCPROFILER_METRICS_PATH)"; + + for(const auto& [gfx_name, nodes] : xml->GetAllNodes()) + { + /** + * "top." is used to designate the root encapsulation of all contained XML subroots (in our + * case "gfxX"). This is inserted by the parser so it will always be present. .metric + * denotes XML tags that are contained in the subroots. This will not change unless we + * respec the XML (which we should...). + */ + if(gfx_name.find("metric") == std::string::npos || + gfx_name.find("top.") == std::string::npos) + continue; + + auto& metricVec = + ret.emplace(gfx_name.substr(strlen("top."), + gfx_name.size() - strlen("top.") - strlen(".metric")), + std::vector()) + .first->second; + for(const auto& node : nodes) + { + metricVec.emplace_back(node->opts["name"], + node->opts["block"], + node->opts["event"], + node->opts["descr"], + node->opts["expr"]); + } + } + + DLOG(INFO) << fmt::format("{}", ret); + return ret; +} + +std::string +findViaInstallPath(const std::string& filename) +{ + Dl_info dl_info; + DLOG(INFO) << filename << " is being looked up via install path"; + if(dladdr(reinterpret_cast(rocprofiler_query_available_agents), &dl_info) != 0) + { + return std::filesystem::path{dl_info.dli_fname}.remove_filename() / + fmt::format("../lib/{}", filename); + } + return filename; +} + +std::string +findViaEnvironment(const std::string& filename) +{ + if(getenv("ROCPROFILER_METRICS_PATH")) + { + DLOG(INFO) << filename << " is being looked up via env variable ROCPROFILER_METRICS_PATH"; + return std::filesystem::path{std::string(getenv("ROCPROFILER_METRICS_PATH"))} / filename; + } + // No environment variable, lookup via install path + return findViaInstallPath(filename); +} + +} // namespace + +MetricMap +getDerrivedHardwareMetrics() +{ + return loadXml(findViaEnvironment("derived_counters.xml")); +} + +MetricMap +getBaseHardwareMetrics() +{ + return loadXml(findViaEnvironment("basic_counters.xml")); +} + +}; // namespace counters diff --git a/source/lib/rocprofiler/counters/metrics.hpp b/source/lib/rocprofiler/counters/metrics.hpp new file mode 100644 index 0000000000..c4385c7c79 --- /dev/null +++ b/source/lib/rocprofiler/counters/metrics.hpp @@ -0,0 +1,101 @@ +#pragma once + +#include +#include +#include +#include + +#include + +#include "fmt/core.h" +#include "fmt/ranges.h" + +namespace counters +{ +// Base metrics (w/o instance information) defined in gfx_metrics/derrived.xml +class Metric +{ +public: + Metric(std::string name, + std::string block, + std::string event, + std::string dsc, + std::string expr) + : name_(std::move(name)) + , block_(std::move(block)) + , event_(std::move(event)) + , description_(std::move(dsc)) + , expression_(std::move(expr)) + {} + + const std::string& name() const { return name_; } + const std::string& block() const { return block_; } + const std::string& event() const { return event_; } + const std::string& description() const { return description_; } + const std::string& expression() const { return expression_; } + +private: + std::string name_; + std::string block_; + std::string event_; + std::string description_; + std::string expression_; +}; + +using MetricMap = std::unordered_map>; + +MetricMap +getBaseHardwareMetrics(); + +MetricMap +getDerrivedHardwareMetrics(); + +} // namespace counters + +namespace fmt +{ +// fmt::format support for metric +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + auto format(counters::Metric const& metric, Ctx& ctx) const + { + return fmt::format_to(ctx.out(), + "Metric: {} [Block: {}, Event: {}, Expression: {}, Description: {}]", + metric.name(), + metric.block(), + metric.event(), + metric.expression().empty() ? "" : metric.expression(), + metric.description()); + } +}; + +// fmt::format support for MetricMap +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + auto format(counters::MetricMap const& map, Ctx& ctx) const + { + std::string out; + for(const auto& [gfxName, counters] : map) + { + out += fmt::format("Counters for {}\n\t{}\n", gfxName, fmt::join(counters, "\n\t")); + } + return fmt::format_to(ctx.out(), "{}", out); + } +}; +} // namespace fmt \ No newline at end of file diff --git a/source/lib/rocprofiler/counters/tests/CMakeLists.txt b/source/lib/rocprofiler/counters/tests/CMakeLists.txt new file mode 100644 index 0000000000..6a6f87a24c --- /dev/null +++ b/source/lib/rocprofiler/counters/tests/CMakeLists.txt @@ -0,0 +1,24 @@ +rocprofiler_deactivate_clang_tidy() + +include(GoogleTest) + +set(ROCPROFILER_LIB_COUNTER_TEST_SOURCES "metrics_test.cpp") + +add_executable(counter-test) + +target_sources( + counter-test PRIVATE ${ROCPROFILER_LIB_COUNTER_TEST_SOURCES} + $) + +target_link_libraries( + counter-test + PRIVATE rocprofiler::rocprofiler-hip rocprofiler::rocprofiler-common-library + rocprofiler::rocprofiler-object-library GTest::gtest GTest::gtest_main) + +gtest_add_tests( + TARGET counter-test + SOURCES ${ROCPROFILER_LIB_COUNTER_TEST_SOURCES} + TEST_LIST counter-test_TESTS + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + +set_tests_properties(${counter-tests_TESTS} PROPERTIES TIMEOUT 45 LABELS "unittests") diff --git a/source/lib/rocprofiler/counters/tests/metrics_test.cpp b/source/lib/rocprofiler/counters/tests/metrics_test.cpp new file mode 100644 index 0000000000..e46ee27694 --- /dev/null +++ b/source/lib/rocprofiler/counters/tests/metrics_test.cpp @@ -0,0 +1,42 @@ +#include "metrics_test.h" + +#include + +#include "lib/rocprofiler/counters/metrics.hpp" + +namespace +{ +auto +loadTestData(std::unordered_map>> map) +{ + std::unordered_map> ret; + for(auto& [gfx, dataMap] : map) + { + auto& metric_vec = ret.emplace(gfx, std::vector{}).first->second; + for(auto& data_vec : dataMap) + { + metric_vec.emplace_back( + data_vec.at(0), data_vec.at(1), data_vec.at(2), data_vec.at(4), data_vec.at(3)); + } + } + return ret; +} +} // namespace + +TEST(MetricsTest, BaseMetricLoad) +{ + auto x = counters::getBaseHardwareMetrics(); + auto test_data = loadTestData(basic_gfx908); + ASSERT_EQ(x.count("gfx908"), 1); + ASSERT_EQ(test_data.count("gfx908"), 1); + EXPECT_EQ(fmt::format("{}", x["gfx908"]), fmt::format("{}", test_data["gfx908"])); +} + +TEST(MetricsTest, DerrivedMetricLoad) +{ + auto x = counters::getDerrivedHardwareMetrics(); + auto test_data = loadTestData(derrived_gfx908); + ASSERT_EQ(x.count("gfx908"), 1); + ASSERT_EQ(test_data.count("gfx908"), 1); + EXPECT_EQ(fmt::format("{}", x["gfx908"]), fmt::format("{}", test_data["gfx908"])); +} \ No newline at end of file diff --git a/source/lib/rocprofiler/counters/tests/metrics_test.h b/source/lib/rocprofiler/counters/tests/metrics_test.h new file mode 100644 index 0000000000..a29e7fcca7 --- /dev/null +++ b/source/lib/rocprofiler/counters/tests/metrics_test.h @@ -0,0 +1,323 @@ +#pragma once + +#include +#include +#include + +// Expected values for GFX908. GFX908 was chosen because it is not the first +// arch defined in the XML and it is also an arch that inherits values (from gfx9) +// Layout is: {name, block, event, expression, description}. +static const std::unordered_map>> basic_gfx908 = { + {"gfx908", + {{"SQ_INSTS_VMEM_WR", + "SQ", + "28", + "", + "Number of VMEM write instructions issued (including FLAT). (per-simd, emulated)"}, + {"SQ_INSTS_VMEM_RD", + "SQ", + "29", + "", + "Number of VMEM read instructions issued (including FLAT). (per-simd, emulated)"}, + {"SQ_INSTS_SALU", + "SQ", + "31", + "", + "Number of SALU instructions issued. (per-simd, emulated)"}, + {"SQ_INSTS_SMEM", + "SQ", + "32", + "", + "Number of SMEM instructions issued. (per-simd, emulated)"}, + {"SQ_INSTS_FLAT", + "SQ", + "33", + "", + "Number of FLAT instructions issued. (per-simd, emulated)"}, + {"SQ_INSTS_FLAT_LDS_ONLY", + "SQ", + "34", + "", + "Number of FLAT instructions issued that read/wrote only from/to LDS (only works if " + "EARLY_TA_DONE is enabled). (per-simd, emulated)"}, + {"SQ_INSTS_LDS", + "SQ", + "35", + "", + "Number of LDS instructions issued (including FLAT). (per-simd, emulated)"}, + {"SQ_INSTS_GDS", + "SQ", + "36", + "", + "Number of GDS instructions issued. (per-simd, emulated)"}, + {"SQ_WAIT_INST_LDS", + "SQ", + "64", + "", + "Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. " + "(per-simd, nondeterministic)"}, + {"SQ_ACTIVE_INST_VALU", + "SQ", + "72", + "", + "regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. " + "(per-simd, nondeterministic). Units in quad-cycles(4 cycles)"}, + {"SQ_INST_CYCLES_SALU", + "SQ", + "85", + "", + "Number of cycles needed to execute non-memory read scalar operations. (per-simd, " + "emulated)"}, + {"SQ_THREAD_CYCLES_VALU", + "SQ", + "86", + "", + "Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but " + "multiplied by # of active threads). (per-simd)"}, + {"SQ_LDS_BANK_CONFLICT", + "SQ", + "94", + "", + "Number of cycles LDS is stalled by bank conflicts. (emulated)"}, + {"TCC_HIT", "TCC", "17", "", "Number of cache hits."}, + {"TCC_MISS", "TCC", "19", "", "Number of cache misses. UC reads count as misses."}, + {"TCC_EA_WRREQ", + "TCC", + "26", + "", + "Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. " + "Atomics may travel over the same interface and are generally classified as write requests. " + "This does not include probe commands."}, + {"TCC_EA_WRREQ_64B", + "TCC", + "27", + "", + "Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq " + "interface."}, + {"TCC_EA_WRREQ_STALL", + "TCC", + "30", + "", + "Number of cycles a write request was stalled."}, + {"TCC_EA_RDREQ", + "TCC", + "38", + "", + "Number of TCC/EA read requests (either 32-byte or 64-byte)"}, + {"TCC_EA_RDREQ_32B", "TCC", "39", "", "Number of 32-byte TCC/EA read requests"}, + {"GRBM_COUNT", "GRBM", "0", "", "Tie High - Count Number of Clocks"}, + {"GRBM_GUI_ACTIVE", "GRBM", "2", "", "The GUI is Active"}, + {"SQ_WAVES", + "SQ", + "4", + "", + "Count number of waves sent to SQs. (per-simd, emulated, global)"}, + {"SQ_INSTS_VALU", + "SQ", + "26", + "", + "Number of VALU instructions issued. (per-simd, emulated)"}, + {"TA_TA_BUSY", + "TA", + "15", + "", + "TA block is busy. Perf_Windowing not supported for this counter."}, + {"TA_FLAT_READ_WAVEFRONTS", + "TA", + "101", + "", + "Number of flat opcode reads processed by the TA."}, + {"TA_FLAT_WRITE_WAVEFRONTS", + "TA", + "102", + "", + "Number of flat opcode writes processed by the TA."}, + {"TCP_TCP_TA_DATA_STALL_CYCLES", + "TCP", + "6", + "", + "TCP stalls TA data interface. Now Windowed."}}}}; + +static const std::unordered_map>> + derrived_gfx908 = { + {"gfx908", + {{"TCC_HIT_sum", + "", + "", + "sum(TCC_HIT,32)", + "Number of cache hits. Sum over TCC instances."}, + {"TCC_MISS_sum", + "", + "", + "sum(TCC_MISS,32)", + "Number of cache misses. Sum over TCC instances."}, + {"TCC_EA_RDREQ_32B_sum", + "", + "", + "sum(TCC_EA_RDREQ_32B,32)", + "Number of 32-byte TCC/EA read requests. Sum over TCC instances."}, + {"TCC_EA_RDREQ_sum", + "", + "", + "sum(TCC_EA_RDREQ,32)", + "Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances."}, + {"TCC_EA_WRREQ_sum", + "", + "", + "sum(TCC_EA_WRREQ,32)", + "Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq " + "interface. Sum over TCC instances."}, + {"TCC_EA_WRREQ_64B_sum", + "", + "", + "sum(TCC_EA_WRREQ_64B,32)", + "Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq " + "interface. Sum over TCC instances."}, + {"TCC_WRREQ_STALL_max", + "", + "", + "max(TCC_EA_WRREQ_STALL,32)", + "Number of cycles a write request was stalled. Max over TCC instances."}, + {"CU_UTILIZATION", + "", + "", + "GRBM_GUI_ACTIVE/GRBM_COUNT", + "The total number of active cycles divided by total number of elapsed cycles"}, + {"KERNEL_DURATION", "", "", "1", "The duration of the kernel dispatch"}, + {"TA_BUSY_avr", + "", + "", + "avr(TA_TA_BUSY,16)", + "TA block is busy. Average over TA instances."}, + {"TA_BUSY_max", "", "", "max(TA_TA_BUSY,16)", "TA block is busy. Max over TA instances."}, + {"TA_BUSY_min", "", "", "min(TA_TA_BUSY,16)", "TA block is busy. Min over TA instances."}, + {"TA_FLAT_READ_WAVEFRONTS_sum", + "", + "", + "sum(TA_FLAT_READ_WAVEFRONTS,16)", + "Number of flat opcode reads processed by the TA. Sum over TA instances."}, + {"TA_FLAT_WRITE_WAVEFRONTS_sum", + "", + "", + "sum(TA_FLAT_WRITE_WAVEFRONTS,16)", + "Number of flat opcode writes processed by the TA. Sum over TA instances."}, + {"TCP_TCP_TA_DATA_STALL_CYCLES_sum", + "", + "", + "sum(TCP_TCP_TA_DATA_STALL_CYCLES,16)", + "Total number of TCP stalls TA data interface."}, + {"TCP_TCP_TA_DATA_STALL_CYCLES_max", + "", + "", + "max(TCP_TCP_TA_DATA_STALL_CYCLES,16)", + "Maximum number of TCP stalls TA data interface."}, + {"FETCH_SIZE", + "", + "", + "(TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64)/1024", + "The total kilobytes fetched from the video memory. This is measured with all extra " + "fetches and any cache or memory effects taken into account."}, + {"WRITE_SIZE", + "", + "", + "((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024", + "The total kilobytes written to the video memory. This is measured with all extra " + "fetches and any cache or memory effects taken into account."}, + {"WRITE_REQ_32B", + "", + "", + "TCC_EA_WRREQ_64B_sum*2+(TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)", + "The total number of 32-byte effective memory writes."}, + {"VFetchInsts", + "", + "", + "(SQ_INSTS_VMEM_RD-TA_FLAT_READ_WAVEFRONTS_sum)/SQ_WAVES", + "The average number of vector fetch instructions from the video memory executed per " + "work-item (affected by flow control). Excludes FLAT instructions that fetch from video " + "memory."}, + {"VWriteInsts", + "", + "", + "(SQ_INSTS_VMEM_WR-TA_FLAT_WRITE_WAVEFRONTS_sum)/SQ_WAVES", + "The average number of vector write instructions to the video memory executed per " + "work-item (affected by flow control). Excludes FLAT instructions that write to video " + "memory."}, + {"FlatVMemInsts", + "", + "", + "(SQ_INSTS_FLAT-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES", + "The average number of FLAT instructions that read from or write to the video memory " + "executed per work item (affected by flow control). Includes FLAT instructions that " + "read from or write to scratch."}, + {"LDSInsts", + "", + "", + "(SQ_INSTS_LDS-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES", + "The average number of LDS read or LDS write instructions executed per work item " + "(affected by flow control). Excludes FLAT instructions that read from or write to " + "LDS."}, + {"FlatLDSInsts", + "", + "", + "SQ_INSTS_FLAT_LDS_ONLY/SQ_WAVES", + "The average number of FLAT instructions that read or write to LDS executed per work " + "item (affected by flow control)."}, + {"VALUUtilization", + "", + "", + "100*SQ_THREAD_CYCLES_VALU/(SQ_ACTIVE_INST_VALU*MAX_WAVE_SIZE)", + "The percentage of active vector ALU threads in a wave. A lower number can mean either " + "more thread divergence in a wave or that the work-group size is not a multiple of 64. " + "Value range: 0\% (bad), 100\% (ideal - no thread divergence)."}, + {"VALUBusy", + "", + "", + "100*SQ_ACTIVE_INST_VALU*4/SIMD_NUM/GRBM_GUI_ACTIVE", + "The percentage of GPUTime vector ALU instructions are processed. Value range: 0\% " + "(bad) to 100\% (optimal)."}, + {"SALUBusy", + "", + "", + "100*SQ_INST_CYCLES_SALU*4/SIMD_NUM/GRBM_GUI_ACTIVE", + "The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) " + "to 100% (optimal)."}, + {"FetchSize", + "", + "", + "FETCH_SIZE", + "The total kilobytes fetched from the video memory. This is measured with all extra " + "fetches and any cache or memory effects taken into account."}, + {"WriteSize", + "", + "", + "WRITE_SIZE", + "The total kilobytes written to the video memory. This is measured with all extra " + "fetches and any cache or memory effects taken into account."}, + {"MemWrites32B", + "", + "", + "WRITE_REQ_32B", + "The total number of effective 32B write transactions to the memory"}, + {"L2CacheHit", + "", + "", + "100*sum(TCC_HIT,16)/(sum(TCC_HIT,16)+sum(TCC_MISS,16))", + "The percentage of fetch, write, atomic, and other instructions that hit the data in L2 " + "cache. Value range: 0\% (no hit) to 100\% (optimal)."}, + {"MemUnitStalled", + "", + "", + "100*TCP_TCP_TA_DATA_STALL_CYCLES_max/GRBM_GUI_ACTIVE/SE_NUM", + "The percentage of GPUTime the memory unit is stalled. Try reducing the number or size " + "of fetches and writes if possible. Value range: 0\% (optimal) to 100\% (bad)."}, + {"WriteUnitStalled", + "", + "", + "100*TCC_WRREQ_STALL_max/GRBM_GUI_ACTIVE", + "The percentage of GPUTime the Write unit is stalled. Value range: 0\% to 100\% (bad)."}, + {"LDSBankConflict", + "", + "", + "100*SQ_LDS_BANK_CONFLICT/GRBM_GUI_ACTIVE/CU_NUM", + "The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0\% (optimal) " + "to 100\% (bad)."}}}}; \ No newline at end of file diff --git a/source/lib/rocprofiler/counters/xml/CMakeLists.txt b/source/lib/rocprofiler/counters/xml/CMakeLists.txt new file mode 100644 index 0000000000..309987bf3c --- /dev/null +++ b/source/lib/rocprofiler/counters/xml/CMakeLists.txt @@ -0,0 +1,3 @@ +configure_file(basic_counters.xml ${PROJECT_BINARY_DIR}/lib/basic_counters.xml COPYONLY) +configure_file(derived_counters.xml ${PROJECT_BINARY_DIR}/lib/derived_counters.xml + COPYONLY) diff --git a/source/lib/rocprofiler/counters/xml/basic_counters.xml b/source/lib/rocprofiler/counters/xml/basic_counters.xml new file mode 100755 index 0000000000..3ffb3191c4 --- /dev/null +++ b/source/lib/rocprofiler/counters/xml/basic_counters.xml @@ -0,0 +1,744 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + # EA1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/source/lib/rocprofiler/counters/xml/derived_counters.xml b/source/lib/rocprofiler/counters/xml/derived_counters.xml new file mode 100755 index 0000000000..05fd8c6603 --- /dev/null +++ b/source/lib/rocprofiler/counters/xml/derived_counters.xml @@ -0,0 +1,585 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + # LDSBankConflict The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + # LDSBankConflict The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). + + + + + # EA1 + + + + + + + + + + # both EA0 and EA1 should be included + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## IP Block Utilization Metrics + + + + + + + ## Instruction Fetch Metrics + + ## Wavefront Metrics + + + + + + ## Compute Unit Metrics + + + + + + + + + + + + + + ## Local Data Share (LDS) Metrics + + + + + ## L1I and sL1D Cache Metrics + + + ## vL1D Cache Metrics + + + + + + + + + + + + + ## L2 Cache Metrics + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# Vega20 + +# Arcturus + +# Aldebaran + +#Mi300 + + + +#Navi21 + + + +#Navi31 + + + + + + # GPUBusy The percentage of time GPU was busy. + + + # Wavefronts Total wavefronts. + + + # VALUInsts The average number of vector ALU instructions executed per work-item (affected by flow control). + + + # SALUInsts The average number of scalar ALU instructions executed per work-item (affected by flow control). + + + # SFetchInsts The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control). + + + # GDSInsts The average number of GDS read or GDS write instructions executed per work item (affected by flow control). + + + # MemUnitBusy The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). + + + # ALUStalledByLDS The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). + + +