diff --git a/.gitmodules b/.gitmodules index 391aea4832..2f00d4ebc9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -25,3 +25,6 @@ [submodule "external/elfio"] path = external/elfio url = https://github.com/serge1/ELFIO.git +[submodule "external/yaml-cpp"] + path = external/yaml-cpp + url = https://github.com/jbeder/yaml-cpp.git diff --git a/cmake/rocprofiler_interfaces.cmake b/cmake/rocprofiler_interfaces.cmake index 15377d5a3b..7717378348 100644 --- a/cmake/rocprofiler_interfaces.cmake +++ b/cmake/rocprofiler_interfaces.cmake @@ -54,6 +54,7 @@ rocprofiler_add_interface_library(rocprofiler-elf "ElfUtils elf library" INTERNA rocprofiler_add_interface_library(rocprofiler-dw "ElfUtils dw library" INTERNAL) rocprofiler_add_interface_library(rocprofiler-elfio "ELFIO header-only C++ library" INTERNAL) +rocprofiler_add_interface_library(rocprofiler-yaml-cpp "YAML CPP Parser" INTERNAL) # # interface for libraries (ROCm-specific) diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt index ee361ab3c5..149aa54618 100644 --- a/external/CMakeLists.txt +++ b/external/CMakeLists.txt @@ -126,6 +126,21 @@ if(NOT TARGET PTL::ptl-static) add_subdirectory(ptl EXCLUDE_FROM_ALL) endif() +rocprofiler_checkout_git_submodule( + RECURSIVE + RELATIVE_PATH external/yaml-cpp + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + REPO_URL https://github.com/jbeder/yaml-cpp.git + REPO_BRANCH "master") + +add_subdirectory(yaml-cpp EXCLUDE_FROM_ALL) + +target_link_libraries(rocprofiler-yaml-cpp + INTERFACE $) +target_include_directories( + rocprofiler-yaml-cpp + INTERFACE $) + # checkout submodule if not already checked out or clone repo if no .gitmodules file rocprofiler_checkout_git_submodule( RECURSIVE diff --git a/external/yaml-cpp b/external/yaml-cpp new file mode 160000 index 0000000000..1d8ca1f35e --- /dev/null +++ b/external/yaml-cpp @@ -0,0 +1 @@ +Subproject commit 1d8ca1f35eb3a9c9142462b28282a848e5d29a91 diff --git a/source/lib/common/CMakeLists.txt b/source/lib/common/CMakeLists.txt index 
28d859c98a..e1b5d8ffc2 100644 --- a/source/lib/common/CMakeLists.txt +++ b/source/lib/common/CMakeLists.txt @@ -4,7 +4,7 @@ rocprofiler_activate_clang_tidy() set(common_sources demangle.cpp elf_utils.cpp environment.cpp logging.cpp - static_object.cpp utility.cpp xml.cpp) + static_object.cpp utility.cpp) set(common_headers abi.hpp defines.hpp @@ -19,8 +19,7 @@ set(common_headers stringize_arg.hpp synchronized.hpp units.hpp - utility.hpp - xml.hpp) + utility.hpp) add_library(rocprofiler-common-library STATIC) add_library(rocprofiler-sdk::rocprofiler-common-library ALIAS rocprofiler-common-library) @@ -41,6 +40,7 @@ target_link_libraries( $ $ $ + $ $ $ $ diff --git a/source/lib/common/xml.cpp b/source/lib/common/xml.cpp deleted file mode 100644 index 20a7eabc1e..0000000000 --- a/source/lib/common/xml.cpp +++ /dev/null @@ -1,531 +0,0 @@ -// MIT License -// -// Copyright (c) 2023 Advanced Micro Devices, Inc. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#include "lib/common/xml.hpp" -#include "lib/common/logging.hpp" - -namespace rocprofiler -{ -namespace common -{ -Xml::Xml(std::string file_name, const Xml* obj) -: file_name_(std::move(file_name)) -, state_(BODY_STATE) -{ - if(obj != nullptr) - { - map_ = obj->map_; - level_ = obj->level_; - included_ = true; - } -} - -Xml::~Xml() -{ - for(auto& x : stack_) - { - x->nodes.clear(); - x->copy.reset(); - } - if(!map_) return; - for(auto& [_, nodes] : *map_) - { - (void) _; - for(auto& node : nodes) - { - node->nodes.clear(); - node->copy.reset(); - } - } -} - -std::shared_ptr -Xml::Create(const std::string& file_name, const Xml* obj) -{ - auto xml = std::make_shared(file_name, obj); - if(xml != nullptr) - { - if(xml->Init() != false) - { - const std::size_t pos = file_name.rfind('/'); - const std::string path = (pos != std::string::npos) ? file_name.substr(0, pos + 1) : ""; - - xml->PreProcess(); - nodes_t incl_nodes; - for(const auto& node : xml->GetNodes("top.include")) - { - if(node->opts.find("touch") == node->opts.end()) - { - node->opts["touch"] = ""; - incl_nodes.push_back(node); - } - } - for(const auto& incl : incl_nodes) - { - const std::string& incl_name = path + incl->opts["file"]; - auto ixml = Create(incl_name, xml.get()); - if(!ixml) - { - xml.reset(); - break; - } - } - if(xml) - { - xml->Process(); - } - } - } - - return xml; -} - -void -Xml::AddExpr(const std::string& full_tag, const std::string& name, const std::string& expr) -{ - const std::size_t pos = full_tag.rfind('.'); - const std::size_t pos1 = (pos == std::string::npos) ? 
0 : pos + 1; - const std::string level_tag = full_tag.substr(pos1); - auto level = std::make_shared(); - (*map_)[full_tag].push_back(level); - level->tag = level_tag; - level->opts["name"] = name; - level->opts["expr"] = expr; -} - -void -Xml::AddConst(const std::string& full_tag, const std::string& name, const uint64_t& val) -{ - std::ostringstream oss; - oss << val; - AddExpr(full_tag, name, oss.str()); -} - -bool -Xml::print_func::operator()(const std::string& global_tag, const std::shared_ptr& node) -{ - std::cout << global_tag << ":\n"; - for(auto& opt : node->opts) - { - std::cout << global_tag << "." << opt.first << " = " << opt.second << "\n"; - } - return true; -} - -void -Xml::Print() const -{ - std::cout << "XML file '" << file_name_ << "':\n"; - ForEach(print_func{}); -} - -bool -Xml::Init() -{ - fd_ = open(file_name_.c_str(), O_RDONLY); - if(fd_ == -1) - { - // perror((std::string("open XML file ") + file_name_).c_str()); - return false; - } - - if(map_ == nullptr) - { - map_ = std::make_unique(); - AddLevel("top"); - } - - return true; -} - -void -Xml::PreProcess() -{ - uint32_t ind = 0; - char buf[kBufSize]; - bool error = false; - - while(true) - { - const uint32_t pos = lseek(fd_, 0, SEEK_CUR); - uint32_t size = read(fd_, buf, kBufSize); - if(size <= 0) break; - buf[size - 1] = '\0'; - - if(strncmp(buf, "#include \"", 10) == 0) - { - for(ind = 0; (ind < size) && (buf[ind] != '\n'); ++ind) - {} - if(ind < size) - { - buf[ind] = '\0'; - size = ind; - lseek(fd_, pos + ind + 1, SEEK_SET); - } - - for(ind = 10; (ind < size) && (buf[ind] != '"'); ++ind) - {} - if(ind == size) - { - error = true; - break; - } - buf[ind] = '\0'; - - AddLevel("include"); - AddOption("file", &buf[10]); - UpLevel(); - } - } - - if(error) - { - fprintf(stderr, "XML PreProcess failed, line '%s'\n", buf); - abort(); - } - - lseek(fd_, 0, SEEK_SET); -} - -void -Xml::Process() -{ - token_t remainder; - - while(true) - { - token_t token = (!remainder.empty()) ? 
remainder : NextToken(); - remainder.clear(); - - // token_t token1 = token; - // token1.push_back('\0'); - // std::cout << ">>> " << &token1[0] << std::endl; - - // End of file - if(token.empty()) break; - - switch(state_) - { - case BODY_STATE: - if(token[0] == '<') - { - bool node_begin = true; - unsigned ind = 1; - if(token[1] == '/') - { - node_begin = false; - ++ind; - } - - unsigned i = ind; - while(i < token.size()) - { - if(token[i] == '>') break; - ++i; - } - for(unsigned j = i + 1; j < token.size(); ++j) - remainder.push_back(token[j]); - - if(i == token.size()) - { - if(node_begin) - state_ = DECL_STATE; - else - BadFormat(token); - token.push_back('\0'); - } - else - { - token[i] = '\0'; - } - - const char* tag = &token[ind]; - if(node_begin) - { - AddLevel(tag); - } - else - { - Inherit(GetOption("base")); - - if(strncmp(CurrentLevel().c_str(), tag, strlen(tag)) != 0) - { - token.back() = '>'; - BadFormat(token); - } - UpLevel(); - } - } - else - { - BadFormat(token); - } - break; - case DECL_STATE: - if(token[0] == '>') - { - state_ = BODY_STATE; - for(unsigned j = 1; j < token.size(); ++j) - remainder.push_back(token[j]); - continue; - } - else - { - token.push_back('\0'); - unsigned j = 0; - for(j = 0; j < token.size(); ++j) - if(token[j] == '=') break; - if(j == token.size()) BadFormat(token); - token[j] = '\0'; - const std::string key = token.data(); - const std::string value = &token[j + 1]; - AddOption(key, value); - } - break; - default: - { - ROCP_ERROR << "XML parser error: wrong state: " << state_; - abort(); - } - } - } -} - -bool -Xml::SpaceCheck() const -{ - bool cond = ((buffer_[index_] == ' ') || (buffer_[index_] == '\t')); - return cond; -} - -bool -Xml::LineEndCheck() -{ - bool found = false; - if(buffer_[index_] == '\n') - { - buffer_[index_] = ' '; - ++file_line_; - found = true; - comment_ = false; - } - else if(comment_ || (buffer_[index_] == '#')) - { - found = true; - comment_ = true; - } - return found; -} - -Xml::token_t 
-Xml::NextToken() -{ - token_t token; - bool in_string = false; - bool special_symb = false; - - while(true) - { - if(data_size_ == 0) - { - data_size_ = read(fd_, buffer_, kBufSize); - if(data_size_ <= 0) break; - } - - if(token.empty()) - { - while((index_ < data_size_) && (SpaceCheck() || LineEndCheck())) - { - ++index_; - } - } - while((index_ < data_size_) && (in_string || !(SpaceCheck() || LineEndCheck()))) - { - const char symb = buffer_[index_]; - bool skip_symb = false; - - switch(symb) - { - case '\\': - if(special_symb) - { - special_symb = false; - } - else - { - special_symb = true; - skip_symb = true; - } - break; - case '"': - if(special_symb) - { - special_symb = false; - } - else - { - in_string = !in_string; - if(!in_string) - { - buffer_[index_] = ' '; - --index_; - } - skip_symb = true; - } - break; - } - - if(!skip_symb) token.push_back(symb); - ++index_; - } - - if(index_ == data_size_) - { - index_ = 0; - data_size_ = 0; - } - else - { - if(special_symb || in_string) BadFormat(token); - break; - } - } - - return token; -} - -void -Xml::BadFormat(token_t token) -{ - token.push_back('\0'); - ROCP_ERROR << "Error: " << file_name_ << ", line " << file_line_ << ", bad XML token '" - << token.data() << "'"; - abort(); -} - -void -Xml::AddLevel(const std::string& tag) -{ - auto level = std::make_shared(); - level->tag = tag; - if(level_) - { - level_->nodes.push_back(level); - stack_.push_back(level_); - } - level_ = level; - - std::string global_tag = GlobalTag(tag); - (*map_)[global_tag].push_back(level_); -} - -void -Xml::UpLevel() -{ - level_ = stack_.back(); - stack_.pop_back(); -} - -void -Xml::Copy(const std::shared_ptr& from, const std::shared_ptr& to) -{ - auto level = to; - if(level == nullptr) - { - AddLevel(from->tag); - level = level_; - } - level->copy = from; - level->opts = from->opts; - - for(const auto& node : from->nodes) - { - bool found = false; - const std::string name = GetOption("name", node); - const std::string global_tag = 
GlobalTag(level->tag) + "." + node->tag; - for(const auto& item : (*map_)[global_tag]) - { - if((name == GetOption("name", item)) || (node == item->copy)) - { - found = true; - break; - } - } - if(found == false) Copy(node, nullptr); - } - - if(to == nullptr) UpLevel(); -} - -void -Xml::Inherit(const std::string& tag) -{ - if(!tag.empty()) - { - const std::string global_tag = GlobalTag(tag); - auto it = map_->find(global_tag); - if(it == map_->end()) - { - fprintf( - stderr, "Node \"%s\": Base not found \"%s\"\n", level_->tag.c_str(), tag.c_str()); - abort(); - } - for(const auto& node : it->second) - { - Copy(node, level_); - } - } -} - -std::string -Xml::CurrentLevel() const -{ - return level_->tag; -} - -std::string -Xml::GlobalTag(const std::string& tag) const -{ - std::string global_tag; - for(const auto& level : stack_) - { - global_tag += level->tag + "."; - } - global_tag += tag; - return global_tag; -} - -void -Xml::AddOption(const std::string& key, const std::string& value) -{ - level_->opts[key] = value; -} - -std::string -Xml::GetOption(const std::string& key, std::shared_ptr level) -{ - level = (level != nullptr) ? level : level_; - auto it = level->opts.find(key); - return (it != level->opts.end()) ? it->second : ""; -} -} // namespace common -} // namespace rocprofiler diff --git a/source/lib/common/xml.hpp b/source/lib/common/xml.hpp deleted file mode 100644 index 88fe2d3346..0000000000 --- a/source/lib/common/xml.hpp +++ /dev/null @@ -1,147 +0,0 @@ -// MIT License -// -// Copyright (c) 2023 Advanced Micro Devices, Inc. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -#pragma once - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace rocprofiler -{ -namespace common -{ -class Xml -{ -public: - using token_t = std::vector; - - struct level_t; - using node_vect_t = std::vector>; - using node_list_t = std::list>; - - using nodes_t = node_vect_t; - using opts_t = std::map; - struct level_t - { - std::string tag; - nodes_t nodes; - opts_t opts; - std::shared_ptr copy; - }; - using nodes_vec_t = std::vector>; - using map_t = std::map; - - enum - { - DECL_STATE, - BODY_STATE - }; - - static std::shared_ptr Create(const std::string& file_name, const Xml* obj = nullptr); - - std::string GetName() { return file_name_; } - - void AddExpr(const std::string& full_tag, const std::string& name, const std::string& expr); - void AddConst(const std::string& full_tag, const std::string& name, const uint64_t& val); - - nodes_t GetNodes(const std::string& global_tag) { return (*map_)[global_tag]; } - const map_t& GetAllNodes() { return (*map_); } - - template - Tp ForEach(const Tp& v_i) const; - - struct print_func - { - bool operator()(const std::string& global_tag, const std::shared_ptr& node); - }; - - void Print() const; - - Xml(std::string file_name, const Xml* obj); - ~Xml(); - -private: - bool Init(); - void PreProcess(); - void Process(); - bool SpaceCheck() const; - bool LineEndCheck(); - token_t NextToken(); - void BadFormat(token_t token); - void AddLevel(const std::string& tag); - void UpLevel(); - void Copy(const std::shared_ptr& from, const std::shared_ptr& to); - void Inherit(const std::string& tag); - std::string CurrentLevel() const; - std::string GlobalTag(const std::string& tag) const; - void AddOption(const std::string& key, const std::string& value); - std::string GetOption(const std::string& key, std::shared_ptr level = nullptr); - - const std::string file_name_; - unsigned file_line_{0}; - int fd_; - - static const size_t 
kBufSize = 256; - char buffer_[kBufSize]; - - unsigned data_size_{0}; - unsigned index_{0}; - unsigned state_{0}; - bool comment_{false}; - std::vector> stack_; - bool included_{false}; - std::shared_ptr level_; - std::shared_ptr map_; -}; - -template -Tp -Xml::ForEach(const Tp& v_i) const -{ - Tp v = v_i; - if(map_) - { - for(auto& entry : *map_) - { - for(const auto& node : entry.second) - { - if(Tp{}(entry.first, node) == false) break; - } - } - } - return v; -} -} // namespace common -} // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk/counters/CMakeLists.txt b/source/lib/rocprofiler-sdk/counters/CMakeLists.txt index 6256c85c35..ff442d7db6 100644 --- a/source/lib/rocprofiler-sdk/counters/CMakeLists.txt +++ b/source/lib/rocprofiler-sdk/counters/CMakeLists.txt @@ -9,6 +9,7 @@ target_sources(rocprofiler-object-library PRIVATE ${ROCPROFILER_LIB_COUNTERS_SOU add_subdirectory(xml) add_subdirectory(parser) +add_subdirectory(yaml) if(ROCPROFILER_BUILD_TESTS) add_subdirectory(tests) diff --git a/source/lib/rocprofiler-sdk/counters/metrics.cpp b/source/lib/rocprofiler-sdk/counters/metrics.cpp index e9b60dfede..562746d049 100644 --- a/source/lib/rocprofiler-sdk/counters/metrics.cpp +++ b/source/lib/rocprofiler-sdk/counters/metrics.cpp @@ -27,11 +27,19 @@ #include "lib/common/filesystem.hpp" #include "lib/common/static_object.hpp" #include "lib/common/utility.hpp" -#include "lib/common/xml.hpp" #include "lib/rocprofiler-sdk/agent.hpp" #include "glog/logging.h" +#include "yaml-cpp/exceptions.h" +#include "yaml-cpp/node/convert.h" +#include "yaml-cpp/node/detail/impl.h" +#include "yaml-cpp/node/impl.h" +#include "yaml-cpp/node/iterator.h" +#include "yaml-cpp/node/node.h" +#include "yaml-cpp/node/parse.h" +#include "yaml-cpp/parser.h" + #include // for dladdr #include #include @@ -79,55 +87,76 @@ get_constants() } return constants; } - -// Future TODO: inheritance? does it work for derived_counters.xml? 
+/** + * Expected YAML Format: + * COUNTER_NAME: + * architectures: + * gfxXX: // Can be more than one, / delimited if they share identical data + * block: + * event: + * expression: + * description: + * gfxYY: + * ... + * description: General counter description + */ MetricMap -loadXml(const std::string& filename, bool load_constants = false) +loadYAML(const std::string& filename, bool load_constants = false, bool load_derived = false) { MetricMap ret; ROCP_INFO << "Loading Counter Config: " << filename; - // todo: return unique_ptr.... - auto xml = common::Xml::Create(filename); - ROCP_FATAL_IF(!xml) - << "Could not open XML Counter Config File (set env ROCPROFILER_METRICS_PATH)"; + auto yaml = YAML::LoadFile(filename); - const auto& constant_metrics = get_constants(); - for(const auto& [gfx_name, nodes] : xml->GetAllNodes()) + for(auto it = yaml.begin(); it != yaml.end(); ++it) { - /** - * "top." is used to designate the root encapsulation of all contained XML subroots (in our - * case "gfxX"). This is inserted by the parser so it will always be present. .metric - * denotes XML tags that are contained in the subroots. This will not change unless we - * respec the XML (which we should...). 
- */ - if(gfx_name.find("metric") == std::string::npos || - gfx_name.find("top.") == std::string::npos || gfx_name.find("gfx") == std::string::npos) - continue; + auto counter_name = it->first.as(); + auto counter_def = it->second; + auto def_iterator = counter_def["architectures"]; - auto& metricVec = - ret.emplace(gfx_name.substr(strlen("top."), - gfx_name.size() - strlen("top.") - strlen(".metric")), - std::vector()) - .first->second; - for(const auto& node : nodes) + for(auto def_it = def_iterator.begin(); def_it != def_iterator.end(); ++def_it) { - metricVec.emplace_back(gfx_name, - node->opts["name"], - node->opts["block"], - node->opts["event"], - node->opts["descr"], - node->opts["expr"], - node->opts["special"], - current_id()); - current_id()++; - } + auto archs = def_it->first.as(); + auto def = def_it->second; + // To save space in the YAML file, we combine architectures with the same + // definition into a single entry. Split these out into separate entries. + // architectures: + // gfx10/gfx1010/gfx1030/gfx1031/.....9: + // expression: 400*SQ_WAIT_INST_LDS/SQ_WAVES/GRBM_GUI_ACTIVE + std::vector result; + std::stringstream ss(archs); + std::string arch_name; - if(load_constants) - { - metricVec.insert(metricVec.end(), constant_metrics.begin(), constant_metrics.end()); + while(std::getline(ss, arch_name, '/')) + { + auto& metricVec = ret.emplace(arch_name, std::vector()).first->second; + if(metricVec.empty() && load_constants) + { + metricVec.insert( + metricVec.end(), get_constants().begin(), get_constants().end()); + } + + if((def["expression"] && load_derived) || (!load_derived && !def["expression"])) + { + std::string description; + if(def["description"]) + description = def["description"].as(); + else if(counter_def["description"]) + description = counter_def["description"].as(); + metricVec.emplace_back( + arch_name, + counter_name, + (def["block"] ? def["block"].as() : ""), + (def["event"] ? 
def["event"].as() : ""), + description, + (def["expression"] ? def["expression"].as() : ""), + "", + current_id()); + current_id()++; + ROCP_TRACE << fmt::format("Inserted info {}: {}", arch_name, metricVec.back()); + } + } } } - ROCP_FATAL_IF(current_id() > 65536) << "Counter count exceeds 16 bits, which may break counter id output"; return ret; @@ -163,19 +192,19 @@ findViaEnvironment(const std::string& filename) MetricMap getDerivedHardwareMetrics() { - auto counters_path = findViaEnvironment("derived_counters.xml"); + auto counters_path = findViaEnvironment("counter_defs.yaml"); ROCP_FATAL_IF(!common::filesystem::exists(counters_path)) << "metric xml file '" << counters_path << "' does not exist"; - return loadXml(counters_path); + return loadYAML(counters_path, false, true); } MetricMap getBaseHardwareMetrics() { - auto counters_path = findViaEnvironment("basic_counters.xml"); + auto counters_path = findViaEnvironment("counter_defs.yaml"); ROCP_FATAL_IF(!common::filesystem::exists(counters_path)) << "metric xml file '" << counters_path << "' does not exist"; - return loadXml(counters_path, true); + return loadYAML(counters_path, true, false); } const MetricIdMap* diff --git a/source/lib/rocprofiler-sdk/counters/tests/core.cpp b/source/lib/rocprofiler-sdk/counters/tests/core.cpp index 7ba514ba4a..2b8583b079 100644 --- a/source/lib/rocprofiler-sdk/counters/tests/core.cpp +++ b/source/lib/rocprofiler-sdk/counters/tests/core.cpp @@ -208,6 +208,7 @@ TEST(core, check_packet_generation) */ rocprofiler_profile_config_id_t cfg_id = {}; rocprofiler_counter_id_t id = {.handle = metric.id()}; + ROCP_ERROR << fmt::format("Generating packet for {}", metric); ROCPROFILER_CALL( rocprofiler_create_profile_config(agent.get_rocp_agent()->id, &id, 1, &cfg_id), "Unable to create profile"); diff --git a/source/lib/rocprofiler-sdk/counters/tests/metrics_test.h b/source/lib/rocprofiler-sdk/counters/tests/metrics_test.h index b552924f27..897fcfff9b 100644 --- 
a/source/lib/rocprofiler-sdk/counters/tests/metrics_test.h +++ b/source/lib/rocprofiler-sdk/counters/tests/metrics_test.h @@ -31,11 +31,7 @@ // Layout is: {name, block, event, expression, description}. static const std::unordered_map>> basic_gfx908 = { {"gfx908", - {{"MAX_WAVE_SIZE", "", "", "1", "Max wave size constant"}, - {"SE_NUM", "", "", "1", "SE_NUM"}, - {"SIMD_NUM", "", "", "1", "SIMD Number"}, - {"CU_NUM", "", "", "1", "CU_NUM"}, - {"SQ_INSTS_VMEM_WR", + {{"SQ_INSTS_VMEM_WR", "SQ", "28", "", @@ -86,14 +82,14 @@ static const std::unordered_map", - "regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. " - "(per-simd, nondeterministic). Units in quad-cycles(4 cycles)"}, + "Number of cycles the SQ instruction arbiter is working on a VALU instruction. " + "(per-simd, emulated). Units in quad-cycles(4 cycles)"}, {"SQ_INST_CYCLES_SALU", "SQ", "85", "", "Number of cycles needed to execute non-memory read scalar operations. (per-simd, " - "emulated)"}, + "emulated). Units in quad-cycles(4 cycles)"}, {"SQ_THREAD_CYCLES_VALU", "SQ", "86", @@ -166,7 +162,11 @@ static const std::unordered_map>> derived_gfx908 = {{"gfx908", - {{"GPUBusy", + {{"MAX_WAVE_SIZE", "", "", "wave_front_size", "Max wave size constant"}, + {"SE_NUM", "", "", "array_count/simd_arrays_per_engine", "SE_NUM"}, + {"SIMD_NUM", "", "", "simd_per_cu/CU_NUM", "SIMD Number"}, + {"CU_NUM", "", "", "cu_per_simd_array*array_count", "CU_NUM"}, + {"GPUBusy", "", "", "100*GRBM_GUI_ACTIVE/GRBM_COUNT", @@ -231,7 +231,7 @@ static const std::unordered_mapTCC request latency for reads and atomics with return. Not + Windowed. +TCP_TCC_READ_REQ_LATENCY_sum: + architectures: + gfx90a: + expression: reduce(TCP_TCC_READ_REQ_LATENCY,sum) + description: Total TCP->TCC request latency for reads and atomics with return. Not + Windowed. Sum over TCP instances. 
+TCP_TCC_READ_REQ_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_TCC_READ_REQ,sum) + description: Total read requests from TCP to all TCCs Sum over TCP instances. +TCP_TCC_RW_ATOMIC_REQ: + architectures: + gfx90a: + block: TCP + event: 87 + gfx942/gfx941/gfx940: + block: TCP + event: 82 + description: Total atomic requests with RW mtype from this TCP to all TCCs +TCP_TCC_RW_ATOMIC_REQ_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_TCC_RW_ATOMIC_REQ,sum) + description: Total atomic requests with RW mtype from this TCP to all TCCs. Sum + over TCP instances. +TCP_TCC_RW_READ_REQ: + architectures: + gfx90a: + block: TCP + event: 85 + gfx942/gfx941/gfx940: + block: TCP + event: 80 + description: Total read requests with RW mtype from this TCP to all TCCs +TCP_TCC_RW_READ_REQ_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_TCC_RW_READ_REQ,sum) + description: Total read requests with RW mtype from this TCP to all TCCs. Sum over + TCP instances. +TCP_TCC_RW_WRITE_REQ: + architectures: + gfx90a: + block: TCP + event: 86 + gfx942/gfx941/gfx940: + block: TCP + event: 81 + description: Total write requests with RW mtype from this TCP to all TCCs +TCP_TCC_RW_WRITE_REQ_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_TCC_RW_WRITE_REQ,sum) + description: Total write requests with RW mtype from this TCP to all TCCs. Sum over + TCP instances. +TCP_TCC_UC_ATOMIC_REQ: + architectures: + gfx90a: + block: TCP + event: 80 + gfx942/gfx941/gfx940: + block: TCP + event: 76 + description: Total atomic requests with UC mtype from this TCP to all TCCs +TCP_TCC_UC_ATOMIC_REQ_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_TCC_UC_ATOMIC_REQ,sum) + description: Total atomic requests with UC mtype from this TCP to all TCCs Sum over + TCP instances. 
+TCP_TCC_UC_READ_REQ: + architectures: + gfx90a: + block: TCP + event: 78 + gfx942/gfx941/gfx940: + block: TCP + event: 74 + description: Total read requests with UC mtype from this TCP to all TCCs +TCP_TCC_UC_READ_REQ_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_TCC_UC_READ_REQ,sum) + description: Total read requests with UC mtype from this TCP to all TCCs Sum over + TCP instances. +TCP_TCC_UC_WRITE_REQ: + architectures: + gfx90a: + block: TCP + event: 79 + gfx942/gfx941/gfx940: + block: TCP + event: 75 + description: Total write requests with UC mtype from this TCP to all TCCs +TCP_TCC_UC_WRITE_REQ_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_TCC_UC_WRITE_REQ,sum) + description: Total write requests with UC mtype from this TCP to all TCCs Sum over + TCP instances. +TCP_TCC_WRITE_REQ: + architectures: + gfx90a: + block: TCP + event: 70 + gfx942/gfx941/gfx940: + block: TCP + event: 66 + description: Total write requests from TCP to all TCCs +TCP_TCC_WRITE_REQ_LATENCY: + architectures: + gfx90a: + block: TCP + event: 67 + description: Total TCP->TCC request latency for writes and atomics without return. + Not Windowed. +TCP_TCC_WRITE_REQ_LATENCY_sum: + architectures: + gfx90a: + expression: reduce(TCP_TCC_WRITE_REQ_LATENCY,sum) + description: Total TCP->TCC request latency for writes and atomics without return. + Not Windowed. Sum over TCP instances. +TCP_TCC_WRITE_REQ_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_TCC_WRITE_REQ,sum) + description: Total write requests from TCP to all TCCs Sum over TCP instances. 
+TCP_TCP_LATENCY: + architectures: + gfx90a: + block: TCP + event: 65 + description: Total TCP wave latency (from first clock of wave entering to first + clock of wave leaving), divide by TA_TCP_STATE_READ to avg wave latency +TCP_TCP_LATENCY_sum: + architectures: + gfx90a: + expression: reduce(TCP_TCP_LATENCY,sum) + description: Total TCP wave latency (from first clock of wave entering to first + clock of wave leaving), divide by TA_TCP_STATE_READ to avg wave latency Sum over + TCP instances. +TCP_TCP_TA_DATA_STALL_CYCLES: + architectures: + gfx8: + block: TCP + event: 3 + gfx942/gfx941/gfx906/gfx940/gfx908/gfx900/gfx90a/gfx9: + block: TCP + event: 6 + description: TCP stalls TA data interface. Now Windowed. +TCP_TCP_TA_DATA_STALL_CYCLES_max: + architectures: + gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9: + expression: reduce(TCP_TCP_TA_DATA_STALL_CYCLES,max) + description: Maximum number of TCP stalls TA data interface. +TCP_TCP_TA_DATA_STALL_CYCLES_sum: + architectures: + gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9: + expression: reduce(TCP_TCP_TA_DATA_STALL_CYCLES,sum) + description: Total number of TCP stalls TA data interface. +TCP_TCR_TCP_STALL_CYCLES: + architectures: + gfx942/gfx941/gfx940/gfx90a: + block: TCP + event: 8 + description: TCR stalls TCP_TCR_req interface +TCP_TCR_TCP_STALL_CYCLES_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_TCR_TCP_STALL_CYCLES,sum) + description: TCR stalls TCP_TCR_req interface. Sum over TCP instances. +TCP_TD_TCP_STALL_CYCLES: + architectures: + gfx942/gfx941/gfx940/gfx90a: + block: TCP + event: 7 + description: TD stalls TCP +TCP_TD_TCP_STALL_CYCLES_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_TD_TCP_STALL_CYCLES,sum) + description: TD stalls TCP. Sum over TCP instances. +TCP_TOTAL_ACCESSES: + architectures: + gfx90a: + block: TCP + event: 29 + gfx942/gfx941/gfx940: + block: TCP + event: 27 + description: Total number of pixels/buffers from TA. 
Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD +TCP_TOTAL_ACCESSES_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_TOTAL_ACCESSES,sum) + description: Total number of pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD. + Sum over TCP instances. +TCP_TOTAL_ATOMIC_WITHOUT_RET: + architectures: + gfx90a: + block: TCP + event: 39 + gfx942/gfx941/gfx940: + block: TCP + event: 37 + description: Total number of atomic without return pixels/buffers from TA +TCP_TOTAL_ATOMIC_WITHOUT_RET_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_TOTAL_ATOMIC_WITHOUT_RET,sum) + description: Total number of atomic without return pixels/buffers from TA Sum over + TCP instances. +TCP_TOTAL_ATOMIC_WITH_RET: + architectures: + gfx90a: + block: TCP + event: 38 + gfx942/gfx941/gfx940: + block: TCP + event: 36 + description: Total number of atomic with return pixels/buffers from TA +TCP_TOTAL_ATOMIC_WITH_RET_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_TOTAL_ATOMIC_WITH_RET,sum) + description: Total number of atomic with return pixels/buffers from TA. Sum over + TCP instances. +TCP_TOTAL_CACHE_ACCESSES: + architectures: + gfx942/gfx941/gfx940/gfx90a: + block: TCP + event: 60 + description: Count of total cache line (tag) accesses (includes hits and misses). +TCP_TOTAL_CACHE_ACCESSES_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_TOTAL_CACHE_ACCESSES,sum) + description: Count of total cache line (tag) accesses (includes hits and misses). + Sum over TCP instances. +TCP_TOTAL_READ: + architectures: + gfx90a: + block: TCP + event: 30 + gfx942/gfx941/gfx940: + block: TCP + event: 28 + description: Total number of read pixels/buffers from TA. 
Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ +TCP_TOTAL_READ_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_TOTAL_READ,sum) + description: Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ. Sum over + TCP instances. +TCP_TOTAL_WRITE: + architectures: + gfx90a: + block: TCP + event: 32 + gfx942/gfx941/gfx940: + block: TCP + event: 30 + description: Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+ + TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE +TCP_TOTAL_WRITEBACK_INVALIDATES: + architectures: + gfx90a: + block: TCP + event: 45 + gfx942/gfx941/gfx940: + block: TCP + event: 43 + description: Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ + TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. + Not Windowed. +TCP_TOTAL_WRITEBACK_INVALIDATES_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_TOTAL_WRITEBACK_INVALIDATES,sum) + description: Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ + TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. + Not Windowed. Sum over TCP instances. +TCP_TOTAL_WRITE_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_TOTAL_WRITE,sum) + description: Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+ + TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE. Sum over TCP instances. 
+TCP_UTCL1_PERMISSION_MISS: + architectures: + gfx90a: + block: TCP + event: 50 + gfx942/gfx941/gfx940: + block: TCP + event: 49 + description: Total utcl1 permission misses +TCP_UTCL1_PERMISSION_MISS_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_UTCL1_PERMISSION_MISS,sum) + description: Total utcl1 permission misses. Sum over TCP instances. +TCP_UTCL1_REQUEST: + architectures: + gfx90a: + block: TCP + event: 47 + gfx942/gfx941/gfx940: + block: TCP + event: 45 + description: Total CLIENT_UTCL1 NORMAL requests +TCP_UTCL1_REQUEST_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_UTCL1_REQUEST,sum) + description: Total CLIENT_UTCL1 NORMAL requests. Sum over TCP instances. +TCP_UTCL1_TRANSLATION_HIT: + architectures: + gfx90a: + block: TCP + event: 49 + gfx942/gfx941/gfx940: + block: TCP + event: 48 + description: Total utcl1 translation hits +TCP_UTCL1_TRANSLATION_HIT_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_UTCL1_TRANSLATION_HIT,sum) + description: Total utcl1 translation hits. Sum over TCP instances. +TCP_UTCL1_TRANSLATION_MISS: + architectures: + gfx90a: + block: TCP + event: 48 + gfx942/gfx941/gfx940: + block: TCP + event: 47 + description: Total utcl1 translation misses +TCP_UTCL1_TRANSLATION_MISS_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_UTCL1_TRANSLATION_MISS,sum) + description: Total utcl1 translation misses. Sum over TCP instances. +TCP_VOLATILE: + architectures: + gfx90a: + block: TCP + event: 28 + gfx942/gfx941/gfx940: + block: TCP + event: 26 + description: Total number of L1 volatile pixels/buffers from TA +TCP_VOLATILE_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_VOLATILE,sum) + description: Total number of L1 volatile pixels/buffers from TA. Sum over TCP instances.
+TCP_WRITE_TAGCONFLICT_STALL_CYCLES: + architectures: + gfx90a: + block: TCP + event: 12 + gfx942/gfx941/gfx940: + block: TCP + event: 11 + description: Tagram conflict stall on a write +TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TCP_WRITE_TAGCONFLICT_STALL_CYCLES,sum) + description: Tagram conflict stall on a write. Sum over TCP instances. +TD_ATOMIC_WAVEFRONT: + architectures: + gfx90a: + block: TD + event: 26 + gfx942/gfx941/gfx940: + block: TD + event: 17 + description: Count the wavefronts with opcode = atomic. +TD_ATOMIC_WAVEFRONT_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TD_ATOMIC_WAVEFRONT,sum) + description: Count the wavefronts with opcode = atomic. Sum over TD instances. +TD_COALESCABLE_WAVEFRONT: + architectures: + gfx90a: + block: TD + event: 32 + gfx942/gfx941/gfx940: + block: TD + event: 21 + description: Count wavefronts that TA finds coalescable. +TD_COALESCABLE_WAVEFRONT_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TD_COALESCABLE_WAVEFRONT,sum) + description: Count wavefronts that TA finds coalescable. Sum over TD instances. +TD_LOAD_WAVEFRONT: + architectures: + gfx90a: + block: TD + event: 25 + gfx942/gfx941/gfx940: + block: TD + event: 16 + description: Count the wavefronts with opcode = load, include atomics and store. +TD_LOAD_WAVEFRONT_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TD_LOAD_WAVEFRONT,sum) + description: Count the wavefronts with opcode = load, include atomics and store. + Sum over TD instances. 
+TD_SPI_STALL: + architectures: + gfx90a: + block: TD + event: 18 + gfx942/gfx941/gfx940: + block: TD + event: 15 + description: TD is stalled SPI vinit +TD_SPI_STALL_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TD_SPI_STALL,sum) + description: TD is stalled SPI vinit. Sum over TD instances. +TD_STORE_WAVEFRONT: + architectures: + gfx90a: + block: TD + event: 27 + gfx942/gfx941/gfx940: + block: TD + event: 18 + description: Count the wavefronts with opcode = store. +TD_STORE_WAVEFRONT_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TD_STORE_WAVEFRONT,sum) + description: Count the wavefronts with opcode = store. Sum over TD instances. +TD_TC_STALL: + architectures: + gfx90a: + block: TD + event: 15 + gfx942/gfx941/gfx940: + block: TD + event: 12 + description: TD is stalled waiting for TC data. +TD_TC_STALL_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TD_TC_STALL,sum) + description: TD is stalled waiting for TC data. Sum over TD instances. +TD_TD_BUSY: + architectures: + gfx942/gfx941/gfx940/gfx90a: + block: TD + event: 1 + description: TD is processing or waiting for data. Perf_Windowing not supported + for this counter. +TD_TD_BUSY_sum: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: reduce(TD_TD_BUSY,sum) + description: TD is processing or waiting for data. Perf_Windowing not supported + for this counter. Sum over TD instances.
+TOTAL_16_OPS: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: (SQ_INSTS_VALU_FMA_F16*2+SQ_INSTS_VALU_ADD_F16+SQ_INSTS_VALU_MUL_F16+SQ_INSTS_VALU_TRANS_F16)*64+((SQ_INSTS_VALU_MFMA_MOPS_F16+SQ_INSTS_VALU_MFMA_MOPS_BF16)*512) + description: The number of 16 bits OPS executed +TOTAL_32_OPS: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: (SQ_INSTS_VALU_FMA_F32*2+SQ_INSTS_VALU_INT32+SQ_INSTS_VALU_ADD_F32+SQ_INSTS_VALU_MUL_F32+SQ_INSTS_VALU_TRANS_F32)*64+(SQ_INSTS_VALU_MFMA_MOPS_F32*512) + description: The number of 32 bits OPS executed +TOTAL_64_OPS: + architectures: + gfx942/gfx941/gfx940/gfx90a: + expression: (SQ_INSTS_VALU_FMA_F64*2+SQ_INSTS_VALU_INT64+SQ_INSTS_VALU_ADD_F64+SQ_INSTS_VALU_MUL_F64)*64+(SQ_INSTS_VALU_MFMA_MOPS_F64*512) + description: The number of 64 bits OPS executed +TaUtil: + architectures: + gfx90a: + expression: 100*GRBM_TA_BUSY/GRBM_GUI_ACTIVE + description: 'Unit: percent' +TcUtil: + architectures: + gfx90a: + expression: 100*GRBM_TC_BUSY/GRBM_GUI_ACTIVE + description: 'Unit: percent' +VALUBusy: + architectures: + gfx906/gfx908/gfx8/gfx90a/gfx9: + expression: 100*SQ_ACTIVE_INST_VALU*4/SIMD_NUM/GRBM_GUI_ACTIVE + gfx942/gfx941/gfx940: + expression: 100*reduce(SQ_ACTIVE_INST_VALU,sum)*4/SIMD_NUM/reduce(GRBM_GUI_ACTIVE,sum) + description: 'The percentage of GPUTime vector ALU instructions are processed. Value + range: 0% (bad) to 100% (optimal).' +VALUInsts: + architectures: + gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx8/gfx90a/gfx9: + expression: SQ_INSTS_VALU/SQ_WAVES + description: The average number of vector ALU instructions executed per work-item + (affected by flow control). +VALUUtilization: + architectures: + gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9: + expression: 100*SQ_THREAD_CYCLES_VALU/(SQ_ACTIVE_INST_VALU*MAX_WAVE_SIZE) + description: 'The percentage of active vector ALU threads in a wave. 
A lower number + can mean either more thread divergence in a wave or that the work-group size is + not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence).' +VFetchInsts: + architectures: + gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9: + expression: (SQ_INSTS_VMEM_RD-TA_FLAT_READ_WAVEFRONTS_sum)/SQ_WAVES + description: The average number of vector fetch instructions from the video memory + executed per work-item (affected by flow control). Excludes FLAT instructions + that fetch from video memory. +VWriteInsts: + architectures: + gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9: + expression: (SQ_INSTS_VMEM_WR-TA_FLAT_WRITE_WAVEFRONTS_sum)/SQ_WAVES + description: The average number of vector write instructions to the video memory + executed per work-item (affected by flow control). Excludes FLAT instructions + that write to video memory. +ValuIops: + architectures: + gfx90a: + expression: (SQ_INSTS_VALU_INT32+SQ_INSTS_VALU_INT64)*64 + description: 'Unit: IOP' +ValuPipeIssueUtil: + architectures: + gfx90a: + expression: 100*SQ_ACTIVE_INST_VALU/(GRBM_GUI_ACTIVE*CU_NUM) + description: 'Unit: percent' +VmemLatency: + architectures: + gfx90a: + expression: SQ_ACCUM_PREV_HIRES/SQ_INSTS_VMEM + description: 'Unit: cycles' +VmemPipeIssueUtil: + architectures: + gfx90a: + expression: 400*(SQ_ACTIVE_INST_VMEM+SQ_ACTIVE_INST_FLAT)/(GRBM_GUI_ACTIVE*CU_NUM) + description: 'Unit: percent' +WAVE_DEP_WAIT: + architectures: + gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101: + expression: 100*SQ_WAIT_ANY/SQ_WAVE_CYCLES + description: Percentage of the SQ_WAVE_CYCLE time spent waiting for anything. +WAVE_ISSUE_WAIT: + architectures: + gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101: + expression: 100*SQ_WAIT_INST_ANY/SQ_WAVE_CYCLES + description: Percentage of the SQ_WAVE_CYCLE time spent waiting for any instruction + issue. 
+WDATA1_SIZE: + architectures: + gfx906: + expression: ((TCC_EA1_WRREQ_sum-TCC_EA1_WRREQ_64B_sum)*32+TCC_EA1_WRREQ_64B_sum*64) + description: The total bytes written to the video memory. This is measured on + EA1s. +WRITE_REQ_32B: + architectures: + gfx8: + expression: TCC_MC_WRREQ_sum + gfx906: + expression: (TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)+(TCC_EA1_WRREQ_sum-TCC_EA1_WRREQ_64B_sum)+(TCC_EA_WRREQ_64B_sum+TCC_EA1_WRREQ_64B_sum)*2 + gfx908/gfx90a/gfx9: + expression: TCC_EA_WRREQ_64B_sum*2+(TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum) + gfx942/gfx941/gfx940: + expression: TCC_EA0_WRREQ_64B_sum*2+(TCC_EA0_WRREQ_sum-TCC_EA0_WRREQ_64B_sum) + description: The total number of 32-byte effective memory writes. +WRITE_SIZE: + architectures: + gfx8: + expression: (TCC_MC_WRREQ_sum*32)/1024 + gfx906: + expression: ((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64+WDATA1_SIZE)/1024 + gfx908/gfx90a/gfx9: + expression: ((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024 + gfx942/gfx941/gfx940: + expression: ((TCC_EA0_WRREQ_sum-TCC_EA0_WRREQ_64B_sum)*32+TCC_EA0_WRREQ_64B_sum*64)/1024 + description: The total kilobytes written to the video memory. This is measured with + all extra fetches and any cache or memory effects taken into account.
+WaveDepWait: + architectures: + gfx90a: + expression: 100*SQ_WAIT_ANY/SQ_WAVE_CYCLES + description: 'Unit: percent' +WaveDuration: + architectures: + gfx90a: + expression: 4*SQ_WAVE_CYCLES/SQ_WAVES + description: 'Unit: cycles' +WaveExec: + architectures: + gfx90a: + expression: 100*SQ_ACTIVE_INST_ANY/SQ_WAVE_CYCLES + description: 'Unit: percent' +WaveIssueWait: + architectures: + gfx90a: + expression: 100*SQ_WAIT_INST_ANY/SQ_WAVE_CYCLES + description: 'Unit: percent' +WaveOccupancy: + architectures: + gfx90a: + expression: SQ_ACCUM_PREV_HIRES/GRBM_GUI_ACTIVE + description: 'Unit: wavefronts' +Wavefronts: + architectures: + gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx8/gfx90a/gfx9: + expression: SQ_WAVES + description: Total wavefronts. +WriteSize: + architectures: + gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9: + expression: WRITE_SIZE + description: The total kilobytes written to the video memory. This is measured with + all extra fetches and any cache or memory effects taken into account. +WriteUnitStalled: + architectures: + gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101: + expression: 100*GL2C_WRREQ_STALL_max/GRBM_GUI_ACTIVE + gfx906/gfx908/gfx8/gfx90a/gfx9: + expression: 100*TCC_WRREQ_STALL_max/GRBM_GUI_ACTIVE + description: 'The percentage of GPUTime the Write unit is stalled. Value range: + 0% to 100% (bad).' 
+sL1dCacheHitRate: + architectures: + gfx90a: + expression: 100*SQC_DCACHE_HITS/SQC_DCACHE_REQ + description: 'Unit: percent' +vL1dAtomicTagConfStallRate: + architectures: + gfx90a: + expression: 100*TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum/TCP_GATE_EN2_sum + description: 'Unit: percent' +vL1dBufCoalesceRate: + architectures: + gfx90a: + expression: 6400*TA_TOTAL_WAVEFRONTS_sum/(TCP_TOTAL_ACCESSES_sum*4) + description: 'Unit: percent' +vL1dCacheTcbHitRate: + architectures: + gfx90a: + expression: 100*TCP_UTCL1_TRANSLATION_HIT_sum/TCP_UTCL1_REQUEST_sum + description: 'Unit: percent' +vL1dCacheUtil: + architectures: + gfx90a: + expression: 100*TCP_GATE_EN2_sum/TCP_GATE_EN1_sum + description: 'Unit: percent' +vL1dCacheWaveLatency: + architectures: + gfx90a: + expression: TCP_TCP_LATENCY_sum/TCP_TA_TCP_STATE_READ_sum + description: 'Unit: cycles' +vL1dDataPendRate: + architectures: + gfx90a: + expression: 100*TCP_PENDING_STALL_CYCLES_sum/TCP_GATE_EN2_sum + description: 'Unit: percent' +vL1dDataRetStallRate: + architectures: + gfx90a: + expression: 100*TD_TC_STALL_sum/TD_TD_BUSY_sum + description: 'Unit: percent' +vL1dMissReqStallRate: + architectures: + gfx90a: + expression: 100*TCP_TCR_TCP_STALL_CYCLES_sum/TCP_GATE_EN2_sum + description: 'Unit: percent' +vL1dRdTagConfStallRate: + architectures: + gfx90a: + expression: 100*TCP_READ_TAGCONFLICT_STALL_CYCLES_sum/TCP_GATE_EN2_sum + description: 'Unit: percent' +vL1dReadFromL2Latency: + architectures: + gfx90a: + expression: TCP_TCC_READ_REQ_LATENCY_sum/(TCP_TCC_READ_REQ_sum+TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + description: 'Unit: cycles' +vL1dWrTagConfStallRate: + architectures: + gfx90a: + expression: 100*TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum/TCP_GATE_EN2_sum + description: 'Unit: percent' +vL1dWriteToL2Latency: + architectures: + gfx90a: + expression: TCP_TCC_WRITE_REQ_LATENCY_sum/(TCP_TCC_WRITE_REQ_sum+TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + description: 'Unit: cycles'