Migrate XML counter defs and reader from v1/v2 (#25)

* Migrate XML counter defs and reader from v1/v2 * working set * more fixes * Update CMakeLists.txt * source formatting (clang-format v11) (#83) Co-authored-by: bwelton <bwelton@users.noreply.github.com> * Update source/lib/rocprofiler/counters/CMakeLists.txt --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: bwelton <bwelton@users.noreply.github.com> Co-authored-by: Jonathan R. Madsen <jrmadsen@users.noreply.github.com> [ROCm/rocprofiler-sdk commit: 59cc2a382c]
2023-09-26 11:50:57 -07:00
@@ -5,7 +5,7 @@ rocprofiler_activate_clang_tidy()

 set(common_sources config.cpp environment.cpp demangle.cpp)
 set(common_headers config.hpp defines.hpp environment.hpp demangle.hpp mpl.hpp
-                   utility.hpp)
+                   utility.hpp xml.hpp)

 add_library(rocprofiler-common-library STATIC)
 add_library(rocprofiler::rocprofiler-common-library ALIAS rocprofiler-common-library)
@@ -0,0 +1,606 @@
+/******************************************************************************
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*******************************************************************************/
+
+#pragma once
+
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <fstream>
+#include <iostream>
+#include <list>
+#include <map>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace xml
+{
+class Xml
+{
+public:
+    // Raw characters of one token as produced by NextToken() (not NUL-terminated).
+    using token_t = std::vector<char>;
+
+    struct level_t;
+    using node_vect_t = std::vector<std::shared_ptr<level_t>>;
+    using node_list_t = std::list<std::shared_ptr<level_t>>;
+
+    using nodes_t = node_vect_t;
+    // Attribute name -> attribute value of a node.
+    using opts_t  = std::map<std::string, std::string>;
+    // One node of the parsed tree.
+    struct level_t
+    {
+        // Local tag of the node (without the dotted path of its ancestors).
+        std::string                    tag;
+        // Child nodes, appended by AddLevel() in the order they are opened.
+        nodes_t                        nodes;
+        // Attributes stored through AddOption() (or AddExpr()).
+        opts_t                         opts;
+        // Node this one was copied from; set by Copy(), otherwise empty.
+        std::shared_ptr<const level_t> copy;
+    };
+    using nodes_vec_t = std::vector<std::shared_ptr<level_t>>;
+    // Global (dot-joined) tag -> all nodes registered under that tag.
+    using map_t       = std::map<std::string, nodes_vec_t>;
+
+    // Parser states used by Process().
+    enum
+    {
+        // Inside an opening tag: tokens are key=value attributes until a token starting with '>'.
+        DECL_STATE,
+        // Between tags: every token is expected to start with '<'.
+        BODY_STATE
+    };
+
+    /**
+     * @brief Parse a counter definition file together with every file it includes.
+     *
+     * The file is scanned for `#include "..."` lines first (PreProcess), each include that
+     * has not been handled yet is parsed recursively into the same node map, and only then
+     * is the body of this file parsed (Process).
+     *
+     * @param file_name Path of the file to parse; includes are resolved relative to its
+     *                  directory.
+     * @param obj       Parent parser when parsing an include (its node map and current level
+     *                  are shared), nullptr for a top-level file.
+     * @return The parser, or nullptr when the file or one of its includes cannot be opened.
+     */
+    static std::shared_ptr<Xml> Create(const std::string& file_name, const Xml* obj = nullptr)
+    {
+        auto xml = std::make_shared<Xml>(file_name, obj);
+
+        // A parser whose file could not be opened has no usable node map. Report the failure
+        // as nullptr so the null checks performed by callers (and by the include loop below)
+        // actually fire instead of handing out a half-initialised object.
+        if(!xml->Init()) return nullptr;
+
+        const std::size_t pos = file_name.rfind('/');
+        const std::string path =
+            (pos != std::string::npos) ? file_name.substr(0, pos + 1) : "";
+
+        xml->PreProcess();
+
+        // The node map is shared with nested parsers, so "top.include" may already hold
+        // includes handled by someone else; the "touch" attribute marks the ones taken.
+        nodes_t incl_nodes;
+        for(const auto& node : xml->GetNodes("top.include"))
+        {
+            if(node->opts.find("touch") == node->opts.end())
+            {
+                node->opts["touch"] = "";
+                incl_nodes.push_back(node);
+            }
+        }
+
+        bool includes_ok = true;
+        for(const auto& incl : incl_nodes)
+        {
+            const std::string incl_name = path + incl->opts["file"];
+            if(!Create(incl_name, xml.get()))
+            {
+                includes_ok = false;
+                break;
+            }
+        }
+
+        if(includes_ok) xml->Process();
+
+        // Nothing reads the file after parsing; release the descriptor on every path.
+        close(xml->fd_);
+        xml->fd_ = -1;
+
+        if(!includes_ok) xml.reset();
+        return xml;
+    }
+
+    /// @brief Path of the file this parser was created for (returned by value).
+    std::string GetName() const { return file_name_; }
+
+    // clang-tidy incorrectly marks these functions as being staticable. They are not.
+    // NOLINTBEGIN
+    /**
+     * @brief Register a synthetic node carrying a "name" and an "expr" attribute.
+     *
+     * @param full_tag Global (dot-joined) tag the node is stored under.
+     * @param name     Value of the node's "name" attribute.
+     * @param expr     Value of the node's "expr" attribute.
+     */
+    void AddExpr(const std::string& full_tag, const std::string& name, const std::string& expr)
+    {
+        // The node's own tag is whatever follows the last '.' of the global tag.
+        const std::size_t last_dot  = full_tag.rfind('.');
+        std::size_t       tag_begin = 0;
+        if(last_dot != std::string::npos) tag_begin = last_dot + 1;
+
+        auto entry          = std::make_shared<level_t>();
+        entry->tag          = full_tag.substr(tag_begin);
+        entry->opts["name"] = name;
+        entry->opts["expr"] = expr;
+        (*map_)[full_tag].push_back(entry);
+    }
+
+    /**
+     * @brief Register a synthetic node whose "expr" attribute is a decimal constant.
+     *
+     * @param full_tag Global (dot-joined) tag the node is stored under.
+     * @param name     Value of the node's "name" attribute.
+     * @param val      Constant, stored as its decimal text.
+     */
+    void AddConst(const std::string& full_tag, const std::string& name, const uint64_t& val)
+    {
+        // std::to_string produces the same decimal text as streaming the value and only
+        // needs <string>, which this header already includes.
+        AddExpr(full_tag, name, std::to_string(val));
+    }
+    // NOLINTEND
+
+    /**
+     * @brief Nodes registered under a global tag.
+     *
+     * @param global_tag Dot-joined tag, e.g. "top.include".
+     * @return Copy of the node list, empty when the tag is unknown. The lookup does not add
+     *         an entry for unknown tags, so querying never changes what GetAllNodes() sees.
+     */
+    nodes_t GetNodes(const std::string& global_tag)
+    {
+        const auto it = map_->find(global_tag);
+        return (it != map_->end()) ? it->second : nodes_t{};
+    }
+
+    /// @brief Every registered node list, keyed by global tag.
+    const map_t& GetAllNodes() { return (*map_); }
+
+    /**
+     * @brief Call `fun(global_tag, node)` on a copy of the given functor for every node.
+     *
+     * A false result from `fun` stops the nodes of the current tag only; iteration carries
+     * on with the next tag.
+     *
+     * @param f_i Functor to copy; it must provide `fun(const std::string&, node)`.
+     * @return The functor copy after the traversal.
+     */
+    template <class F>
+    F ForEach(const F& f_i)
+    {
+        F visitor = f_i;
+        if(map_)
+        {
+            for(auto& [global_tag, tagged_nodes] : *map_)
+            {
+                for(auto node : tagged_nodes)
+                {
+                    const bool keep_going = !(visitor.fun(global_tag, node) == false);
+                    if(!keep_going) break;
+                }
+            }
+        }
+        return visitor;
+    }
+
+    /**
+     * @brief Const overload: call `fun(global_tag, node)` on a functor copy for every node.
+     *
+     * A false result from `fun` stops the nodes of the current tag only; iteration carries
+     * on with the next tag.
+     *
+     * @param f_i Functor to copy; it must provide `fun(const std::string&, node)`.
+     * @return The functor copy after the traversal.
+     */
+    template <class F>
+    F ForEach(const F& f_i) const
+    {
+        F visitor = f_i;
+        if(map_)
+        {
+            for(auto& [global_tag, tagged_nodes] : *map_)
+            {
+                for(const auto& node : tagged_nodes)
+                {
+                    const bool keep_going = !(visitor.fun(global_tag, node) == false);
+                    if(!keep_going) break;
+                }
+            }
+        }
+        return visitor;
+    }
+
+    /// ForEach() functor that writes a node's global tag and all of its attributes to stdout.
+    struct print_func
+    {
+        /// @return Always true, so no node is skipped.
+        static bool fun(const std::string& global_tag, const std::shared_ptr<level_t>& node)
+        {
+            std::cout << global_tag << ":" << std::endl;
+            for(const auto& [key, value] : node->opts)
+            {
+                std::cout << global_tag << "." << key << " = " << value << std::endl;
+            }
+            return true;
+        }
+    };
+
+    /// @brief Dump the file name followed by every node and its attributes to stdout.
+    void Print() const
+    {
+        std::cout << "XML file '" << file_name_ << "':" << std::endl;
+        const print_func printer{};
+        ForEach(printer);
+    }
+
+    /**
+     * @brief Construct a parser for a file; the file is not opened here (see Init()).
+     *
+     * @param file_name Path of the file to parse.
+     * @param obj       Parent parser when this file is an include: its node map and current
+     *                  level are shared so included nodes land in the parent's tree. May be
+     *                  nullptr.
+     */
+    Xml(std::string file_name, const Xml* obj)
+    : file_name_(std::move(file_name))
+    , fd_(-1)  // no file is open until Init() succeeds
+    , state_(BODY_STATE)
+    {
+        if(obj != nullptr)
+        {
+            map_      = obj->map_;
+            level_    = obj->level_;
+            included_ = true;
+        }
+    }
+
+    /**
+     * @brief Drop the child links and copy pointers of every node owned by this parser.
+     *
+     * A parser created for an include shares its node map with the parent. It is destroyed
+     * before the parent parses its own body, so it must leave the shared tree intact;
+     * otherwise inheritance in the parent would see base nodes without children. The parent
+     * performs the cleanup when it is destroyed.
+     */
+    ~Xml()
+    {
+        if(included_) return;
+
+        for(auto& x : stack_)
+        {
+            x->nodes.clear();
+            x->copy.reset();
+        }
+        if(!map_) return;
+        for(auto& [_, nodes] : *map_)
+        {
+            for(auto& node : nodes)
+            {
+                node->nodes.clear();
+                node->copy.reset();
+            }
+        }
+    }
+
+private:
+    /**
+     * @brief Open the file read-only and, for a top-level parser, create the node map with
+     *        its root level "top".
+     *
+     * @return false when the file cannot be opened, true otherwise.
+     */
+    bool Init()
+    {
+        fd_ = open(file_name_.c_str(), O_RDONLY);
+        if(fd_ == -1) return false;
+
+        // A parser created for an include already received the map from its parent.
+        if(!map_)
+        {
+            map_ = std::make_shared<map_t>();
+            AddLevel("top");
+        }
+        return true;
+    }
+
+    /**
+     * @brief Scan the file for `#include "file"` lines and register each as an "include"
+     *        node with a "file" attribute, then rewind the file.
+     *
+     * The file is read in chunks of kBufSize bytes. An include is recognised only when a
+     * chunk starts with it; after one is consumed the file position is moved to the start
+     * of the following line, so consecutive includes at the top of a file are all found.
+     * Aborts when an include line has no closing quote.
+     */
+    void PreProcess()
+    {
+        // One spare byte so a chunk can be NUL-terminated without overwriting data.
+        char buf[kBufSize + 1];
+        bool error = false;
+
+        while(true)
+        {
+            const off_t   pos   = lseek(fd_, 0, SEEK_CUR);
+            const ssize_t nread = read(fd_, buf, kBufSize);
+            // read() reports errors as -1; keep the result signed so that an error is not
+            // mistaken for a huge chunk length.
+            if(nread <= 0) break;
+
+            std::size_t size = static_cast<std::size_t>(nread);
+            buf[size]        = '\0';
+
+            if(strncmp(buf, "#include \"", 10) == 0)
+            {
+                // Cut the chunk at the end of the include line and continue reading right
+                // after it.
+                std::size_t ind = 0;
+                for(ind = 0; (ind < size) && (buf[ind] != '\n'); ++ind)
+                {}
+                if(ind < size)
+                {
+                    buf[ind] = '\0';
+                    size     = ind;
+                    lseek(fd_, pos + static_cast<off_t>(ind) + 1, SEEK_SET);
+                }
+
+                // The file name ends at the closing quote.
+                for(ind = 10; (ind < size) && (buf[ind] != '"'); ++ind)
+                {}
+                if(ind == size)
+                {
+                    error = true;
+                    break;
+                }
+                buf[ind] = '\0';
+
+                AddLevel("include");
+                AddOption("file", &buf[10]);
+                UpLevel();
+            }
+        }
+
+        if(error)
+        {
+            fprintf(stderr, "XML PreProcess failed, line '%s'\n", buf);
+            abort();
+        }
+
+        lseek(fd_, 0, SEEK_SET);
+    }
+
+    /**
+     * @brief Parse the body of the file token by token and build the node tree.
+     *
+     * In BODY_STATE a token must start with '<': `<tag` opens a node (switching to
+     * DECL_STATE when the '>' is not part of the same token) and `</tag` closes the current
+     * node after applying its "base" inheritance. In DECL_STATE tokens are `key=value`
+     * attributes until a token starting with '>' is met. Text following a '>' inside a token
+     * is carried over and handled as the next token. Malformed input aborts via BadFormat().
+     */
+    void Process()
+    {
+        token_t remainder;
+
+        while(true)
+        {
+            token_t token = (!remainder.empty()) ? remainder : NextToken();
+            remainder.clear();
+
+            // token_t token1 = token;
+            // token1.push_back('\0');
+            // std::cout << ">>> " << &token1[0] << std::endl;
+
+            // End of file
+            if(token.empty()) break;
+
+            switch(state_)
+            {
+                case BODY_STATE:
+                    if(token[0] == '<')
+                    {
+                        bool     node_begin = true;
+                        unsigned ind        = 1;
+                        // A lone "<" token has no second character to look at.
+                        if((token.size() > 1) && (token[1] == '/'))
+                        {
+                            node_begin = false;
+                            ++ind;
+                        }
+
+                        // Find the end of the tag name; anything after '>' is re-parsed as
+                        // the next token.
+                        unsigned i = ind;
+                        while(i < token.size())
+                        {
+                            if(token[i] == '>') break;
+                            ++i;
+                        }
+                        for(unsigned j = i + 1; j < token.size(); ++j)
+                            remainder.push_back(token[j]);
+
+                        if(i == token.size())
+                        {
+                            // No '>' in this token: attributes follow for an opening tag,
+                            // while a closing tag must be complete.
+                            if(node_begin)
+                                state_ = DECL_STATE;
+                            else
+                                BadFormat(token);
+                            token.push_back('\0');
+                        }
+                        else
+                        {
+                            token[i] = '\0';
+                        }
+
+                        const char* tag = &token[ind];
+                        if(node_begin)
+                        {
+                            AddLevel(tag);
+                        }
+                        else
+                        {
+                            Inherit(GetOption("base"));
+
+                            // Only the first strlen(tag) characters of the current level's
+                            // tag are compared with the closing tag.
+                            if(strncmp(CurrentLevel().c_str(), tag, strlen(tag)) != 0)
+                            {
+                                token.back() = '>';
+                                BadFormat(token);
+                            }
+                            UpLevel();
+                        }
+                    }
+                    else
+                    {
+                        BadFormat(token);
+                    }
+                    break;
+                case DECL_STATE:
+                    if(token[0] == '>')
+                    {
+                        state_ = BODY_STATE;
+                        for(unsigned j = 1; j < token.size(); ++j)
+                            remainder.push_back(token[j]);
+                        continue;
+                    }
+                    else
+                    {
+                        // Split "key=value" at the first '='.
+                        token.push_back('\0');
+                        unsigned j = 0;
+                        for(j = 0; j < token.size(); ++j)
+                            if(token[j] == '=') break;
+                        if(j == token.size()) BadFormat(token);
+                        token[j]                = '\0';
+                        const std::string key   = token.data();
+                        const std::string value = &token[j + 1];
+                        AddOption(key, value);
+                    }
+                    break;
+                default:
+                    std::cout << "XML parser error: wrong state: " << state_ << std::endl;
+                    abort();
+            }
+        }
+    }
+
+    /// @return true when the character at the read position is a blank or a tab.
+    bool SpaceCheck() const
+    {
+        const char current = buffer_[index_];
+        return (current == ' ') || (current == '\t');
+    }
+
+    /**
+     * @brief Classify the character at the read position as line end / comment text.
+     *
+     * A newline is overwritten with a blank, bumps the line counter and ends any running
+     * comment. A '#' starts a comment, and every character inside a comment is reported as
+     * well.
+     *
+     * @return true for a newline, a '#', or any character inside a comment.
+     */
+    bool LineEndCheck()
+    {
+        char& current = buffer_[index_];
+        if(current == '\n')
+        {
+            current = ' ';
+            ++file_line_;
+            comment_ = false;
+            return true;
+        }
+        if(comment_ || (current == '#'))
+        {
+            comment_ = true;
+            return true;
+        }
+        return false;
+    }
+
+    /**
+     * @brief Read the next whitespace-delimited token from the file.
+     *
+     * Blanks, tabs, newlines and `#` comments separate tokens. Inside double quotes
+     * separators are kept; the quotes themselves are dropped. A backslash is dropped and
+     * makes a directly following backslash or quote literal. The file is read in chunks of
+     * kBufSize bytes and a token may span chunks.
+     *
+     * @return The token characters; empty at end of file (or when reading fails).
+     */
+    token_t NextToken()
+    {
+        token_t token;
+        bool    in_string    = false;
+        bool    special_symb = false;
+
+        while(true)
+        {
+            if(data_size_ == 0)
+            {
+                // read() reports errors as -1, which must be tested before the value is
+                // stored in the unsigned data_size_.
+                const ssize_t nread = read(fd_, buffer_, kBufSize);
+                if(nread <= 0) break;
+                data_size_ = static_cast<unsigned>(nread);
+            }
+
+            // Skip separators in front of the token.
+            if(token.empty())
+            {
+                while((index_ < data_size_) && (SpaceCheck() || LineEndCheck()))
+                {
+                    ++index_;
+                }
+            }
+            while((index_ < data_size_) && (in_string || !(SpaceCheck() || LineEndCheck())))
+            {
+                const char symb      = buffer_[index_];
+                bool       skip_symb = false;
+
+                switch(symb)
+                {
+                    case '\\':
+                        if(special_symb)
+                        {
+                            special_symb = false;
+                        }
+                        else
+                        {
+                            special_symb = true;
+                            skip_symb    = true;
+                        }
+                        break;
+                    case '"':
+                        if(special_symb)
+                        {
+                            special_symb = false;
+                        }
+                        else
+                        {
+                            in_string = !in_string;
+                            if(!in_string)
+                            {
+                                // Turn the closing quote into a blank and revisit it, so
+                                // that it terminates the token.
+                                buffer_[index_] = ' ';
+                                --index_;
+                            }
+                            skip_symb = true;
+                        }
+                        break;
+                }
+
+                if(!skip_symb) token.push_back(symb);
+                ++index_;
+            }
+
+            if(index_ == data_size_)
+            {
+                // Chunk exhausted: the token may continue in the next one.
+                index_     = 0;
+                data_size_ = 0;
+            }
+            else
+            {
+                if(special_symb || in_string) BadFormat(token);
+                break;
+            }
+        }
+
+        return token;
+    }
+
+    /**
+     * @brief Report a malformed token with file name and line counter on stdout, then abort.
+     *
+     * @param token Offending token; taken by value because it is NUL-terminated for printing.
+     */
+    void BadFormat(token_t token)
+    {
+        token.push_back('\0');
+        const char* text = token.data();
+        std::cout << "Error: " << file_name_ << ", line " << file_line_ << ", bad XML token '"
+                  << text << "'" << std::endl;
+        abort();
+    }
+
+    /**
+     * @brief Open a new node below the current one and make it the current level.
+     *
+     * The previous current level (if any) becomes the parent and is pushed on the stack; the
+     * new node is registered in the map under its global tag.
+     *
+     * @param tag Local tag of the new node.
+     */
+    void AddLevel(const std::string& tag)
+    {
+        auto fresh = std::make_shared<level_t>();
+        fresh->tag = tag;
+
+        const bool has_parent = (level_ != nullptr);
+        if(has_parent)
+        {
+            level_->nodes.push_back(fresh);
+            stack_.push_back(level_);
+        }
+        level_ = fresh;
+
+        // The global tag is built from the stack, so the parent has to be pushed first.
+        (*map_)[GlobalTag(tag)].push_back(level_);
+    }
+
+    /**
+     * @brief Close the current node: its parent, taken from the stack, becomes current again.
+     *
+     * Aborts with a diagnostic when there is no parent to return to (a closing tag for the
+     * root level), instead of reading from an empty stack.
+     */
+    void UpLevel()
+    {
+        if(stack_.empty())
+        {
+            fprintf(stderr,
+                    "XML parser error: %s, line %u, closing a level that has no parent\n",
+                    file_name_.c_str(),
+                    file_line_);
+            abort();
+        }
+        level_ = stack_.back();
+        stack_.pop_back();
+    }
+
+    /**
+     * @brief Copy the attributes of `from`, and recursively its children, into `to`.
+     *
+     * @param from Node to copy from.
+     * @param to   Destination node; when nullptr a new node with the tag of `from` is opened
+     *             below the current level, filled, and closed again before returning.
+     */
+    void Copy(const std::shared_ptr<level_t>& from, const std::shared_ptr<level_t>& to)
+    {
+        auto level = to;
+        if(level == nullptr)
+        {
+            AddLevel(from->tag);
+            level = level_;
+        }
+        // Remember the origin and take over its attributes; this replaces whatever
+        // attributes the destination had before.
+        level->copy = from;
+        level->opts = from->opts;
+
+        for(const auto& node : from->nodes)
+        {
+            bool              found      = false;
+            const std::string name       = GetOption("name", node);
+            const std::string global_tag = GlobalTag(level->tag) + "." + node->tag;
+            // A child is skipped when the destination already has a node of that tag with
+            // the same "name" attribute, or one that was copied from this very child.
+            for(const auto& item : (*map_)[global_tag])
+            {
+                if((name == GetOption("name", item)) || (node == item->copy))
+                {
+                    found = true;
+                    break;
+                }
+            }
+            if(found == false) Copy(node, nullptr);
+        }
+
+        // Close the node opened above.
+        if(to == nullptr) UpLevel();
+    }
+
+    /**
+     * @brief Copy every node registered under a sibling tag into the current node.
+     *
+     * @param tag Local tag of the base node(s), resolved relative to the current parent; an
+     *            empty tag means there is nothing to inherit. Aborts when no node is
+     *            registered under the resulting global tag.
+     */
+    void Inherit(const std::string& tag)
+    {
+        if(tag.empty()) return;
+
+        const std::string base_tag = GlobalTag(tag);
+        const auto        base     = map_->find(base_tag);
+        if(base == map_->end())
+        {
+            fprintf(stderr,
+                    "Node \"%s\": Base not found \"%s\"\n",
+                    level_->tag.c_str(),
+                    tag.c_str());
+            abort();
+        }
+
+        for(const auto& base_node : base->second)
+        {
+            Copy(base_node, level_);
+        }
+    }
+
+    /// @return Local tag of the node currently being parsed.
+    std::string CurrentLevel() const
+    {
+        const level_t& current = *level_;
+        return current.tag;
+    }
+
+    /**
+     * @brief Build the dot-joined global tag of a node below the current parent chain.
+     *
+     * @param tag Local tag to append.
+     * @return The tags of all levels on the stack followed by `tag`, separated by '.'.
+     */
+    std::string GlobalTag(const std::string& tag) const
+    {
+        std::string path;
+        for(const auto& ancestor : stack_)
+        {
+            path.append(ancestor->tag);
+            path.push_back('.');
+        }
+        path.append(tag);
+        return path;
+    }
+
+    /// @brief Set (or overwrite) an attribute of the node currently being parsed.
+    void AddOption(const std::string& key, const std::string& value)
+    {
+        opts_t& current_opts = level_->opts;
+        current_opts[key]    = value;
+    }
+    /**
+     * @brief Look up an attribute of a node.
+     *
+     * @param key   Attribute name.
+     * @param level Node to query; the node currently being parsed when nullptr.
+     * @return The attribute value, or an empty string when the node has no such attribute.
+     */
+    std::string GetOption(const std::string& key, std::shared_ptr<const level_t> level = nullptr)
+    {
+        if(level == nullptr) level = level_;
+
+        const auto found = level->opts.find(key);
+        if(found == level->opts.end()) return "";
+        return found->second;
+    }
+
+    // Path of the file being parsed.
+    const std::string file_name_;
+    // Number of newlines consumed so far (see LineEndCheck); used in error messages.
+    unsigned          file_line_{0};
+    // Descriptor of the file, assigned by Init().
+    int               fd_;
+
+    // Size of one read chunk.
+    static const size_t kBufSize = 256;
+    // Current chunk of file data used by NextToken().
+    char                buffer_[kBufSize];
+
+    // Number of valid bytes in buffer_; 0 means the next chunk has to be read.
+    unsigned                              data_size_{0};
+    // Read position inside buffer_.
+    unsigned                              index_{0};
+    // Parser state, DECL_STATE or BODY_STATE.
+    unsigned                              state_{0};
+    // True while the read position is inside a '#' comment.
+    bool                                  comment_{false};
+    // Ancestors of the current level, outermost first.
+    std::vector<std::shared_ptr<level_t>> stack_;
+    // True when this parser was constructed from a parent parser and shares its map_/level_.
+    bool                                  included_{false};
+    // Node currently being parsed.
+    std::shared_ptr<level_t>              level_;
+    // Global tag -> nodes; shared between a parser and the parsers of its includes.
+    std::shared_ptr<map_t>                map_;
+};
+
+}  // namespace xml
@@ -30,6 +30,7 @@ target_sources(rocprofiler-object-library PRIVATE ${ROCPROFILER_LIB_SOURCES}

 add_subdirectory(hsa)
 add_subdirectory(context)
+add_subdirectory(counters)

 target_link_libraries(
    rocprofiler-object-library
@@ -96,9 +97,7 @@ target_link_libraries(
    rocprofiler-static-library
    PUBLIC rocprofiler::rocprofiler-headers rocprofiler::rocprofiler-hsa-runtime
           rocprofiler::rocprofiler-hip
-    PRIVATE rocprofiler::rocprofiler-build-flags rocprofiler::rocprofiler-memcheck
-            rocprofiler::rocprofiler-common-library rocprofiler::rocprofiler-stdcxxfs
-            rocprofiler::rocprofiler-dl rocprofiler::rocprofiler-amd-comgr)
+    PRIVATE rocprofiler::rocprofiler-common-library)

 set_target_properties(
    rocprofiler-static-library PROPERTIES OUTPUT_NAME rocprofiler64 DEFINE_SYMBOL
@@ -0,0 +1,11 @@
+# Sources and headers of the counters component.
+set(ROCPROFILER_LIB_COUNTERS_SOURCES metrics.cpp)
+set(ROCPROFILER_LIB_COUNTERS_HEADERS metrics.hpp)
+
+# They are compiled as part of the rocprofiler object library.
+target_sources(rocprofiler-object-library PRIVATE ${ROCPROFILER_LIB_COUNTERS_SOURCES}
+                                                  ${ROCPROFILER_LIB_COUNTERS_HEADERS})
+
+add_subdirectory(xml)
+
+# Unit tests are only configured when tests are enabled for the build.
+if(ROCPROFILER_BUILD_TESTS)
+    add_subdirectory(tests)
+endif()
@@ -0,0 +1,118 @@
+/******************************************************************************
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*******************************************************************************/
+
+#include "metrics.hpp"
+
+#include <dlfcn.h>  // for dladdr
+#include <cstdlib>
+#include <filesystem>
+#include <optional>
+
+#include "glog/logging.h"
+#include "lib/common/xml.hpp"
+#include "rocprofiler/rocprofiler.h"
+
+namespace counters
+{
+namespace
+{
+/**
+ * @brief Load a counter definition file and group its metrics by architecture.
+ *
+ * @param filename Path of the XML counter configuration.
+ * @return Map from architecture name (e.g. "gfx908") to the metrics defined for it.
+ *         Logs a fatal error when the file cannot be parsed.
+ */
+MetricMap
+loadXml(const std::string& filename)
+{
+    MetricMap ret;
+    DLOG(INFO) << "Loading Counter Config: " << filename;
+    // todo: return unique_ptr....
+    auto xml = xml::Xml::Create(filename);
+    LOG_IF(FATAL, !xml)
+        << "Could not open XML Counter Config File (set env ROCPROFILER_METRICS_PATH)";
+
+    const std::string prefix = "top.";
+    const std::string suffix = ".metric";
+
+    for(const auto& [gfx_name, nodes] : xml->GetAllNodes())
+    {
+        /**
+         * "top." is used to designate the root encapsulation of all contained XML subroots (in our
+         * case "gfxX"). This is inserted by the parser so it will always be present. .metric
+         * denotes XML tags that are contained in the subroots. This will not change unless we
+         * respec the XML (which we should...).
+         *
+         * Only keys of the exact form "top.<arch>.metric" with a non-empty <arch> are used:
+         * the architecture name is cut out by position, so a key that merely contains both
+         * substrings would produce a garbage name.
+         */
+        if(gfx_name.size() <= prefix.size() + suffix.size() ||
+           gfx_name.compare(0, prefix.size(), prefix) != 0 ||
+           gfx_name.compare(gfx_name.size() - suffix.size(), suffix.size(), suffix) != 0)
+            continue;
+
+        auto& metricVec =
+            ret.emplace(gfx_name.substr(prefix.size(),
+                                        gfx_name.size() - prefix.size() - suffix.size()),
+                        std::vector<Metric>())
+                .first->second;
+        for(const auto& node : nodes)
+        {
+            metricVec.emplace_back(node->opts["name"],
+                                   node->opts["block"],
+                                   node->opts["event"],
+                                   node->opts["descr"],
+                                   node->opts["expr"]);
+        }
+    }
+
+    DLOG(INFO) << fmt::format("{}", ret);
+    return ret;
+}
+
+/**
+ * @brief Locate a counter file relative to the shared object that provides
+ *        rocprofiler_query_available_agents.
+ *
+ * @param filename File name to look for.
+ * @return "<directory of that object>/../lib/<filename>", or `filename` unchanged when the
+ *         object cannot be determined.
+ */
+std::string
+findViaInstallPath(const std::string& filename)
+{
+    DLOG(INFO) << filename << " is being looked up via install path";
+
+    Dl_info     dl_info;
+    const void* symbol = reinterpret_cast<const void*>(rocprofiler_query_available_agents);
+    if(dladdr(symbol, &dl_info) == 0) return filename;
+
+    std::filesystem::path lib_dir{dl_info.dli_fname};
+    lib_dir.remove_filename();
+    return lib_dir / fmt::format("../lib/{}", filename);
+}
+
+/**
+ * @brief Resolve a counter file name, preferring the directory named by the environment
+ *        variable ROCPROFILER_METRICS_PATH.
+ *
+ * @param filename File name to look for.
+ * @return "<ROCPROFILER_METRICS_PATH>/<filename>" when the variable is set, otherwise the
+ *         result of the install-path lookup.
+ */
+std::string
+findViaEnvironment(const std::string& filename)
+{
+    const char* metrics_dir = getenv("ROCPROFILER_METRICS_PATH");
+    if(metrics_dir == nullptr)
+    {
+        // No environment variable, lookup via install path
+        return findViaInstallPath(filename);
+    }
+
+    DLOG(INFO) << filename << " is being looked up via env variable ROCPROFILER_METRICS_PATH";
+    return std::filesystem::path{std::string(metrics_dir)} / filename;
+}
+
+}  // namespace
+
+/// @brief Load the metrics defined in "derived_counters.xml", grouped by architecture.
+MetricMap
+getDerrivedHardwareMetrics()
+{
+    const std::string config_path = findViaEnvironment("derived_counters.xml");
+    return loadXml(config_path);
+}
+
+/// @brief Load the metrics defined in "basic_counters.xml", grouped by architecture.
+MetricMap
+getBaseHardwareMetrics()
+{
+    const std::string config_path = findViaEnvironment("basic_counters.xml");
+    return loadXml(config_path);
+}
+
+};  // namespace counters
@@ -0,0 +1,101 @@
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <hsa/hsa_ven_amd_aqlprofile.h>
+
+#include "fmt/core.h"
+#include "fmt/ranges.h"
+
+namespace counters
+{
+/**
+ * One counter definition (without instance information): a name, the hardware block and
+ * event it refers to, a description and an expression. All fields are plain strings and are
+ * stored exactly as passed in.
+ */
+class Metric
+{
+public:
+    /**
+     * @param name  Counter name.
+     * @param block Hardware block.
+     * @param event Event identifier within the block.
+     * @param dsc   Human readable description.
+     * @param expr  Expression; may be empty.
+     */
+    Metric(std::string name,
+           std::string block,
+           std::string event,
+           std::string dsc,
+           std::string expr)
+    : name_{std::move(name)}
+    , block_{std::move(block)}
+    , event_{std::move(event)}
+    , description_{std::move(dsc)}
+    , expression_{std::move(expr)}
+    {}
+
+    /// @return Counter name.
+    const std::string& name() const { return name_; }
+    /// @return Hardware block.
+    const std::string& block() const { return block_; }
+    /// @return Event identifier.
+    const std::string& event() const { return event_; }
+    /// @return Description text.
+    const std::string& description() const { return description_; }
+    /// @return Expression; empty when none was given.
+    const std::string& expression() const { return expression_; }
+
+private:
+    std::string name_;
+    std::string block_;
+    std::string event_;
+    std::string description_;
+    std::string expression_;
+};
+
+using MetricMap = std::unordered_map<std::string, std::vector<Metric>>;
+
+MetricMap
+getBaseHardwareMetrics();
+
+MetricMap
+getDerrivedHardwareMetrics();
+
+}  // namespace counters
+
+namespace fmt
+{
+// fmt::format support for metric
+/// Formats a metric as "Metric: <name> [Block: ..., Event: ..., Expression: ..., Description: ...]";
+/// an empty expression is shown as "<None>".
+template <>
+struct formatter<counters::Metric>
+{
+    /// No format specification is supported; the context is returned untouched.
+    template <typename ParseContext>
+    constexpr auto parse(ParseContext& ctx)
+    {
+        return ctx.begin();
+    }
+
+    template <typename Ctx>
+    auto format(counters::Metric const& metric, Ctx& ctx) const
+    {
+        const std::string shown_expr =
+            metric.expression().empty() ? std::string{"<None>"} : metric.expression();
+        return fmt::format_to(ctx.out(),
+                              "Metric: {} [Block: {}, Event: {}, Expression: {}, Description: {}]",
+                              metric.name(),
+                              metric.block(),
+                              metric.event(),
+                              shown_expr,
+                              metric.description());
+    }
+};
+
+// fmt::format support for MetricMap
+/// Formats a metric map as one "Counters for <arch>" section per architecture, listing the
+/// metrics of that architecture one per line.
+template <>
+struct formatter<counters::MetricMap>
+{
+    /// No format specification is supported; the context is returned untouched.
+    template <typename ParseContext>
+    constexpr auto parse(ParseContext& ctx)
+    {
+        return ctx.begin();
+    }
+
+    template <typename Ctx>
+    auto format(counters::MetricMap const& map, Ctx& ctx) const
+    {
+        std::string text;
+        for(const auto& [gfxName, metrics] : map)
+        {
+            text.append(
+                fmt::format("Counters for {}\n\t{}\n", gfxName, fmt::join(metrics, "\n\t")));
+        }
+        return fmt::format_to(ctx.out(), "{}", text);
+    }
+};
+}  // namespace fmt
@@ -0,0 +1,24 @@
+# clang-tidy is switched off for the test sources.
+rocprofiler_deactivate_clang_tidy()
+
+include(GoogleTest)
+
+set(ROCPROFILER_LIB_COUNTER_TEST_SOURCES "metrics_test.cpp")
+
+add_executable(counter-test)
+
+# The test executable is built from the test sources plus the objects of the rocprofiler
+# object library.
+target_sources(
+    counter-test PRIVATE ${ROCPROFILER_LIB_COUNTER_TEST_SOURCES}
+                         $<TARGET_OBJECTS:rocprofiler::rocprofiler-object-library>)
+
+target_link_libraries(
+    counter-test
+    PRIVATE rocprofiler::rocprofiler-hip rocprofiler::rocprofiler-common-library
+            rocprofiler::rocprofiler-object-library GTest::gtest GTest::gtest_main)
+
+# Register every test found in the sources; the names of the created tests are stored in
+# counter-test_TESTS.
+gtest_add_tests(
+    TARGET counter-test
+    SOURCES ${ROCPROFILER_LIB_COUNTER_TEST_SOURCES}
+    TEST_LIST counter-test_TESTS
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+
+# The variable expanded here must be the TEST_LIST name used above, otherwise the timeout and
+# label are applied to no test at all.
+set_tests_properties(${counter-test_TESTS} PROPERTIES TIMEOUT 45 LABELS "unittests")
@@ -0,0 +1,42 @@
+#include "metrics_test.h"
+
+#include <gtest/gtest.h>
+
+#include "lib/rocprofiler/counters/metrics.hpp"
+
+namespace
+{
+/**
+ * @brief Convert a fixture table into the map type returned by the metric loaders.
+ *
+ * @param map Architecture name -> rows of {name, block, event, expression, description}.
+ *            Taken by const reference so the fixture is not copied on every call.
+ * @return Architecture name -> metrics built from the rows.
+ */
+auto
+loadTestData(const std::unordered_map<std::string, std::vector<std::vector<std::string>>>& map)
+{
+    std::unordered_map<std::string, std::vector<counters::Metric>> ret;
+    for(const auto& [gfx, dataMap] : map)
+    {
+        auto& metric_vec = ret.emplace(gfx, std::vector<counters::Metric>{}).first->second;
+        metric_vec.reserve(dataMap.size());
+        for(const auto& data_vec : dataMap)
+        {
+            // Rows store the expression before the description, while the Metric
+            // constructor takes the description first; hence index 4 before index 3.
+            metric_vec.emplace_back(
+                data_vec.at(0), data_vec.at(1), data_vec.at(2), data_vec.at(4), data_vec.at(3));
+        }
+    }
+    return ret;
+}
+}  // namespace
+
+// The base metrics loaded for gfx908 must format to the same text as the expected fixture.
+TEST(MetricsTest, BaseMetricLoad)
+{
+    auto loaded   = counters::getBaseHardwareMetrics();
+    auto expected = loadTestData(basic_gfx908);
+    ASSERT_EQ(loaded.count("gfx908"), 1);
+    ASSERT_EQ(expected.count("gfx908"), 1);
+
+    const auto loaded_text   = fmt::format("{}", loaded["gfx908"]);
+    const auto expected_text = fmt::format("{}", expected["gfx908"]);
+    EXPECT_EQ(loaded_text, expected_text);
+}
+
+// The derived metrics loaded for gfx908 must format to the same text as the expected fixture.
+TEST(MetricsTest, DerrivedMetricLoad)
+{
+    auto loaded   = counters::getDerrivedHardwareMetrics();
+    auto expected = loadTestData(derrived_gfx908);
+    ASSERT_EQ(loaded.count("gfx908"), 1);
+    ASSERT_EQ(expected.count("gfx908"), 1);
+
+    const auto loaded_text   = fmt::format("{}", loaded["gfx908"]);
+    const auto expected_text = fmt::format("{}", expected["gfx908"]);
+    EXPECT_EQ(loaded_text, expected_text);
+}
@@ -0,0 +1,323 @@
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+// Expected basic (hardware) metrics for gfx908, chosen because it is not the first arch in the
+// XML and because it inherits values from gfx9. Each row is five strings laid out as
+// {name, block, event, expression, description}; every row here uses "<None>" as its expression.
+static const std::unordered_map<std::string, std::vector<std::vector<std::string>>> basic_gfx908 = {
+    {"gfx908",
+     {{"SQ_INSTS_VMEM_WR",
+       "SQ",
+       "28",
+       "<None>",
+       "Number of VMEM write instructions issued (including FLAT). (per-simd, emulated)"},
+      {"SQ_INSTS_VMEM_RD",
+       "SQ",
+       "29",
+       "<None>",
+       "Number of VMEM read instructions issued (including FLAT). (per-simd, emulated)"},
+      {"SQ_INSTS_SALU",
+       "SQ",
+       "31",
+       "<None>",
+       "Number of SALU instructions issued. (per-simd, emulated)"},
+      {"SQ_INSTS_SMEM",
+       "SQ",
+       "32",
+       "<None>",
+       "Number of SMEM instructions issued. (per-simd, emulated)"},
+      {"SQ_INSTS_FLAT",
+       "SQ",
+       "33",
+       "<None>",
+       "Number of FLAT instructions issued. (per-simd, emulated)"},
+      {"SQ_INSTS_FLAT_LDS_ONLY",
+       "SQ",
+       "34",
+       "<None>",
+       "Number of FLAT instructions issued that read/wrote only from/to LDS (only works if "
+       "EARLY_TA_DONE is enabled). (per-simd, emulated)"},
+      {"SQ_INSTS_LDS",
+       "SQ",
+       "35",
+       "<None>",
+       "Number of LDS instructions issued (including FLAT). (per-simd, emulated)"},
+      {"SQ_INSTS_GDS",
+       "SQ",
+       "36",
+       "<None>",
+       "Number of GDS instructions issued. (per-simd, emulated)"},
+      {"SQ_WAIT_INST_LDS",
+       "SQ",
+       "64",
+       "<None>",
+       "Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. "
+       "(per-simd, nondeterministic)"},
+      {"SQ_ACTIVE_INST_VALU",
+       "SQ",
+       "72",
+       "<None>",
+       "regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. "
+       "(per-simd, nondeterministic). Units in quad-cycles(4 cycles)"},
+      {"SQ_INST_CYCLES_SALU",
+       "SQ",
+       "85",
+       "<None>",
+       "Number of cycles needed to execute non-memory read scalar operations. (per-simd, "
+       "emulated)"},
+      {"SQ_THREAD_CYCLES_VALU",
+       "SQ",
+       "86",
+       "<None>",
+       "Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but "
+       "multiplied by # of active threads). (per-simd)"},
+      {"SQ_LDS_BANK_CONFLICT",
+       "SQ",
+       "94",
+       "<None>",
+       "Number of cycles LDS is stalled by bank conflicts. (emulated)"},
+      {"TCC_HIT", "TCC", "17", "<None>", "Number of cache hits."},
+      {"TCC_MISS", "TCC", "19", "<None>", "Number of cache misses. UC reads count as misses."},
+      {"TCC_EA_WRREQ",
+       "TCC",
+       "26",
+       "<None>",
+       "Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. "
+       "Atomics may travel over the same interface and are generally classified as write requests. "
+       "This does not include probe commands."},
+      {"TCC_EA_WRREQ_64B",
+       "TCC",
+       "27",
+       "<None>",
+       "Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq "
+       "interface."},
+      {"TCC_EA_WRREQ_STALL",
+       "TCC",
+       "30",
+       "<None>",
+       "Number of cycles a write request was stalled."},
+      {"TCC_EA_RDREQ",
+       "TCC",
+       "38",
+       "<None>",
+       "Number of TCC/EA read requests (either 32-byte or 64-byte)"},
+      {"TCC_EA_RDREQ_32B", "TCC", "39", "<None>", "Number of 32-byte TCC/EA read requests"},
+      {"GRBM_COUNT", "GRBM", "0", "<None>", "Tie High - Count Number of Clocks"},
+      {"GRBM_GUI_ACTIVE", "GRBM", "2", "<None>", "The GUI is Active"},
+      {"SQ_WAVES",
+       "SQ",
+       "4",
+       "<None>",
+       "Count number of waves sent to SQs. (per-simd, emulated, global)"},
+      {"SQ_INSTS_VALU",
+       "SQ",
+       "26",
+       "<None>",
+       "Number of VALU instructions issued. (per-simd, emulated)"},
+      {"TA_TA_BUSY",
+       "TA",
+       "15",
+       "<None>",
+       "TA block is busy. Perf_Windowing not supported for this counter."},
+      {"TA_FLAT_READ_WAVEFRONTS",
+       "TA",
+       "101",
+       "<None>",
+       "Number of flat opcode reads processed by the TA."},
+      {"TA_FLAT_WRITE_WAVEFRONTS",
+       "TA",
+       "102",
+       "<None>",
+       "Number of flat opcode writes processed by the TA."},
+      {"TCP_TCP_TA_DATA_STALL_CYCLES",
+       "TCP",
+       "6",
+       "<None>",
+       "TCP stalls TA data interface. Now Windowed."}}}};
+
+// Expected derived metrics for gfx908. Each row is five strings laid out as
+// {name, block, event, expression, description}; block and event are empty strings in every row.
+// Percent signs are written as a plain '%' because "\%" is not a standard C++ escape sequence.
+static const std::unordered_map<std::string, std::vector<std::vector<std::string>>>
+    derrived_gfx908 = {
+        {"gfx908",
+         {{"TCC_HIT_sum",
+           "",
+           "",
+           "sum(TCC_HIT,32)",
+           "Number of cache hits. Sum over TCC instances."},
+          {"TCC_MISS_sum",
+           "",
+           "",
+           "sum(TCC_MISS,32)",
+           "Number of cache misses. Sum over TCC instances."},
+          {"TCC_EA_RDREQ_32B_sum",
+           "",
+           "",
+           "sum(TCC_EA_RDREQ_32B,32)",
+           "Number of 32-byte TCC/EA read requests. Sum over TCC instances."},
+          {"TCC_EA_RDREQ_sum",
+           "",
+           "",
+           "sum(TCC_EA_RDREQ,32)",
+           "Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances."},
+          {"TCC_EA_WRREQ_sum",
+           "",
+           "",
+           "sum(TCC_EA_WRREQ,32)",
+           "Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq "
+           "interface. Sum over TCC instances."},
+          {"TCC_EA_WRREQ_64B_sum",
+           "",
+           "",
+           "sum(TCC_EA_WRREQ_64B,32)",
+           "Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq "
+           "interface. Sum over TCC instances."},
+          {"TCC_WRREQ_STALL_max",
+           "",
+           "",
+           "max(TCC_EA_WRREQ_STALL,32)",
+           "Number of cycles a write request was stalled. Max over TCC instances."},
+          {"CU_UTILIZATION",
+           "",
+           "",
+           "GRBM_GUI_ACTIVE/GRBM_COUNT",
+           "The total number of active cycles divided by total number of elapsed cycles"},
+          {"KERNEL_DURATION", "", "", "1", "The duration of the kernel dispatch"},
+          {"TA_BUSY_avr",
+           "",
+           "",
+           "avr(TA_TA_BUSY,16)",
+           "TA block is busy. Average over TA instances."},
+          {"TA_BUSY_max", "", "", "max(TA_TA_BUSY,16)", "TA block is busy. Max over TA instances."},
+          {"TA_BUSY_min", "", "", "min(TA_TA_BUSY,16)", "TA block is busy. Min over TA instances."},
+          {"TA_FLAT_READ_WAVEFRONTS_sum",
+           "",
+           "",
+           "sum(TA_FLAT_READ_WAVEFRONTS,16)",
+           "Number of flat opcode reads processed by the TA. Sum over TA instances."},
+          {"TA_FLAT_WRITE_WAVEFRONTS_sum",
+           "",
+           "",
+           "sum(TA_FLAT_WRITE_WAVEFRONTS,16)",
+           "Number of flat opcode writes processed by the TA. Sum over TA instances."},
+          {"TCP_TCP_TA_DATA_STALL_CYCLES_sum",
+           "",
+           "",
+           "sum(TCP_TCP_TA_DATA_STALL_CYCLES,16)",
+           "Total number of TCP stalls TA data interface."},
+          {"TCP_TCP_TA_DATA_STALL_CYCLES_max",
+           "",
+           "",
+           "max(TCP_TCP_TA_DATA_STALL_CYCLES,16)",
+           "Maximum number of TCP stalls TA data interface."},
+          {"FETCH_SIZE",
+           "",
+           "",
+           "(TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64)/1024",
+           "The total kilobytes fetched from the video memory. This is measured with all extra "
+           "fetches and any cache or memory effects taken into account."},
+          {"WRITE_SIZE",
+           "",
+           "",
+           "((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024",
+           "The total kilobytes written to the video memory. This is measured with all extra "
+           "fetches and any cache or memory effects taken into account."},
+          {"WRITE_REQ_32B",
+           "",
+           "",
+           "TCC_EA_WRREQ_64B_sum*2+(TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)",
+           "The total number of 32-byte effective memory writes."},
+          {"VFetchInsts",
+           "",
+           "",
+           "(SQ_INSTS_VMEM_RD-TA_FLAT_READ_WAVEFRONTS_sum)/SQ_WAVES",
+           "The average number of vector fetch instructions from the video memory executed per "
+           "work-item (affected by flow control). Excludes FLAT instructions that fetch from video "
+           "memory."},
+          {"VWriteInsts",
+           "",
+           "",
+           "(SQ_INSTS_VMEM_WR-TA_FLAT_WRITE_WAVEFRONTS_sum)/SQ_WAVES",
+           "The average number of vector write instructions to the video memory executed per "
+           "work-item (affected by flow control). Excludes FLAT instructions that write to video "
+           "memory."},
+          {"FlatVMemInsts",
+           "",
+           "",
+           "(SQ_INSTS_FLAT-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES",
+           "The average number of FLAT instructions that read from or write to the video memory "
+           "executed per work item (affected by flow control). Includes FLAT instructions that "
+           "read from or write to scratch."},
+          {"LDSInsts",
+           "",
+           "",
+           "(SQ_INSTS_LDS-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES",
+           "The average number of LDS read or LDS write instructions executed per work item "
+           "(affected by flow control).  Excludes FLAT instructions that read from or write to "
+           "LDS."},
+          {"FlatLDSInsts",
+           "",
+           "",
+           "SQ_INSTS_FLAT_LDS_ONLY/SQ_WAVES",
+           "The average number of FLAT instructions that read or write to LDS executed per work "
+           "item (affected by flow control)."},
+          {"VALUUtilization",
+           "",
+           "",
+           "100*SQ_THREAD_CYCLES_VALU/(SQ_ACTIVE_INST_VALU*MAX_WAVE_SIZE)",
+           "The percentage of active vector ALU threads in a wave. A lower number can mean either "
+           "more thread divergence in a wave or that the work-group size is not a multiple of 64. "
+           "Value range: 0% (bad), 100% (ideal - no thread divergence)."},
+          {"VALUBusy",
+           "",
+           "",
+           "100*SQ_ACTIVE_INST_VALU*4/SIMD_NUM/GRBM_GUI_ACTIVE",
+           "The percentage of GPUTime vector ALU instructions are processed. Value range: 0% "
+           "(bad) to 100% (optimal)."},
+          {"SALUBusy",
+           "",
+           "",
+           "100*SQ_INST_CYCLES_SALU*4/SIMD_NUM/GRBM_GUI_ACTIVE",
+           "The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) "
+           "to 100% (optimal)."},
+          {"FetchSize",
+           "",
+           "",
+           "FETCH_SIZE",
+           "The total kilobytes fetched from the video memory. This is measured with all extra "
+           "fetches and any cache or memory effects taken into account."},
+          {"WriteSize",
+           "",
+           "",
+           "WRITE_SIZE",
+           "The total kilobytes written to the video memory. This is measured with all extra "
+           "fetches and any cache or memory effects taken into account."},
+          {"MemWrites32B",
+           "",
+           "",
+           "WRITE_REQ_32B",
+           "The total number of effective 32B write transactions to the memory"},
+          {"L2CacheHit",
+           "",
+           "",
+           "100*sum(TCC_HIT,16)/(sum(TCC_HIT,16)+sum(TCC_MISS,16))",
+           "The percentage of fetch, write, atomic, and other instructions that hit the data in L2 "
+           "cache. Value range: 0% (no hit) to 100% (optimal)."},
+          {"MemUnitStalled",
+           "",
+           "",
+           "100*TCP_TCP_TA_DATA_STALL_CYCLES_max/GRBM_GUI_ACTIVE/SE_NUM",
+           "The percentage of GPUTime the memory unit is stalled. Try reducing the number or size "
+           "of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad)."},
+          {"WriteUnitStalled",
+           "",
+           "",
+           "100*TCC_WRREQ_STALL_max/GRBM_GUI_ACTIVE",
+           "The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad)."},
+          {"LDSBankConflict",
+           "",
+           "",
+           "100*SQ_LDS_BANK_CONFLICT/GRBM_GUI_ACTIVE/CU_NUM",
+           "The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) "
+           "to 100% (bad)."}}}};
@@ -0,0 +1,3 @@
+# Copy both counter definition XML files, unmodified (COPYONLY), into ${PROJECT_BINARY_DIR}/lib.
+foreach(_counter_xml basic_counters.xml derived_counters.xml)
+    configure_file(${_counter_xml} ${PROJECT_BINARY_DIR}/lib/${_counter_xml} COPYONLY)
+endforeach()
@@ -0,0 +1,744 @@
+<gfx8 base="gfx8">
+  <metric name="GRBM_COUNT" block=GRBM event=0 descr="Tie High - Count Number of Clocks"></metric>
+  <metric name="GRBM_GUI_ACTIVE" block=GRBM event=2 descr="The GUI is Active"></metric>
+
+  <metric name="SQ_WAVES" block=SQ event=4 descr="Count number of waves sent to SQs. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_INSTS_VALU" block=SQ event=26 descr="Number of VALU instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VMEM_WR" block=SQ event=27 descr="Number of VMEM write instructions issued (including FLAT). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VMEM_RD" block=SQ event=28 descr="Number of VMEM read instructions issued (including FLAT). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_SALU" block=SQ event=30 descr="Number of SALU instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_SMEM" block=SQ event=31 descr="Number of SMEM instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_FLAT" block=SQ event=32 descr="Number of FLAT instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_FLAT_LDS_ONLY" block=SQ event=33 descr="Number of FLAT instructions issued that read/wrote only from/to LDS (only works if EARLY_TA_DONE is enabled). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_LDS" block=SQ event=34 descr="Number of LDS instructions issued (including FLAT). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_GDS" block=SQ event=35 descr="Number of GDS instructions issued. (per-simd, emulated)"></metric>
+
+  <metric name="SQ_WAIT_INST_LDS" block=SQ event=61 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
+  <metric name="SQ_ACTIVE_INST_VALU" block=SQ event=69 descr="Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_INST_CYCLES_SALU" block=SQ event=86 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated)"></metric>
+  <metric name="SQ_THREAD_CYCLES_VALU" block=SQ event=89 descr="Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)"></metric>
+  <metric name="SQ_LDS_BANK_CONFLICT" block=SQ event=97 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"></metric>
+
+  <metric name="TA_TA_BUSY" block=TA event=15 descr="TA block is busy. Perf_Windowing not supported for this counter."></metric>
+  <metric name="TA_FLAT_READ_WAVEFRONTS" block=TA event=101 descr="Number of flat opcode reads processed by the TA."></metric>
+  <metric name="TA_FLAT_WRITE_WAVEFRONTS" block=TA event=102 descr="Number of flat opcode writes processed by the TA."></metric>
+
+  <metric name="TCC_HIT" block=TCC event=18 descr="Number of cache hits."></metric>
+  <metric name="TCC_MISS" block=TCC event=19 descr="Number of cache misses. UC reads count as misses."></metric>
+  <metric name="TCC_MC_RDREQ" block=TCC event=35 descr="Number of 32-byte reads. The hardware actually does 64-byte reads but the number is adjusted to provide uniformity."></metric>
+  <metric name="TCC_MC_WRREQ" block=TCC event=26 descr="Number of 32-byte transactions going over the TC_MC_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests."></metric>
+  <metric name="TCC_MC_WRREQ_STALL" block=TCC event=28 descr="Number of cycles a write request was stalled."></metric>
+
+  <metric name="TCP_TCP_TA_DATA_STALL_CYCLES" block=TCP event=3 descr="TCP stalls TA data interface. Now Windowed."></metric>
+</gfx8>
+
+<gfx9>
+  <metric name="GRBM_COUNT" block=GRBM event=0 descr="Tie High - Count Number of Clocks"></metric>
+  <metric name="GRBM_GUI_ACTIVE" block=GRBM event=2 descr="The GUI is Active"></metric>
+
+  <metric name="SQ_WAVES" block=SQ event=4 descr="Count number of waves sent to SQs. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_INSTS_VALU" block=SQ event=26 descr="Number of VALU instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VMEM_WR" block=SQ event=27 descr="Number of VMEM write instructions issued (including FLAT). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VMEM_RD" block=SQ event=28 descr="Number of VMEM read instructions issued (including FLAT). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_SALU" block=SQ event=30 descr="Number of SALU instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_SMEM" block=SQ event=31 descr="Number of SMEM instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_FLAT" block=SQ event=32 descr="Number of FLAT instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_FLAT_LDS_ONLY" block=SQ event=33 descr="Number of FLAT instructions issued that read/wrote only from/to LDS (only works if EARLY_TA_DONE is enabled). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_LDS" block=SQ event=34 descr="Number of LDS instructions issued (including FLAT). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_GDS" block=SQ event=35 descr="Number of GDS instructions issued. (per-simd, emulated)"></metric>
+
+  <metric name="SQ_WAIT_INST_LDS" block=SQ event=63 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
+  <metric name="SQ_ACTIVE_INST_VALU" block=SQ event=71 descr="regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_INST_CYCLES_SALU" block=SQ event=84 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated)"></metric>
+  <metric name="SQ_THREAD_CYCLES_VALU" block=SQ event=85 descr="Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)"></metric>
+  <metric name="SQ_LDS_BANK_CONFLICT" block=SQ event=93 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"></metric>
+
+  <metric name="TA_TA_BUSY" block=TA event=15 descr="TA block is busy. Perf_Windowing not supported for this counter."></metric>
+  <metric name="TA_FLAT_READ_WAVEFRONTS" block=TA event=101 descr="Number of flat opcode reads processed by the TA."></metric>
+  <metric name="TA_FLAT_WRITE_WAVEFRONTS" block=TA event=102 descr="Number of flat opcode writes processed by the TA."></metric>
+
+  <metric name="TCC_HIT" block=TCC event=20 descr="Number of cache hits."></metric>
+  <metric name="TCC_MISS" block=TCC event=22 descr="Number of cache misses. UC reads count as misses."></metric>
+  <metric name="TCC_EA_WRREQ" block=TCC event=29 descr="Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands."></metric>
+  <metric name="TCC_EA_WRREQ_64B" block=TCC event=30 descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface."></metric>
+  <metric name="TCC_EA_WRREQ_STALL" block=TCC event=33 descr="Number of cycles a write request was stalled."></metric>
+  <metric name="TCC_EA_RDREQ" block=TCC event=41 descr="Number of TCC/EA read requests (either 32-byte or 64-byte)"></metric>
+  <metric name="TCC_EA_RDREQ_32B" block=TCC event=42 descr="Number of 32-byte TCC/EA read requests"></metric>
+
+  <metric name="TCP_TCP_TA_DATA_STALL_CYCLES" block=TCP event=6 descr="TCP stalls TA data interface. Now Windowed."></metric>
+</gfx9>
+
+<gfx906 base="gfx9">
+  # EA1
+  <metric name="TCC_EA1_WRREQ" block=TCC event=256 descr="Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands."></metric>
+  <metric name="TCC_EA1_WRREQ_64B" block=TCC event=257 descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface."></metric>
+  <metric name="TCC_EA1_WRREQ_STALL" block=TCC event=260 descr="Number of cycles a write request was stalled."></metric>
+  <metric name="TCC_EA1_RDREQ" block=TCC event=267 descr="Number of TCC/EA read requests (either 32-byte or 64-byte)"></metric>
+  <metric name="TCC_EA1_RDREQ_32B" block=TCC event=268 descr="Number of 32-byte TCC/EA read requests"></metric>
+</gfx906>
+
+<gfx908 base="gfx9">
+  <metric name="SQ_INSTS_VMEM_WR" block=SQ event=28 descr="Number of VMEM write instructions issued (including FLAT). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VMEM_RD" block=SQ event=29 descr="Number of VMEM read instructions issued (including FLAT). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_SALU" block=SQ event=31 descr="Number of SALU instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_SMEM" block=SQ event=32 descr="Number of SMEM instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_FLAT" block=SQ event=33 descr="Number of FLAT instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_FLAT_LDS_ONLY" block=SQ event=34 descr="Number of FLAT instructions issued that read/wrote only from/to LDS (only works if EARLY_TA_DONE is enabled). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_LDS" block=SQ event=35 descr="Number of LDS instructions issued (including FLAT). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_GDS" block=SQ event=36 descr="Number of GDS instructions issued. (per-simd, emulated)"></metric>
+
+  <metric name="SQ_WAIT_INST_LDS" block=SQ event=64 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
+  <metric name="SQ_ACTIVE_INST_VALU" block=SQ event=72 descr="regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_INST_CYCLES_SALU" block=SQ event=85 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated)"></metric>
+  <metric name="SQ_THREAD_CYCLES_VALU" block=SQ event=86 descr="Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)"></metric>
+  <metric name="SQ_LDS_BANK_CONFLICT" block=SQ event=94 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"></metric>
+
+  <metric name="TCC_HIT" block=TCC event=17 descr="Number of cache hits."></metric>
+  <metric name="TCC_MISS" block=TCC event=19 descr="Number of cache misses. UC reads count as misses."></metric>
+  <metric name="TCC_EA_WRREQ" block=TCC event=26 descr="Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands."></metric>
+  <metric name="TCC_EA_WRREQ_64B" block=TCC event=27 descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface."></metric>
+  <metric name="TCC_EA_WRREQ_STALL" block=TCC event=30 descr="Number of cycles a write request was stalled."></metric>
+  <metric name="TCC_EA_RDREQ" block=TCC event=38 descr="Number of TCC/EA read requests (either 32-byte or 64-byte)"></metric>
+  <metric name="TCC_EA_RDREQ_32B" block=TCC event=39 descr="Number of 32-byte TCC/EA read requests"></metric>
+</gfx908>
+
+<gfx90a>
+  <metric name="SQ_WAIT_INST_LDS" block=SQ event=91 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
+  <metric name="TCP_TCP_TA_DATA_STALL_CYCLES" block=TCP event=6 descr="TCP stalls TA data interface. Now Windowed."></metric>
+  <metric name="GRBM_COUNT" block=GRBM event=0 descr="Tie High - Count Number of Clocks"></metric>
+  <metric name="GRBM_GUI_ACTIVE" block=GRBM event=2 descr="The GUI is Active"></metric>
+  <metric name="GRBM_CP_BUSY" block=GRBM event=3 descr="Any of the Command Processor (CPG/CPC/CPF) blocks are busy."></metric>
+  <metric name="GRBM_SPI_BUSY" block=GRBM event=11 descr="Any of the Shader Pipe Interpolators (SPI) are busy in the shader engine(s)."></metric>
+  <metric name="GRBM_TA_BUSY" block=GRBM event=13 descr="Any of the Texture Pipes (TA) are busy in the shader engine(s)."></metric>
+  <metric name="GRBM_TC_BUSY" block=GRBM event=28 descr="Any of the Texture Cache Blocks (TCP/TCI/TCA/TCC) are busy."></metric>
+  <metric name="GRBM_CPC_BUSY" block=GRBM event=30 descr="The Command Processor Compute (CPC) is busy."></metric>
+  <metric name="GRBM_CPF_BUSY" block=GRBM event=31 descr="The Command Processor Fetchers (CPF) is busy."></metric>
+  <metric name="GRBM_UTCL2_BUSY" block=GRBM event=34 descr="The Unified Translation Cache Level-2 (UTCL2) block is busy."></metric>
+  <metric name="GRBM_EA_BUSY" block=GRBM event=35 descr="The Efficiency Arbiter (EA) block is busy."></metric>
+  <metric name="CPC_ME1_BUSY_FOR_PACKET_DECODE" block=CPC event=13 descr="Me1 busy for packet decode."></metric>
+  <metric name="CPC_UTCL1_STALL_ON_TRANSLATION" block=CPC event=24 descr="One of the UTCL1s is stalled waiting on translation, XNACK or PENDING response."></metric>
+  <metric name="CPC_CPC_STAT_BUSY" block=CPC event=25 descr="CPC Busy."></metric>
+  <metric name="CPC_CPC_STAT_IDLE" block=CPC event=26 descr="CPC Idle."></metric>
+  <metric name="CPC_CPC_STAT_STALL" block=CPC event=27 descr="CPC Stalled."></metric>
+  <metric name="CPC_CPC_TCIU_BUSY" block=CPC event=28 descr="CPC TCIU interface Busy."></metric>
+  <metric name="CPC_CPC_TCIU_IDLE" block=CPC event=29 descr="CPC TCIU interface Idle."></metric>
+  <metric name="CPC_CPC_UTCL2IU_BUSY" block=CPC event=30 descr="CPC UTCL2 interface Busy."></metric>
+  <metric name="CPC_CPC_UTCL2IU_IDLE" block=CPC event=31 descr="CPC UTCL2 interface Idle."></metric>
+  <metric name="CPC_CPC_UTCL2IU_STALL" block=CPC event=32 descr="CPC UTCL2 interface Stalled waiting on Free, Tags or Translation."></metric>
+  <metric name="CPC_ME1_DC0_SPI_BUSY" block=CPC event=33 descr="CPC Me1 Processor Busy."></metric>
+  <metric name="CPF_CMP_UTCL1_STALL_ON_TRANSLATION" block=CPF event=20 descr="One of the Compute UTCL1s is stalled waiting on translation, XNACK or PENDING response."></metric>
+  <metric name="CPF_CPF_STAT_BUSY" block=CPF event=23 descr="CPF Busy."></metric>
+  <metric name="CPF_CPF_STAT_IDLE" block=CPF event=24 descr="CPF Idle."></metric>
+  <metric name="CPF_CPF_STAT_STALL" block=CPF event=25 descr="CPF Stalled."></metric>
+  <metric name="CPF_CPF_TCIU_BUSY" block=CPF event=26 descr="CPF TCIU interface Busy."></metric>
+  <metric name="CPF_CPF_TCIU_IDLE" block=CPF event=27 descr="CPF TCIU interface Idle."></metric>
+  <metric name="CPF_CPF_TCIU_STALL" block=CPF event=28 descr="CPF TCIU interface Stalled waiting on Free, Tags."></metric>
+  <metric name="SPI_CSN_WINDOW_VALID" block=SPI event=47 descr="Clock count enabled by perfcounter_start event. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"></metric>
+  <metric name="SPI_CSN_BUSY" block=SPI event=48 descr="Number of clocks with outstanding waves (SPI or SH). Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"></metric>
+  <metric name="SPI_CSN_NUM_THREADGROUPS" block=SPI event=49 descr="Number of threadgroups launched. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"></metric>
+  <metric name="SPI_CSN_WAVE" block=SPI event=52 descr="Number of waves. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"></metric>
+  <metric name="SPI_RA_REQ_NO_ALLOC" block=SPI event=79 descr="Arb cycles with requests but no allocation. Source is RA0"></metric>
+  <metric name="SPI_RA_REQ_NO_ALLOC_CSN" block=SPI event=85 descr="Arb cycles with CSn req and no CSn alloc. Source is RA0"></metric>
+  <metric name="SPI_RA_RES_STALL_CSN" block=SPI event=91 descr="Arb cycles with CSn req and no CSn fits. Source is RA0"></metric>
+  <metric name="SPI_RA_TMP_STALL_CSN" block=SPI event=97 descr="Cycles where csn wants to req but does not fit in temp space."></metric>
+  <metric name="SPI_RA_WAVE_SIMD_FULL_CSN" block=SPI event=103 descr="Sum of SIMD where WAVE can't take csn wave when !fits. Source is RA0"></metric>
+  <metric name="SPI_RA_VGPR_SIMD_FULL_CSN" block=SPI event=109 descr="Sum of SIMD where VGPR can't take csn wave when !fits. Source is RA0"></metric>
+  <metric name="SPI_RA_SGPR_SIMD_FULL_CSN" block=SPI event=115 descr="Sum of SIMD where SGPR can't take csn wave when !fits. Source is RA0"></metric>
+  <metric name="SPI_RA_LDS_CU_FULL_CSN" block=SPI event=120 descr="Sum of CU where LDS can't take csn wave when !fits. Source is RA0"></metric>
+  <metric name="SPI_RA_BAR_CU_FULL_CSN" block=SPI event=123 descr="Sum of CU where BARRIER can't take csn wave when !fits. Source is RA0"></metric>
+  <metric name="SPI_RA_BULKY_CU_FULL_CSN" block=SPI event=125 descr="Sum of CU where BULKY can't take csn wave when !fits. Source is RA0"></metric>
+  <metric name="SPI_RA_TGLIM_CU_FULL_CSN" block=SPI event=127 descr="Cycles where csn wants to req but all CU are at tg_limit"></metric>
+  <metric name="SPI_RA_WVLIM_STALL_CSN" block=SPI event=133 descr="Number of clocks csn is stalled due to WAVE LIMIT."></metric>
+  <metric name="SPI_SWC_CSC_WR" block=SPI event=189 descr="Number of clocks to write CSC waves to SGPRs (need to multiply this value by 4) Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"></metric>
+  <metric name="SPI_VWC_CSC_WR" block=SPI event=195 descr="Number of clocks to write CSC waves to VGPRs (need to multiply this value by 4) Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"></metric>
+  <metric name="SQ_ACCUM_PREV" block=SQ event=1 descr="For counter N, increment by the value of counter N-1. Only accumulates once every 4 cycles."></metric>
+  <metric name="SQ_CYCLES" block=SQ event=2 descr="Clock cycles. (nondeterministic, per-simd, global)"></metric>
+  <metric name="SQ_BUSY_CYCLES" block=SQ event=3 descr="Clock cycles while SQ is reporting that it is busy. (nondeterministic, per-simd, global)"></metric>
+  <metric name="SQ_WAVES" block=SQ event=4 descr="Count number of waves sent to SQs. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_LEVEL_WAVES" block=SQ event=5 descr="Track the number of waves. Set ACCUM_PREV for the next counter to use this. (level, per-simd, global)"></metric>
+  <metric name="SQ_WAVES_EQ_64" block=SQ event=6 descr="Count number of waves with exactly 64 active threads sent to SQs. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_WAVES_LT_64" block=SQ event=7 descr="Count number of waves with <64 active threads sent to SQs. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_WAVES_LT_48" block=SQ event=8 descr="Count number of waves with <48 active threads sent to SQs. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_WAVES_LT_32" block=SQ event=9 descr="Count number of waves with <32 active threads sent to SQs. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_WAVES_LT_16" block=SQ event=10 descr="Count number of waves with <16 active threads sent to SQs. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_BUSY_CU_CYCLES" block=SQ event=13 descr="Count quad-cycles each CU is busy. (nondeterministic, per-simd)"></metric>
+  <metric name="SQ_ITEMS" block=SQ event=14 descr="Number of valid items per wave. (per-simd, global)"></metric>
+  <metric name="SQ_INSTS" block=SQ event=25 descr="Number of instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU" block=SQ event=26 descr="Number of VALU instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_ADD_F16" block=SQ event=27 descr="Number of VALU ADD/SUB instructions on float16. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MUL_F16" block=SQ event=28 descr="Number of VALU MUL instructions on float16. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_FMA_F16" block=SQ event=29 descr="Number of VALU FMA/MAD instructions on float16. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_TRANS_F16" block=SQ event=30 descr="Number of VALU transcendental instructions on float16. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_ADD_F32" block=SQ event=31 descr="Number of VALU ADD/SUB instructions on float32. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MUL_F32" block=SQ event=32 descr="Number of VALU MUL instructions on float32. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_FMA_F32" block=SQ event=33 descr="Number of VALU FMA/MAD instructions on float32. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_TRANS_F32" block=SQ event=34 descr="Number of VALU transcendental instructions on float32. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_ADD_F64" block=SQ event=35 descr="Number of VALU ADD/SUB instructions on float64. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MUL_F64" block=SQ event=36 descr="Number of VALU MUL instructions on float64. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_FMA_F64" block=SQ event=37 descr="Number of VALU FMA/MAD instructions on float64. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_TRANS_F64" block=SQ event=38 descr="Number of VALU transcendental instructions on float64. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_INT32" block=SQ event=39 descr="Number of VALU 32-bit integer (signed or unsigned) instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_INT64" block=SQ event=40 descr="Number of VALU 64-bit integer (signed or unsigned) instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_CVT" block=SQ event=41 descr="Number of VALU data conversion instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_I8" block=SQ event=42 descr="Number of VALU V_MFMA_*_I8 instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_F16" block=SQ event=43 descr="Number of VALU V_MFMA_*_F16 instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_BF16" block=SQ event=44 descr="Number of VALU V_MFMA_*_BF16 instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_F32" block=SQ event=45 descr="Number of VALU V_MFMA_*_F32 instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_F64" block=SQ event=46 descr="Number of VALU V_MFMA_*_F64 instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_MOPS_I8" block=SQ event=47 descr="Number of VALU matrix math operations (add or mul) performed divided by 512, assuming a full EXEC mask, of data type I8. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_MOPS_F16" block=SQ event=48 descr="Number of VALU matrix math operations (add or mul) performed divided by 512, assuming a full EXEC mask, of data type F16. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_MOPS_BF16" block=SQ event=49 descr="Number of VALU matrix math operations (add or mul) performed divided by 512, assuming a full EXEC mask, of data type BF16. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_MOPS_F32" block=SQ event=50 descr="Number of VALU matrix math operations (add or mul) performed divided by 512, assuming a full EXEC mask, of data type F32. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_MOPS_F64" block=SQ event=51 descr="Number of VALU matrix math operations (add or mul) performed divided by 512, assuming a full EXEC mask, of data type F64. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_MFMA" block=SQ event=52 descr="Number of MFMA instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VMEM_WR" block=SQ event=53 descr="Number of VMEM write instructions issued (including FLAT). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VMEM_RD" block=SQ event=54 descr="Number of VMEM read instructions issued (including FLAT). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VMEM" block=SQ event=55 descr="Number of VMEM instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_SALU" block=SQ event=56 descr="Number of SALU instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_SMEM" block=SQ event=57 descr="Number of SMEM instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_FLAT" block=SQ event=58 descr="Number of FLAT instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_FLAT_LDS_ONLY" block=SQ event=59 descr="Number of FLAT instructions issued that read/wrote only from/to LDS (only works if EARLY_TA_DONE is enabled). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_LDS" block=SQ event=60 descr="Number of LDS instructions issued (including FLAT). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_GDS" block=SQ event=61 descr="Number of GDS instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_EXP_GDS" block=SQ event=63 descr="Number of EXP and GDS instructions issued, excluding skipped export instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_BRANCH" block=SQ event=64 descr="Number of Branch instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_SENDMSG" block=SQ event=65 descr="Number of Sendmsg instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VSKIPPED" block=SQ event=66 descr="Number of vector instructions skipped. (per-simd, emulated)"></metric>
+  <metric name="SQ_INST_LEVEL_VMEM" block=SQ event=67 descr="Number of in-flight VMEM instructions. Set next counter to ACCUM_PREV and divide by INSTS_VMEM for average latency. Includes FLAT instructions. (per-simd, level, nondeterministic)"></metric>
+  <metric name="SQ_INST_LEVEL_SMEM" block=SQ event=68 descr="Number of in-flight SMEM instructions (*2 load/store; *2 atomic; *2 memtime; *4 wb/inv). Set next counter to ACCUM_PREV and divide by INSTS_SMEM for average latency per smem request. Falls slightly short of total request latency because some fetches are divided into two requests that may finish at different times and this counter collects the average latency of the two. (per-simd, level, nondeterministic)"></metric>
+  <metric name="SQ_INST_LEVEL_LDS" block=SQ event=69 descr="Number of in-flight LDS instructions. Set next counter to ACCUM_PREV and divide by INSTS_LDS for average latency. Includes FLAT instructions. (per-simd, level, nondeterministic)"></metric>
+  <metric name="SQ_VALU_MFMA_BUSY_CYCLES" block=SQ event=72 descr="Number of cycles the MFMA ALU is busy (per-simd, emulated)"></metric>
+  <metric name="SQ_WAVE_CYCLES" block=SQ event=74 descr="Number of wave-cycles spent by waves in the CUs (per-simd, nondeterministic). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_WAIT_ANY" block=SQ event=85 descr="Number of wave-cycles spent waiting for anything (per-simd, nondeterministic). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_WAIT_INST_ANY" block=SQ event=88 descr="Number of wave-cycles spent waiting for any instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
+  <metric name="SQ_ACTIVE_INST_ANY" block=SQ event=96 descr="Number of cycles each wave is working on an instruction. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_ACTIVE_INST_VMEM" block=SQ event=97 descr="Number of cycles the SQ instruction arbiter is working on a VMEM instruction. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_ACTIVE_INST_LDS" block=SQ event=98 descr="Number of cycles the SQ instruction arbiter is working on a LDS instruction. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_ACTIVE_INST_VALU" block=SQ event=99 descr="Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_ACTIVE_INST_SCA" block=SQ event=100 descr="Number of cycles the SQ instruction arbiter is working on a SALU or SMEM instruction. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_ACTIVE_INST_EXP_GDS" block=SQ event=101 descr="Number of cycles the SQ instruction arbiter is working on an EXPORT or GDS instruction. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_ACTIVE_INST_MISC" block=SQ event=102 descr="Number of cycles the SQ instruction arbiter is working on a BRANCH or SENDMSG instruction. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_ACTIVE_INST_FLAT" block=SQ event=103 descr="Number of cycles the SQ instruction arbiter is working on a FLAT instruction. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_INST_CYCLES_VMEM_WR" block=SQ event=104 descr="Number of cycles needed to send addr and cmd data for VMEM write instructions. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_INST_CYCLES_VMEM_RD" block=SQ event=105 descr="Number of cycles needed to send addr and cmd data for VMEM read instructions. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_INST_CYCLES_SMEM" block=SQ event=111 descr="Number of cycles needed to execute scalar memory reads. (per-simd, emulated)"></metric>
+  <metric name="SQ_INST_CYCLES_SALU" block=SQ event=112 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_THREAD_CYCLES_VALU" block=SQ event=113 descr="Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)"></metric>
+  <metric name="SQ_IFETCH" block=SQ event=115 descr="Number of instruction fetch requests from cache. (per-simd, emulated)"></metric>
+  <metric name="SQ_IFETCH_LEVEL" block=SQ event=116 descr="Number of instruction fetch requests from cache. (per-simd, level)"></metric>
+  <metric name="SQ_LDS_BANK_CONFLICT" block=SQ event=121 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"></metric>
+  <metric name="SQ_LDS_ADDR_CONFLICT" block=SQ event=122 descr="Number of cycles LDS is stalled by address conflicts. (emulated,nondeterministic)"></metric>
+  <metric name="SQ_LDS_UNALIGNED_STALL" block=SQ event=123 descr="Number of cycles LDS is stalled processing flat unaligned load/store ops. (emulated)"></metric>
+  <metric name="SQ_LDS_MEM_VIOLATIONS" block=SQ event=124 descr="Number of threads that have a memory violation in the LDS. (emulated)"></metric>
+  <metric name="SQ_LDS_ATOMIC_RETURN" block=SQ event=125 descr="Number of atomic return cycles in LDS. (per-simd, emulated)"></metric>
+  <metric name="SQ_LDS_IDX_ACTIVE" block=SQ event=126 descr="Number of cycles LDS is used for indexed (non-direct,non-interpolation) operations. (per-simd, emulated)"></metric>
+  <metric name="SQ_ACCUM_PREV_HIRES" block=SQ event=185 descr="For counter N, increment by the value of counter N-1."></metric>
+  <metric name="SQ_WAVES_RESTORED" block=SQ event=186 descr="Count number of context-restored waves sent to SQs. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_WAVES_SAVED" block=SQ event=187 descr="Count number of context-saved waves. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_INSTS_SMEM_NORM" block=SQ event=188 descr="Number of SMEM instructions issued normalized to match smem_level (*2 load/store; *2 atomic; *2 memtime; *4 wb/inv). (per-simd, emulated)"></metric>
+  <metric name="SQC_DCACHE_INPUT_VALID_READYB" block=SQ event=260 descr="Input stalled by SQC (per-SQ, nondeterministic, unwindowed)"></metric>
+  <metric name="SQC_TC_REQ" block=SQ event=262 descr="Total number of TC requests that were issued by instruction and constant caches. (No-Masking, nondeterministic)"></metric>
+  <metric name="SQC_TC_INST_REQ" block=SQ event=263 descr="Number of instruction requests to the TC (No-Masking, nondeterministic)"></metric>
+  <metric name="SQC_TC_DATA_READ_REQ" block=SQ event=264 descr="Number of data read requests to the TC (No-Masking, nondeterministic)"></metric>
+  <metric name="SQC_TC_DATA_WRITE_REQ" block=SQ event=265 descr="Number of data write requests to the TC (No-Masking, nondeterministic)"></metric>
+  <metric name="SQC_TC_DATA_ATOMIC_REQ" block=SQ event=266 descr="Number of data atomic requests to the TC (No-Masking, nondeterministic)"></metric>
+  <metric name="SQC_TC_STALL" block=SQ event=267 descr="Valid request stalled TC request interface (no-credits). (No-Masking, nondeterministic, unwindowed)"></metric>
+  <metric name="SQC_ICACHE_REQ" block=SQ event=270 descr="Number of requests. (per-SQ, per-Bank)"></metric>
+  <metric name="SQC_ICACHE_HITS" block=SQ event=271 descr="Number of cache hits. (per-SQ, per-Bank, nondeterministic)"></metric>
+  <metric name="SQC_ICACHE_MISSES" block=SQ event=272 descr="Number of cache misses, includes uncached requests. (per-SQ, per-Bank, nondeterministic)"></metric>
+  <metric name="SQC_ICACHE_MISSES_DUPLICATE" block=SQ event=273 descr="Number of misses that were duplicates (access to a non-resident, miss pending CL). (per-SQ, per-Bank, nondeterministic)" ></metric>
+  <metric name="SQC_DCACHE_REQ" block=SQ event=290 descr="Number of requests (post-bank-serialization). (per-SQ, per-Bank)"></metric>
+  <metric name="SQC_DCACHE_HITS" block=SQ event=291 descr="Number of cache hits. (per-SQ, per-Bank, nondeterministic)"></metric>
+  <metric name="SQC_DCACHE_MISSES" block=SQ event=292 descr="Number of cache misses, includes uncached requests. (per-SQ, per-Bank, nondeterministic)"></metric>
+  <metric name="SQC_DCACHE_MISSES_DUPLICATE" block=SQ event=293 descr="Number of misses that were duplicates (access to a non-resident, miss pending CL). (per-SQ, per-Bank, nondeterministic)" ></metric>
+  <metric name="SQC_DCACHE_ATOMIC" block=SQ event=298 descr="Number of atomic requests. (per-SQ, per-Bank)"></metric>
+  <metric name="SQC_DCACHE_REQ_READ_1" block=SQ event=323 descr="Number of constant cache 1 dw read requests. (per-SQ)"></metric>
+  <metric name="SQC_DCACHE_REQ_READ_2" block=SQ event=324 descr="Number of constant cache 2 dw read requests. (per-SQ)"></metric>
+  <metric name="SQC_DCACHE_REQ_READ_4" block=SQ event=325 descr="Number of constant cache 4 dw read requests. (per-SQ)"></metric>
+  <metric name="SQC_DCACHE_REQ_READ_8" block=SQ event=326 descr="Number of constant cache 8 dw read requests. (per-SQ)"></metric>
+  <metric name="SQC_DCACHE_REQ_READ_16" block=SQ event=327 descr="Number of constant cache 16 dw read requests. (per-SQ)"></metric>
+  <metric name="TA_TA_BUSY" block=TA event=15 descr="TA block is busy. Perf_Windowing not supported for this counter."></metric>
+  <metric name="TA_TOTAL_WAVEFRONTS" block=TA event=32 descr="Total number of wavefronts processed by TA."></metric>
+  <metric name="TA_BUFFER_WAVEFRONTS" block=TA event=44 descr="Number of buffer wavefronts processed by TA."></metric>
+  <metric name="TA_BUFFER_READ_WAVEFRONTS" block=TA event=45 descr="Number of buffer read wavefronts processed by TA."></metric>
+  <metric name="TA_BUFFER_WRITE_WAVEFRONTS" block=TA event=46 descr="Number of buffer write wavefronts processed by TA."></metric>
+  <metric name="TA_BUFFER_ATOMIC_WAVEFRONTS" block=TA event=47 descr="Number of buffer atomic wavefronts processed by TA."></metric>
+  <metric name="TA_BUFFER_TOTAL_CYCLES" block=TA event=49 descr="Number of buffer cycles issued to TC."></metric>
+  <metric name="TA_BUFFER_COALESCED_READ_CYCLES" block=TA event=52 descr="Number of buffer coalesced read cycles issued to TC."></metric>
+  <metric name="TA_BUFFER_COALESCED_WRITE_CYCLES" block=TA event=53 descr="Number of buffer coalesced write cycles issued to TC."></metric>
+  <metric name="TA_ADDR_STALLED_BY_TC_CYCLES" block=TA event=54 descr="Number of cycles addr path stalled by TC. Perf_Windowing not supported for this counter."></metric>
+  <metric name="TA_ADDR_STALLED_BY_TD_CYCLES" block=TA event=55 descr="Number of cycles addr path stalled by TD. Perf_Windowing not supported for this counter."></metric>
+  <metric name="TA_DATA_STALLED_BY_TC_CYCLES" block=TA event=56 descr="Number of cycles data path stalled by TC. Perf_Windowing not supported for this counter."></metric>
+  <metric name="TA_FLAT_WAVEFRONTS" block=TA event=100 descr="Number of flat opcode wavefronts processed by the TA."></metric>
+  <metric name="TA_FLAT_READ_WAVEFRONTS" block=TA event=101 descr="Number of flat opcode reads processed by the TA."></metric>
+  <metric name="TA_FLAT_WRITE_WAVEFRONTS" block=TA event=102 descr="Number of flat opcode writes processed by the TA."></metric>
+  <metric name="TA_FLAT_ATOMIC_WAVEFRONTS" block=TA event=103 descr="Number of flat opcode atomics processed by the TA."></metric>
+  <metric name="TD_TD_BUSY" block=TD event=1 descr="TD is processing or waiting for data. Perf_Windowing not supported for this counter."></metric>
+  <metric name="TD_TC_STALL" block=TD event=15 descr="TD is stalled waiting for TC data."></metric>
+  <metric name="TD_SPI_STALL" block=TD event=18 descr="TD is stalled SPI vinit"></metric>
+  <metric name="TD_LOAD_WAVEFRONT" block=TD event=25 descr="Count the wavefronts with opcode = load, include atomics and store."></metric>
+  <metric name="TD_ATOMIC_WAVEFRONT" block=TD event=26 descr="Count the wavefronts with opcode = atomic."></metric>
+  <metric name="TD_STORE_WAVEFRONT" block=TD event=27 descr="Count the wavefronts with opcode = store."></metric>
+  <metric name="TD_COALESCABLE_WAVEFRONT" block=TD event=32 descr="Count wavefronts that TA finds coalescable."></metric>
+  <metric name="TCP_GATE_EN1" block=TCP event=0 descr="TCP interface clocks are turned on. Not Windowed."></metric>
+  <metric name="TCP_GATE_EN2" block=TCP event=1 descr="TCP core clocks are turned on. Not Windowed."></metric>
+  <metric name="TCP_TD_TCP_STALL_CYCLES" block=TCP event=7 descr="TD stalls TCP"></metric>
+  <metric name="TCP_TCR_TCP_STALL_CYCLES" block=TCP event=8 descr="TCR stalls TCP_TCR_req interface"></metric>
+  <metric name="TCP_READ_TAGCONFLICT_STALL_CYCLES" block=TCP event=11 descr="Tagram conflict stall on a read"></metric>
+  <metric name="TCP_WRITE_TAGCONFLICT_STALL_CYCLES" block=TCP event=12 descr="Tagram conflict stall on a write"></metric>
+  <metric name="TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES" block=TCP event=13 descr="Tagram conflict stall on an atomic"></metric>
+  <metric name="TCP_PENDING_STALL_CYCLES" block=TCP event=22 descr="Stall due to data pending from L2"></metric>
+  <metric name="TCP_TA_TCP_STATE_READ" block=TCP event=27 descr="Number of state reads"></metric>
+  <metric name="TCP_VOLATILE" block=TCP event=28 descr="Total number of L1 volatile pixels/buffers from TA"></metric>
+  <metric name="TCP_TOTAL_ACCESSES" block=TCP event=29 descr="Total number of pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD"></metric>
+  <metric name="TCP_TOTAL_READ" block=TCP event=30 descr="Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ"></metric>
+  <metric name="TCP_TOTAL_WRITE" block=TCP event=32 descr="Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+ TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE"></metric>
+  <metric name="TCP_TOTAL_ATOMIC_WITH_RET" block=TCP event=38 descr="Total number of atomic with return pixels/buffers from TA"></metric>
+  <metric name="TCP_TOTAL_ATOMIC_WITHOUT_RET" block=TCP event=39 descr="Total number of atomic without return pixels/buffers from TA"></metric>
+  <metric name="TCP_TOTAL_WRITEBACK_INVALIDATES" block=TCP event=45 descr="Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. Not Windowed."></metric>
+  <metric name="TCP_UTCL1_REQUEST" block=TCP event=47 descr="Total CLIENT_UTCL1 NORMAL requests"></metric>
+  <metric name="TCP_UTCL1_TRANSLATION_MISS" block=TCP event=48 descr="Total utcl1 translation misses"></metric>
+  <metric name="TCP_UTCL1_TRANSLATION_HIT" block=TCP event=49 descr="Total utcl1 translation hits"></metric>
+  <metric name="TCP_UTCL1_PERMISSION_MISS" block=TCP event=50 descr="Total utcl1 permission misses"></metric>
+  <metric name="TCP_TOTAL_CACHE_ACCESSES" block=TCP event=60 descr="Count of total cache line (tag) accesses (includes hits and misses)."></metric>
+  <metric name="TCP_TCP_LATENCY" block=TCP event=65 descr="Total TCP wave latency (from first clock of wave entering to first clock of wave leaving), divide by TA_TCP_STATE_READ to avg wave latency"></metric>
+  <metric name="TCP_TCC_READ_REQ_LATENCY" block=TCP event=66 descr="Total TCP->TCC request latency for reads and atomics with return. Not Windowed."></metric>
+  <metric name="TCP_TCC_WRITE_REQ_LATENCY" block=TCP event=67 descr="Total TCP->TCC request latency for writes and atomics without return. Not Windowed."></metric>
+  <metric name="TCP_TCC_READ_REQ" block=TCP event=69 descr="Total read requests from TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_WRITE_REQ" block=TCP event=70 descr="Total write requests from TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_ATOMIC_WITH_RET_REQ" block=TCP event=71 descr="Total atomic with return requests from TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_ATOMIC_WITHOUT_RET_REQ" block=TCP event=72 descr="Total atomic without return requests from TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_NC_READ_REQ" block=TCP event=75 descr="Total read requests with NC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_NC_WRITE_REQ" block=TCP event=76 descr="Total write requests with NC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_NC_ATOMIC_REQ" block=TCP event=77 descr="Total atomic requests with NC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_UC_READ_REQ" block=TCP event=78 descr="Total read requests with UC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_UC_WRITE_REQ" block=TCP event=79 descr="Total write requests with UC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_UC_ATOMIC_REQ" block=TCP event=80 descr="Total atomic requests with UC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_CC_READ_REQ" block=TCP event=81 descr="Total read requests with CC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_CC_WRITE_REQ" block=TCP event=82 descr="Total write requests with CC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_CC_ATOMIC_REQ" block=TCP event=83 descr="Total atomic requests with CC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_RW_READ_REQ" block=TCP event=85 descr="Total read requests with RW mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_RW_WRITE_REQ" block=TCP event=86 descr="Total write requests with RW mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_RW_ATOMIC_REQ" block=TCP event=87 descr="Total atomic requests with RW mtype from this TCP to all TCCs"></metric>
+  <metric name="TCA_CYCLE" block=TCA event=1 descr="Number of cycles. Not windowable."></metric>
+  <metric name="TCA_BUSY" block=TCA event=2 descr="Number of cycles we have a request pending. Not windowable."></metric>
+  <metric name="TCC_CYCLE" block=TCC event=1 descr="Number of cycles. Not windowable."></metric>
+  <metric name="TCC_BUSY" block=TCC event=2 descr="Number of cycles we have a request pending. Not windowable."></metric>
+  <metric name="TCC_REQ" block=TCC event=3 descr="Number of requests of all types. This is measured at the tag block. This may be more than the number of requests arriving at the TCC, but it is a good indication of the total amount of work that needs to be performed."></metric>
+  <metric name="TCC_STREAMING_REQ" block=TCC event=4 descr="Number of streaming requests. This is measured at the tag block."></metric>
+  <metric name="TCC_NC_REQ" block=TCC event=5 descr="The number of noncoherently cached requests. This is measured at the tag block."></metric>
+  <metric name="TCC_UC_REQ" block=TCC event=6 descr="The number of uncached requests. This is measured at the tag block."></metric>
+  <metric name="TCC_CC_REQ" block=TCC event=7 descr="The number of coherently cached requests. This is measured at the tag block."></metric>
+  <metric name="TCC_RW_REQ" block=TCC event=8 descr="The number of RW requests. This is measured at the tag block."></metric>
+  <metric name="TCC_PROBE" block=TCC event=9 descr="Number of probe requests. Not windowable."></metric>
+  <metric name="TCC_PROBE_ALL" block=TCC event=10 descr="Number of external probe requests with EA_TCC_preq_all == 1. Not windowable."></metric>
+  <metric name="TCC_READ" block=TCC event=12 descr="Number of read requests. Compressed reads are included in this, but metadata reads are not included."></metric>
+  <metric name="TCC_WRITE" block=TCC event=13 descr="Number of write requests."></metric>
+  <metric name="TCC_ATOMIC" block=TCC event=14 descr="Number of atomic requests of all types."></metric>
+  <metric name="TCC_HIT" block=TCC event=17 descr="Number of cache hits."></metric>
+  <metric name="TCC_MISS" block=TCC event=19 descr="Number of cache misses. UC reads count as misses."></metric>
+  <metric name="TCC_WRITEBACK" block=TCC event=22 descr="Number of lines written back to main memory. This includes writebacks of dirty lines and uncached write/atomic requests."></metric>
+  <metric name="TCC_EA_WRREQ" block=TCC event=26 descr="Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands."></metric>
+  <metric name="TCC_EA_WRREQ_64B" block=TCC event=27 descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface."></metric>
+  <metric name="TCC_EA_WR_UNCACHED_32B" block=TCC event=29 descr="Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC mtypes can produce uncached requests, and those are included in this. A 64-byte request will be counted as 2"></metric>
+  <metric name="TCC_EA_WRREQ_STALL" block=TCC event=30 descr="Number of cycles a write request was stalled."></metric>
+  <metric name="TCC_EA_WRREQ_IO_CREDIT_STALL" block=TCC event=31 descr="Number of cycles a EA write request was stalled because the interface was out of IO credits."></metric>
+  <metric name="TCC_EA_WRREQ_GMI_CREDIT_STALL" block=TCC event=32 descr="Number of cycles a EA write request was stalled because the interface was out of GMI credits."></metric>
+  <metric name="TCC_EA_WRREQ_DRAM_CREDIT_STALL" block=TCC event=33 descr="Number of cycles a EA write request was stalled because the interface was out of DRAM credits."></metric>
+  <metric name="TCC_TOO_MANY_EA_WRREQS_STALL" block=TCC event=34 descr="Number of cycles the TCC could not send a EA write request because it already reached its maximum number of pending EA write requests."></metric>
+  <metric name="TCC_EA_WRREQ_LEVEL" block=TCC event=35 descr="The sum of the number of EA write requests in flight. This is primarily meant for measuring average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ."></metric>
+  <metric name="TCC_EA_ATOMIC" block=TCC event=36 descr="Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests."></metric>
+  <metric name="TCC_EA_ATOMIC_LEVEL" block=TCC event=37 descr="The sum of the number of EA atomics in flight. This is primarily meant for measuring average EA atomic latency. Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC."></metric>
+  <metric name="TCC_EA_RDREQ" block=TCC event=38 descr="Number of TCC/EA read requests (either 32-byte or 64-byte)"></metric>
+  <metric name="TCC_EA_RDREQ_32B" block=TCC event=39 descr="Number of 32-byte TCC/EA read requests"></metric>
+  <metric name="TCC_EA_RD_UNCACHED_32B" block=TCC event=40 descr="Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted as 2"></metric>
+  <metric name="TCC_EA_RDREQ_IO_CREDIT_STALL" block=TCC event=41 descr="Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur regardless of whether a read needed to be performed or not."></metric>
+  <metric name="TCC_EA_RDREQ_GMI_CREDIT_STALL" block=TCC event=42 descr="Number of cycles there was a stall because the read request interface was out of GMI credits. Stalls occur regardless of whether a read needed to be performed or not."></metric>
+  <metric name="TCC_EA_RDREQ_DRAM_CREDIT_STALL" block=TCC event=43 descr="Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur regardless of whether a read needed to be performed or not."></metric>
+  <metric name="TCC_EA_RDREQ_LEVEL" block=TCC event=44 descr="The sum of the number of TCC/EA read requests in flight. This is primarily meant for measuring average EA read latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ."></metric>
+  <metric name="TCC_TAG_STALL" block=TCC event=45 descr="Number of cycles the normal request pipeline in the tag was stalled for any reason. Normally, stalls of this nature are measured exactly from one point in the pipeline, but that is not the case for this counter. Probes can stall the pipeline at a variety of places, and there is no single point that can reasonably measure the total stalls accurately."></metric>
+  <metric name="TCC_NORMAL_WRITEBACK" block=TCC event=68 descr="Number of writebacks due to requests that are not writeback requests."></metric>
+  <metric name="TCC_ALL_TC_OP_WB_WRITEBACK" block=TCC event=73 descr="Number of writebacks due to all TC_OP writeback requests."></metric>
+  <metric name="TCC_NORMAL_EVICT" block=TCC event=74 descr="Number of evictions due to requests that are not invalidate or probe requests."></metric>
+  <metric name="TCC_ALL_TC_OP_INV_EVICT" block=TCC event=80 descr="Number of evictions due to all TC_OP invalidate requests."></metric>
+  <metric name="TCC_EA_RDREQ_DRAM" block=TCC event=102 descr="Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC)."></metric>
+  <metric name="TCC_EA_WRREQ_DRAM" block=TCC event=103 descr="Number of TCC/EA write requests (either 32-byte or 64-byte) destined for DRAM (MC)."></metric>
+</gfx90a>
+
+<gfx940>
+  <metric name="SQ_WAIT_INST_LDS" block=SQ event=96 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
+  <metric name="TCP_TCP_TA_DATA_STALL_CYCLES" block=TCP event=6 descr="TCP stalls TA data interface. Now Windowed."></metric>
+  <metric name="GRBM_COUNT" block=GRBM event=0 descr="Tie High - Count Number of Clocks"></metric>
+  <metric name="GRBM_GUI_ACTIVE" block=GRBM event=2 descr="The GUI is Active"></metric>
+  <metric name="GRBM_CP_BUSY" block=GRBM event=3 descr="Any of the Command Processor (CPG/CPC/CPF) blocks are busy."></metric>
+  <metric name="GRBM_SPI_BUSY" block=GRBM event=11 descr="Any of the Shader Pipe Interpolators (SPI) are busy in the shader engine(s)."></metric>
+  <metric name="GRBM_TA_BUSY" block=GRBM event=13 descr="Any of the Texture Pipes (TA) are busy in the shader engine(s)."></metric>
+  <metric name="GRBM_TC_BUSY" block=GRBM event=28 descr="Any of the Texture Cache Blocks (TCP/TCI/TCA/TCC) are busy."></metric>
+  <metric name="GRBM_CPC_BUSY" block=GRBM event=30 descr="The Command Processor Compute (CPC) is busy."></metric>
+  <metric name="GRBM_CPF_BUSY" block=GRBM event=31 descr="The Command Processor Fetchers (CPF) is busy."></metric>
+  <metric name="GRBM_UTCL2_BUSY" block=GRBM event=34 descr="The Unified Translation Cache Level-2 (UTCL2) block is busy."></metric>
+  <metric name="GRBM_EA_BUSY" block=GRBM event=35 descr="The Efficiency Arbiter (EA) block is busy."></metric>
+  <metric name="CPC_ME1_BUSY_FOR_PACKET_DECODE" block=CPC event=13 descr="Me1 busy for packet decode."></metric>
+  <metric name="CPC_UTCL1_STALL_ON_TRANSLATION" block=CPC event=24 descr="One of the UTCL1s is stalled waiting on translation, XNACK or PENDING response."></metric>
+  <metric name="CPC_CPC_STAT_BUSY" block=CPC event=25 descr="CPC Busy."></metric>
+  <metric name="CPC_CPC_STAT_IDLE" block=CPC event=26 descr="CPC Idle."></metric>
+  <metric name="CPC_CPC_STAT_STALL" block=CPC event=27 descr="CPC Stalled."></metric>
+  <metric name="CPC_CPC_TCIU_BUSY" block=CPC event=28 descr="CPC TCIU interface Busy."></metric>
+  <metric name="CPC_CPC_TCIU_IDLE" block=CPC event=29 descr="CPC TCIU interface Idle."></metric>
+  <metric name="CPC_CPC_UTCL2IU_BUSY" block=CPC event=30 descr="CPC UTCL2 interface Busy."></metric>
+  <metric name="CPC_CPC_UTCL2IU_IDLE" block=CPC event=31 descr="CPC UTCL2 interface Idle."></metric>
+  <metric name="CPC_CPC_UTCL2IU_STALL" block=CPC event=32 descr="CPC UTCL2 interface Stalled waiting on Free, Tags or Translation."></metric>
+  <metric name="CPC_ME1_DC0_SPI_BUSY" block=CPC event=33 descr="CPC Me1 Processor Busy."></metric>
+  <metric name="CPF_CMP_UTCL1_STALL_ON_TRANSLATION" block=CPF event=20 descr="One of the Compute UTCL1s is stalled waiting on translation, XNACK or PENDING response."></metric>
+  <metric name="CPF_CPF_STAT_BUSY" block=CPF event=23 descr="CPF Busy."></metric>
+  <metric name="CPF_CPF_STAT_IDLE" block=CPF event=24 descr="CPF Idle."></metric>
+  <metric name="CPF_CPF_STAT_STALL" block=CPF event=25 descr="CPF Stalled."></metric>
+  <metric name="CPF_CPF_TCIU_BUSY" block=CPF event=26 descr="CPF TCIU interface Busy."></metric>
+  <metric name="CPF_CPF_TCIU_IDLE" block=CPF event=27 descr="CPF TCIU interface Idle."></metric>
+  <metric name="CPF_CPF_TCIU_STALL" block=CPF event=28 descr="CPF TCIU interface Stalled waiting on Free, Tags."></metric>
+  <metric name="SPI_CSN_WINDOW_VALID" block=SPI event=47 descr="Clock count enabled by perfcounter_start event. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"></metric>
+  <metric name="SPI_CSN_BUSY" block=SPI event=48 descr="Number of clocks with outstanding waves (SPI or SH). Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"></metric>
+  <metric name="SPI_CSN_NUM_THREADGROUPS" block=SPI event=49 descr="Number of threadgroups launched. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"></metric>
+  <metric name="SPI_CSN_WAVE" block=SPI event=52 descr="Number of waves. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"></metric>
+  <metric name="SPI_RA_REQ_NO_ALLOC" block=SPI event=79 descr="Arb cycles with requests but no allocation. Source is RA0"></metric>
+  <metric name="SPI_RA_REQ_NO_ALLOC_CSN" block=SPI event=85 descr="Arb cycles with CSn req and no CSn alloc. Source is RA0"></metric>
+  <metric name="SPI_RA_RES_STALL_CSN" block=SPI event=91 descr="Arb cycles with CSn req and no CSn fits. Source is RA0"></metric>
+  <metric name="SPI_RA_TMP_STALL_CSN" block=SPI event=97 descr="Cycles where csn wants to req but does not fit in temp space."></metric>
+  <metric name="SPI_RA_WAVE_SIMD_FULL_CSN" block=SPI event=103 descr="Sum of SIMD where WAVE can't take csn wave when !fits. Source is RA0"></metric>
+  <metric name="SPI_RA_VGPR_SIMD_FULL_CSN" block=SPI event=109 descr="Sum of SIMD where VGPR can't take csn wave when !fits. Source is RA0"></metric>
+  <metric name="SPI_RA_SGPR_SIMD_FULL_CSN" block=SPI event=115 descr="Sum of SIMD where SGPR can't take csn wave when !fits. Source is RA0"></metric>
+  <metric name="SPI_RA_LDS_CU_FULL_CSN" block=SPI event=120 descr="Sum of CU where LDS can't take csn wave when !fits. Source is RA0"></metric>
+  <metric name="SPI_RA_BAR_CU_FULL_CSN" block=SPI event=123 descr="Sum of CU where BARRIER can't take csn wave when !fits. Source is RA0"></metric>
+  <metric name="SPI_RA_BULKY_CU_FULL_CSN" block=SPI event=125 descr="Sum of CU where BULKY can't take csn wave when !fits. Source is RA0"></metric>
+  <metric name="SPI_RA_TGLIM_CU_FULL_CSN" block=SPI event=127 descr="Cycles where csn wants to req but all CU are at tg_limit"></metric>
+  <metric name="SPI_RA_WVLIM_STALL_CSN" block=SPI event=133 descr="Number of clocks csn is stalled due to WAVE LIMIT."></metric>
+  <metric name="SPI_SWC_CSC_WR" block=SPI event=189 descr="Number of clocks to write CSC waves to SGPRs (need to multiply this value by 4) Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"></metric>
+  <metric name="SPI_VWC_CSC_WR" block=SPI event=195 descr="Number of clocks to write CSC waves to VGPRs (need to multiply this value by 4) Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"></metric>
+  <metric name="SQ_ACCUM_PREV" block=SQ event=1 descr="For counter N, increment by the value of counter N-1. Only accumulates once every 4 cycles."></metric>
+  <metric name="SQ_CYCLES" block=SQ event=2 descr="Clock cycles. (nondeterministic, per-simd, global)"></metric>
+  <metric name="SQ_BUSY_CYCLES" block=SQ event=3 descr="Clock cycles while SQ is reporting that it is busy. (nondeterministic, per-simd, global)"></metric>
+  <metric name="SQ_WAVES" block=SQ event=4 descr="Count number of waves sent to SQs. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_LEVEL_WAVES" block=SQ event=5 descr="Track the number of waves. Set ACCUM_PREV for the next counter to use this. (level, per-simd, global)"></metric>
+  <metric name="SQ_WAVES_EQ_64" block=SQ event=6 descr="Count number of waves with exactly 64 active threads sent to SQs. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_WAVES_LT_64" block=SQ event=7 descr="Count number of waves with <64 active threads sent to SQs. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_WAVES_LT_48" block=SQ event=8 descr="Count number of waves with <48 active threads sent to SQs. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_WAVES_LT_32" block=SQ event=9 descr="Count number of waves with <32 active threads sent to SQs. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_WAVES_LT_16" block=SQ event=10 descr="Count number of waves with <16 active threads sent to SQs. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_BUSY_CU_CYCLES" block=SQ event=13 descr="Count quad-cycles each CU is busy. (nondeterministic, per-simd)"></metric>
+  <metric name="SQ_ITEMS" block=SQ event=14 descr="Number of valid items per wave. (per-simd, global)"></metric>
+  <metric name="SQ_INSTS" block=SQ event=25 descr="Number of instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU" block=SQ event=26 descr="Number of VALU instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_ADD_F16" block=SQ event=27 descr="Number of VALU ADD/SUB instructions on float16. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MUL_F16" block=SQ event=28 descr="Number of VALU MUL instructions on float16. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_FMA_F16" block=SQ event=29 descr="Number of VALU FMA/MAD instructions on float16. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_TRANS_F16" block=SQ event=30 descr="Number of VALU transcendental instructions on float16. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_ADD_F32" block=SQ event=31 descr="Number of VALU ADD/SUB instructions on float32. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MUL_F32" block=SQ event=32 descr="Number of VALU MUL instructions on float32. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_FMA_F32" block=SQ event=33 descr="Number of VALU FMA/MAD instructions on float32. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_TRANS_F32" block=SQ event=34 descr="Number of VALU transcendental instructions on float32. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_ADD_F64" block=SQ event=35 descr="Number of VALU ADD/SUB instructions on float64. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MUL_F64" block=SQ event=36 descr="Number of VALU MUL instructions on float64. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_FMA_F64" block=SQ event=37 descr="Number of VALU FMA/MAD instructions on float64. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_TRANS_F64" block=SQ event=38 descr="Number of VALU transcendental instructions on float64. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_INT32" block=SQ event=39 descr="Number of VALU 32-bit integer (signed or unsigned) instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_INT64" block=SQ event=40 descr="Number of VALU 64-bit integer (signed or unsigned) instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_CVT" block=SQ event=41 descr="Number of VALU data conversion instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_I8" block=SQ event=42 descr="Number of VALU V_MFMA_*_I8 instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_F16" block=SQ event=43 descr="Number of VALU V_MFMA_*_F16 instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_BF16" block=SQ event=44 descr="Number of VALU V_MFMA_*_BF16 instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_F32" block=SQ event=45 descr="Number of VALU V_MFMA_*_F32 instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_F64" block=SQ event=46 descr="Number of VALU V_MFMA_*_F64 instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_MOPS_I8" block=SQ event=49 descr="Number of VALU matrix math operations (add or mul) performed divided by 512, assuming a full EXEC mask, of data type I8. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_MOPS_F16" block=SQ event=50 descr="Number of VALU matrix math operations (add or mul) performed divided by 512, assuming a full EXEC mask, of data type F16. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_MOPS_BF16" block=SQ event=51 descr="Number of VALU matrix math operations (add or mul) performed divided by 512, assuming a full EXEC mask, of data type BF16. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_MOPS_F32" block=SQ event=52 descr="Number of VALU matrix math operations (add or mul) performed divided by 512, assuming a full EXEC mask, of data type F32. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VALU_MFMA_MOPS_F64" block=SQ event=53 descr="Number of VALU matrix math operations (add or mul) performed divided by 512, assuming a full EXEC mask, of data type F64. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_MFMA" block=SQ event=56 descr="Number of MFMA instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VMEM_WR" block=SQ event=57 descr="Number of VMEM write instructions issued (including FLAT). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VMEM_RD" block=SQ event=58 descr="Number of VMEM read instructions issued (including FLAT). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VMEM" block=SQ event=59 descr="Number of VMEM instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_SALU" block=SQ event=60 descr="Number of SALU instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_SMEM" block=SQ event=61 descr="Number of SMEM instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_FLAT" block=SQ event=62 descr="Number of FLAT instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_LDS" block=SQ event=65 descr="Number of LDS instructions issued (including FLAT). (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_GDS" block=SQ event=66 descr="Number of GDS instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_EXP_GDS" block=SQ event=68 descr="Number of EXP and GDS instructions issued, excluding skipped export instructions. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_BRANCH" block=SQ event=69 descr="Number of Branch instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_SENDMSG" block=SQ event=70 descr="Number of Sendmsg instructions issued. (per-simd, emulated)"></metric>
+  <metric name="SQ_INSTS_VSKIPPED" block=SQ event=71 descr="Number of vector instructions skipped. (per-simd, emulated)"></metric>
+  <metric name="SQ_INST_LEVEL_VMEM" block=SQ event=72 descr="Number of in-flight VMEM instructions. Set next counter to ACCUM_PREV and divide by INSTS_VMEM for average latency. Includes FLAT instructions. (per-simd, level, nondeterministic)"></metric>
+  <metric name="SQ_INST_LEVEL_SMEM" block=SQ event=73 descr="Number of in-flight SMEM instructions (*2 load/store; *2 atomic; *2 memtime; *4 wb/inv). Set next counter to ACCUM_PREV and divide by INSTS_SMEM for average latency per smem request. Falls slightly short of total request latency because some fetches are divided into two requests that may finish at different times and this counter collects the average latency of the two. (per-simd, level, nondeterministic)"></metric>
+  <metric name="SQ_INST_LEVEL_LDS" block=SQ event=74 descr="Number of in-flight LDS instructions. Set next counter to ACCUM_PREV and divide by INSTS_LDS for average latency. Includes FLAT instructions. (per-simd, level, nondeterministic)"></metric>
+  <metric name="SQ_VALU_MFMA_BUSY_CYCLES" block=SQ event=77 descr="Number of cycles the MFMA ALU is busy (per-simd, emulated)"></metric>
+  <metric name="SQ_WAVE_CYCLES" block=SQ event=79 descr="Number of wave-cycles spent by waves in the CUs (per-simd, nondeterministic). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_WAIT_ANY" block=SQ event=90 descr="Number of wave-cycles spent waiting for anything (per-simd, nondeterministic). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_WAIT_INST_ANY" block=SQ event=93 descr="Number of wave-cycles spent waiting for any instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
+  <metric name="SQ_ACTIVE_INST_ANY" block=SQ event=101 descr="Number of cycles each wave is working on an instruction. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_ACTIVE_INST_VMEM" block=SQ event=102 descr="Number of cycles the SQ instruction arbiter is working on a VMEM instruction. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_ACTIVE_INST_LDS" block=SQ event=103 descr="Number of cycles the SQ instruction arbiter is working on a LDS instruction. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_ACTIVE_INST_VALU" block=SQ event=104 descr="Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_ACTIVE_INST_SCA" block=SQ event=105 descr="Number of cycles the SQ instruction arbiter is working on a SALU or SMEM instruction. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_ACTIVE_INST_EXP_GDS" block=SQ event=106 descr="Number of cycles the SQ instruction arbiter is working on an EXPORT or GDS instruction. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_ACTIVE_INST_MISC" block=SQ event=107 descr="Number of cycles the SQ instruction arbiter is working on a BRANCH or SENDMSG instruction. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_ACTIVE_INST_FLAT" block=SQ event=108 descr="Number of cycles the SQ instruction arbiter is working on a FLAT instruction. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_INST_CYCLES_VMEM_WR" block=SQ event=109 descr="Number of cycles needed to send addr and cmd data for VMEM write instructions. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_INST_CYCLES_VMEM_RD" block=SQ event=110 descr="Number of cycles needed to send addr and cmd data for VMEM read instructions. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_INST_CYCLES_SMEM" block=SQ event=116 descr="Number of cycles needed to execute scalar memory reads. (per-simd, emulated)"></metric>
+  <metric name="SQ_INST_CYCLES_SALU" block=SQ event=117 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated). Units in quad-cycles(4 cycles)"></metric>
+  <metric name="SQ_THREAD_CYCLES_VALU" block=SQ event=118 descr="Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)"></metric>
+  <metric name="SQ_IFETCH" block=SQ event=120 descr="Number of instruction fetch requests from cache. (per-simd, emulated)"></metric>
+  <metric name="SQ_IFETCH_LEVEL" block=SQ event=121 descr="Number of instruction fetch requests from cache. (per-simd, level)"></metric>
+  <metric name="SQ_LDS_BANK_CONFLICT" block=SQ event=126 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"></metric>
+  <metric name="SQ_LDS_ADDR_CONFLICT" block=SQ event=127 descr="Number of cycles LDS is stalled by address conflicts. (emulated,nondeterministic)"></metric>
+  <metric name="SQ_LDS_UNALIGNED_STALL" block=SQ event=128 descr="Number of cycles LDS is stalled processing flat unaligned load/store ops. (emulated)"></metric>
+  <metric name="SQ_LDS_MEM_VIOLATIONS" block=SQ event=129 descr="Number of threads that have a memory violation in the LDS.(emulated)"></metric>
+  <metric name="SQ_LDS_ATOMIC_RETURN" block=SQ event=130 descr="Number of atomic return cycles in LDS. (per-simd, emulated)"></metric>
+  <metric name="SQ_LDS_IDX_ACTIVE" block=SQ event=131 descr="Number of cycles LDS is used for indexed (non-direct,non-interpolation) operations. (per-simd, emulated)"></metric>
+  <metric name="SQ_ACCUM_PREV_HIRES" block=SQ event=184 descr="For counter N, increment by the value of counter N-1."></metric>
+  <metric name="SQ_WAVES_RESTORED" block=SQ event=185 descr="Count number of context-restored waves sent to SQs. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_WAVES_SAVED" block=SQ event=186 descr="Count number of context-saved waves. (per-simd, emulated, global)"></metric>
+  <metric name="SQ_INSTS_SMEM_NORM" block=SQ event=187 descr="Number of SMEM instructions issued normalized to match smem_level (*2 load/store; *2 atomic; *2 memtime; *4 wb/inv). (per-simd, emulated)"></metric>
+  <metric name="SQC_ICACHE_INPUT_VALID_READYB" block=SQ event=257 descr="Input stalled by SQC (per-SQ, nondeterministic, unwindowed)"></metric>
+  <metric name="SQC_DCACHE_INPUT_VALID_READYB" block=SQ event=260 descr="Input stalled by SQC (per-SQ, nondeterministic, unwindowed)"></metric>
+  <metric name="SQC_TC_REQ" block=SQ event=262 descr="Total number of TC requests that were issued by instruction and constant caches. (No-Masking, nondeterministic)"></metric>
+  <metric name="SQC_TC_INST_REQ" block=SQ event=263 descr="Number of instruction requests to the TC (No-Masking, nondeterministic)"></metric>
+  <metric name="SQC_TC_DATA_READ_REQ" block=SQ event=264 descr="Number of data read requests to the TC (No-Masking, nondeterministic)"></metric>
+  <metric name="SQC_TC_DATA_WRITE_REQ" block=SQ event=265 descr="Number of data write requests to the TC (No-Masking, nondeterministic)"></metric>
+  <metric name="SQC_TC_DATA_ATOMIC_REQ" block=SQ event=266 descr="Number of data atomic requests to the TC (No-Masking, nondeterministic)"></metric>
+  <metric name="SQC_TC_STALL" block=SQ event=267 descr="Valid request stalled TC request interface (no-credits). (No-Masking, nondeterministic, unwindowed)"></metric>
+  <metric name="SQC_ICACHE_BUSY_CYCLES" block=SQ event=269 descr="Clock cycles while cache is reporting that it is busy. (No-Masking, nondeterministic, unwindowed)"></metric>
+  <metric name="SQC_ICACHE_REQ" block=SQ event=270 descr="Number of requests. (per-SQ, per-Bank)"></metric>
+  <metric name="SQC_ICACHE_HITS" block=SQ event=271 descr="Number of cache hits. (per-SQ, per-Bank, nondeterministic)"></metric>
+  <metric name="SQC_ICACHE_MISSES" block=SQ event=272 descr="Number of cache misses, includes uncached requests. (per-SQ, per-Bank, nondeterministic)"></metric>
+  <metric name="SQC_ICACHE_MISSES_DUPLICATE" block=SQ event=273 descr="Number of misses that were duplicates (access to a non-resident, miss pending CL). (per-SQ, per-Bank, nondeterministic)" ></metric>
+  <metric name="SQC_DCACHE_BUSY_CYCLES" block=SQ event=289 descr="Clock cycles while cache is reporting that it is busy. (No-Masking, nondeterministic, unwindowed)"></metric>
+  <metric name="SQC_DCACHE_REQ" block=SQ event=290 descr="Number of requests (post-bank-serialization). (per-SQ, per-Bank)"></metric>
+  <metric name="SQC_DCACHE_HITS" block=SQ event=291 descr="Number of cache hits. (per-SQ, per-Bank, nondeterministic)"></metric>
+  <metric name="SQC_DCACHE_MISSES" block=SQ event=292 descr="Number of cache misses, includes uncached requests. (per-SQ, per-Bank, nondeterministic)"></metric>
+  <metric name="SQC_DCACHE_MISSES_DUPLICATE" block=SQ event=293 descr="Number of misses that were duplicates (access to a non-resident, miss pending CL). (per-SQ, per-Bank, nondeterministic)" ></metric>
+  <metric name="SQC_DCACHE_ATOMIC" block=SQ event=298 descr="Number of atomic requests. (per-SQ, per-Bank)"></metric>
+  <metric name="SQC_DCACHE_REQ_READ_1" block=SQ event=323 descr="Number of constant cache 1 dw read requests. (per-SQ)"></metric>
+  <metric name="SQC_DCACHE_REQ_READ_2" block=SQ event=324 descr="Number of constant cache 2 dw read requests. (per-SQ)"></metric>
+  <metric name="SQC_DCACHE_REQ_READ_4" block=SQ event=325 descr="Number of constant cache 4 dw read requests. (per-SQ)"></metric>
+  <metric name="SQC_DCACHE_REQ_READ_8" block=SQ event=326 descr="Number of constant cache 8 dw read requests. (per-SQ)"></metric>
+  <metric name="SQC_DCACHE_REQ_READ_16" block=SQ event=327 descr="Number of constant cache 16 dw read requests. (per-SQ)"></metric>
+  <metric name="TA_TA_BUSY" block=TA event=13 descr="TA block is busy. Perf_Windowing not supported for this counter."></metric>
+  <metric name="TA_TOTAL_WAVEFRONTS" block=TA event=29 descr="Total number of wavefronts processed by TA."></metric>
+  <metric name="TA_BUFFER_WAVEFRONTS" block=TA event=32 descr="Number of buffer wavefronts processed by TA."></metric>
+  <metric name="TA_BUFFER_READ_WAVEFRONTS" block=TA event=33 descr="Number of buffer read wavefronts processed by TA."></metric>
+  <metric name="TA_BUFFER_WRITE_WAVEFRONTS" block=TA event=34 descr="Number of buffer write wavefronts processed by TA."></metric>
+  <metric name="TA_BUFFER_ATOMIC_WAVEFRONTS" block=TA event=35 descr="Number of buffer atomic wavefronts processed by TA."></metric>
+  <metric name="TA_BUFFER_TOTAL_CYCLES" block=TA event=37 descr="Number of buffer cycles issued to TC."></metric>
+  <metric name="TA_BUFFER_COALESCED_READ_CYCLES" block=TA event=40 descr="Number of buffer coalesced read cycles issued to TC."></metric>
+  <metric name="TA_BUFFER_COALESCED_WRITE_CYCLES" block=TA event=41 descr="Number of buffer coalesced write cycles issued to TC."></metric>
+  <metric name="TA_ADDR_STALLED_BY_TC_CYCLES" block=TA event=42 descr="Number of cycles addr path stalled by TC. Perf_Windowing not supported for this counter."></metric>
+  <metric name="TA_ADDR_STALLED_BY_TD_CYCLES" block=TA event=43 descr="Number of cycles addr path stalled by TD. Perf_Windowing not supported for this counter."></metric>
+  <metric name="TA_DATA_STALLED_BY_TC_CYCLES" block=TA event=44 descr="Number of cycles data path stalled by TC. Perf_Windowing not supported for this counter."></metric>
+  <metric name="TA_FLAT_WAVEFRONTS" block=TA event=51 descr="Number of flat opcode wavefronts processed by the TA."></metric>
+  <metric name="TA_FLAT_READ_WAVEFRONTS" block=TA event=52 descr="Number of flat opcode reads processed by the TA."></metric>
+  <metric name="TA_FLAT_WRITE_WAVEFRONTS" block=TA event=53 descr="Number of flat opcode writes processed by the TA."></metric>
+  <metric name="TA_FLAT_ATOMIC_WAVEFRONTS" block=TA event=54 descr="Number of flat opcode atomics processed by the TA."></metric>
+  <metric name="TD_TD_BUSY" block=TD event=1 descr="TD is processing or waiting for data. Perf_Windowing not supported for this counter."></metric>
+  <metric name="TD_TC_STALL" block=TD event=12 descr="TD is stalled waiting for TC data."></metric>
+  <metric name="TD_SPI_STALL" block=TD event=15 descr="TD is stalled SPI vinit"></metric>
+  <metric name="TD_LOAD_WAVEFRONT" block=TD event=16 descr="Count the wavefronts with opcode = load, include atomics and store."></metric>
+  <metric name="TD_ATOMIC_WAVEFRONT" block=TD event=17 descr="Count the wavefronts with opcode = atomic."></metric>
+  <metric name="TD_STORE_WAVEFRONT" block=TD event=18 descr="Count the wavefronts with opcode = store."></metric>
+  <metric name="TD_COALESCABLE_WAVEFRONT" block=TD event=21 descr="Count wavefronts that TA finds coalescable."></metric>
+  <metric name="TCP_GATE_EN1" block=TCP event=0 descr="TCP interface clocks are turned on. Not Windowed."></metric>
+  <metric name="TCP_GATE_EN2" block=TCP event=1 descr="TCP core clocks are turned on. Not Windowed."></metric>
+  <metric name="TCP_TD_TCP_STALL_CYCLES" block=TCP event=7 descr="TD stalls TCP"></metric>
+  <metric name="TCP_TCR_TCP_STALL_CYCLES" block=TCP event=8 descr="TCR stalls TCP_TCR_req interface"></metric>
+  <metric name="TCP_READ_TAGCONFLICT_STALL_CYCLES" block=TCP event=10 descr="Tagram conflict stall on a read"></metric>
+  <metric name="TCP_WRITE_TAGCONFLICT_STALL_CYCLES" block=TCP event=11 descr="Tagram conflict stall on a write"></metric>
+  <metric name="TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES" block=TCP event=12 descr="Tagram conflict stall on an atomic"></metric>
+  <metric name="TCP_PENDING_STALL_CYCLES" block=TCP event=21 descr="Stall due to data pending from L2"></metric>
+  <metric name="TCP_TA_TCP_STATE_READ" block=TCP event=25 descr="Number of state reads"></metric>
+  <metric name="TCP_VOLATILE" block=TCP event=26 descr="Total number of L1 volatile pixels/buffers from TA"></metric>
+  <metric name="TCP_TOTAL_ACCESSES" block=TCP event=27 descr="Total number of pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD"></metric>
+  <metric name="TCP_TOTAL_READ" block=TCP event=28 descr="Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ"></metric>
+  <metric name="TCP_TOTAL_WRITE" block=TCP event=30 descr="Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+ TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE"></metric>
+  <metric name="TCP_TOTAL_ATOMIC_WITH_RET" block=TCP event=36 descr="Total number of atomic with return pixels/buffers from TA"></metric>
+  <metric name="TCP_TOTAL_ATOMIC_WITHOUT_RET" block=TCP event=37 descr="Total number of atomic without return pixels/buffers from TA"></metric>
+  <metric name="TCP_TOTAL_WRITEBACK_INVALIDATES" block=TCP event=43 descr="Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. Not Windowed."></metric>
+  <metric name="TCP_UTCL1_REQUEST" block=TCP event=45 descr="Total CLIENT_UTCL1 NORMAL requests"></metric>
+  <metric name="TCP_UTCL1_TRANSLATION_MISS" block=TCP event=47 descr="Total utcl1 translation misses"></metric>
+  <metric name="TCP_UTCL1_TRANSLATION_HIT" block=TCP event=48 descr="Total utcl1 translation hits"></metric>
+  <metric name="TCP_UTCL1_PERMISSION_MISS" block=TCP event=49 descr="Total utcl1 permission misses"></metric>
+  <metric name="TCP_TOTAL_CACHE_ACCESSES" block=TCP event=60 descr="Count of total cache line (tag) accesses (includes hits and misses)."></metric>
+  <metric name="TCP_TCC_READ_REQ" block=TCP event=65 descr="Total read requests from TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_WRITE_REQ" block=TCP event=66 descr="Total write requests from TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_ATOMIC_WITH_RET_REQ" block=TCP event=67 descr="Total atomic with return requests from TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_ATOMIC_WITHOUT_RET_REQ" block=TCP event=68 descr="Total atomic without return requests from TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_NC_READ_REQ" block=TCP event=71 descr="Total read requests with NC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_NC_WRITE_REQ" block=TCP event=72 descr="Total write requests with NC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_NC_ATOMIC_REQ" block=TCP event=73 descr="Total atomic requests with NC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_UC_READ_REQ" block=TCP event=74 descr="Total read requests with UC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_UC_WRITE_REQ" block=TCP event=75 descr="Total write requests with UC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_UC_ATOMIC_REQ" block=TCP event=76 descr="Total atomic requests with UC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_CC_READ_REQ" block=TCP event=77 descr="Total read requests with CC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_CC_WRITE_REQ" block=TCP event=78 descr="Total write requests with CC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_CC_ATOMIC_REQ" block=TCP event=79 descr="Total atomic requests with CC mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_RW_READ_REQ" block=TCP event=80 descr="Total read requests with RW mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_RW_WRITE_REQ" block=TCP event=81 descr="Total write requests with RW mtype from this TCP to all TCCs"></metric>
+  <metric name="TCP_TCC_RW_ATOMIC_REQ" block=TCP event=82 descr="Total atomic requests with RW mtype from this TCP to all TCCs"></metric>
+  <metric name="TCA_CYCLE" block=TCA event=1 descr="Number of cycles. Not windowable."></metric>
+  <metric name="TCA_BUSY" block=TCA event=2 descr="Number of cycles we have a request pending. Not windowable."></metric>
+  <metric name="TCC_CYCLE" block=TCC event=1 descr="Number of cycles. Not windowable."></metric>
+  <metric name="TCC_BUSY" block=TCC event=2 descr="Number of cycles we have a request pending. Not windowable."></metric>
+  <metric name="TCC_REQ" block=TCC event=3 descr="Number of requests of all types. This is measured at the tag block. This may be more than the number of requests arriving at the TCC, but it is a good indication of the total amount of work that needs to be performed."></metric>
+  <metric name="TCC_STREAMING_REQ" block=TCC event=4 descr="Number of streaming requests. This is measured at the tag block."></metric>
+  <metric name="TCC_NC_REQ" block=TCC event=5 descr="The number of noncoherently cached requests. This is measured at the tag block."></metric>
+  <metric name="TCC_UC_REQ" block=TCC event=6 descr="The number of uncached requests. This is measured at the tag block."></metric>
+  <metric name="TCC_CC_REQ" block=TCC event=7 descr="The number of coherently cached requests. This is measured at the tag block."></metric>
+  <metric name="TCC_RW_REQ" block=TCC event=8 descr="The number of RW requests. This is measured at the tag block."></metric>
+  <metric name="TCC_PROBE" block=TCC event=9 descr="Number of probe requests. Not windowable."></metric>
+  <metric name="TCC_PROBE_ALL" block=TCC event=10 descr="Number of external probe requests with EA_TCC_preq_all== 1. Not windowable."></metric>
+  <metric name="TCC_INTERNAL_PROBE" block=TCC event=11 descr="Number of self-probes spawned by TCC for CC writes/atomic operations. Not windowable."></metric>
+  <metric name="TCC_READ" block=TCC event=12 descr="Number of read requests. Compressed reads are included in this, but metadata reads are not included."></metric>
+  <metric name="TCC_WRITE" block=TCC event=13 descr="Number of write requests."></metric>
+  <metric name="TCC_ATOMIC" block=TCC event=14 descr="Number of atomic requests of all types."></metric>
+  <metric name="TCC_HIT" block=TCC event=17 descr="Number of cache hits."></metric>
+  <metric name="TCC_MISS" block=TCC event=19 descr="Number of cache misses. UC reads count as misses."></metric>
+  <metric name="TCC_WRITEBACK" block=TCC event=22 descr="Number of lines written back to main memory. This includes writebacks of dirty lines and uncached write/atomic requests."></metric>
+  <metric name="TCC_EA0_WRREQ" block=TCC event=26 descr="Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands."></metric>
+  <metric name="TCC_EA0_WRREQ_64B" block=TCC event=27 descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface."></metric>
+  <metric name="TCC_EA0_WRREQ_PROBE_COMMAND" block=TCC event=28 descr="Number of probe commands going over the TC_EA_wrreq interface."></metric>
+  <metric name="TCC_EA0_WR_UNCACHED_32B" block=TCC event=29 descr="Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC mtypes can produce uncached requests, and those are included in this. A 64-byte request will be counted as 2"></metric>
+  <metric name="TCC_EA0_WRREQ_STALL" block=TCC event=30 descr="Number of cycles a write request was stalled."></metric>
+  <metric name="TCC_EA0_WRREQ_IO_CREDIT_STALL" block=TCC event=31 descr="Number of cycles an EA write request was stalled because the interface was out of IO credits."></metric>
+  <metric name="TCC_EA0_WRREQ_GMI_CREDIT_STALL" block=TCC event=32 descr="Number of cycles an EA write request was stalled because the interface was out of GMI credits."></metric>
+  <metric name="TCC_EA0_WRREQ_DRAM_CREDIT_STALL" block=TCC event=33 descr="Number of cycles an EA write request was stalled because the interface was out of DRAM credits."></metric>
+  <metric name="TCC_TOO_MANY_EA_WRREQS_STALL" block=TCC event=34 descr="Number of cycles the TCC could not send an EA write request because it already reached its maximum number of pending EA write requests."></metric>
+  <metric name="TCC_EA0_WRREQ_LEVEL" block=TCC event=35 descr="The sum of the number of EA write requests in flight. This is primarily meant for measuring average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ."></metric>
+  <metric name="TCC_EA0_ATOMIC" block=TCC event=36 descr="Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests."></metric>
+  <metric name="TCC_EA0_ATOMIC_LEVEL" block=TCC event=37 descr="The sum of the number of EA atomics in flight. This is primarily meant for measuring average EA atomic latency. Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC."></metric>
+  <metric name="TCC_EA0_RDREQ" block=TCC event=38 descr="Number of TCC/EA read requests (either 32-byte or 64-byte)"></metric>
+  <metric name="TCC_EA0_RDREQ_32B" block=TCC event=39 descr="Number of 32-byte TCC/EA read requests"></metric>
+  <metric name="TCC_EA0_RD_UNCACHED_32B" block=TCC event=40 descr="Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted as 2"></metric>
+  <metric name="TCC_EA0_RDREQ_IO_CREDIT_STALL" block=TCC event=41 descr="Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur regardless of whether a read needed to be performed or not."></metric>
+  <metric name="TCC_EA0_RDREQ_GMI_CREDIT_STALL" block=TCC event=42 descr="Number of cycles there was a stall because the read request interface was out of GMI credits. Stalls occur regardless of whether a read needed to be performed or not."></metric>
+  <metric name="TCC_EA0_RDREQ_DRAM_CREDIT_STALL" block=TCC event=43 descr="Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur regardless of whether a read needed to be performed or not."></metric>
+  <metric name="TCC_EA0_RDREQ_LEVEL" block=TCC event=44 descr="The sum of the number of TCC/EA read requests in flight. This is primarily meant for measuring average EA read latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ."></metric>
+  <metric name="TCC_TAG_STALL" block=TCC event=45 descr="Number of cycles the normal request pipeline in the tag was stalled for any reason. Normally, stalls of this nature are measured exactly from one point in the pipeline, but that is not the case for this counter. Probes can stall the pipeline at a variety of places, and there is no single point that can reasonably measure the total stalls accurately."></metric>
+  <metric name="TCC_NORMAL_WRITEBACK" block=TCC event=68 descr="Number of writebacks due to requests that are not writeback requests."></metric>
+  <metric name="TCC_ALL_TC_OP_WB_WRITEBACK" block=TCC event=73 descr="Number of writebacks due to all TC_OP writeback requests."></metric>
+  <metric name="TCC_NORMAL_EVICT" block=TCC event=74 descr="Number of evictions due to requests that are not invalidate or probe requests."></metric>
+  <metric name="TCC_ALL_TC_OP_INV_EVICT" block=TCC event=80 descr="Number of evictions due to all TC_OP invalidate requests."></metric>
+  <metric name="TCC_PROBE_EVICT" block=TCC event=81 descr="Number of evictions/invalidations due to probes. Not windowable."></metric>
+  <metric name="TCC_EA0_RDREQ_DRAM" block=TCC event=102 descr="Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC)."></metric>
+  <metric name="TCC_EA0_WRREQ_DRAM" block=TCC event=103 descr="Number of TCC/EA write requests (either 32-byte or 64-byte) destined for DRAM (MC)."></metric>
+</gfx940>
+
+<gfx941 base="gfx940"></gfx941>
+<gfx942 base="gfx940"></gfx942>
+
+<gfx10>
+  <metric name="GRBM_COUNT" block=GRBM event=0 descr="Tie High - Count Number of Clocks"></metric>
+  <metric name="GRBM_GUI_ACTIVE" block=GRBM event=2 descr="The GUI is Active"></metric>
+  <metric name="GRBM_CP_BUSY" block=GRBM event=3 descr="Any of the Command Processor (CPG/CPC/CPF) blocks are busy."></metric>
+  <metric name="GRBM_SPI_BUSY" block=GRBM event=11 descr="Any of the Shader Pipe Interpolators (SPI) are busy in the shader engine(s)."></metric>
+  <metric name="GRBM_TA_BUSY" block=GRBM event=13 descr="Any of the Texture Pipes (TA) are busy in the shader engine(s)."></metric>
+  <metric name="GRBM_GDS_BUSY" block=GRBM event=25 descr="The Global Data Share (GDS) is busy."></metric>
+  <metric name="GRBM_EA_BUSY" block=GRBM event=35 descr="The Efficiency Arbiter (EA) block is busy."></metric>
+  <metric name="GRBM_GL2CC_BUSY" block=GRBM event=40 descr="The GL2CC block is busy."></metric>
+
+  <metric name="GL2C_HIT" block=GL2C event=42 descr="Number of cache hits"></metric>
+  <metric name="GL2C_MISS" block=GL2C event=43 descr="Number of cache misses.  UC reads count as misses."></metric>
+  <metric name="GL2C_MC_WRREQ" block=GL2C event=83 descr="Number of transactions (either 32-byte or 64-byte) going over the GL2C_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands"></metric>
+  <metric name="GL2C_EA_WRREQ_64B" block=GL2C event=85 descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface."></metric>
+  <metric name="GL2C_MC_WRREQ_STALL" block=GL2C event=88 descr="Number of cycles a write request was stalled."></metric>
+  <metric name="GL2C_MC_RDREQ" block=GL2C event=96 descr="Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte)."></metric>
+  <metric name="GL2C_EA_RDREQ_32B" block=GL2C event=99 descr="Number of 32-byte GL2C/EA read requests"></metric>
+  <metric name="GL2C_EA_RDREQ_64B" block=GL2C event=100 descr="Number of 64-byte GL2C/EA read requests"></metric>
+  <metric name="GL2C_EA_RDREQ_96B" block=GL2C event=101 descr="Number of 96-byte GL2C/EA read requests"></metric>
+  <metric name="GL2C_EA_RDREQ_128B" block=GL2C event=102 descr="Number of 128-byte GL2C/EA read requests"></metric>
+
+  <metric name="SQ_ACCUM_PREV" block=SQ event=1 descr="For counter N, increment by the value of counter N-1."></metric>
+  <metric name="SQ_BUSY_CYCLES" block=SQ event=3 descr="Clock cycles while SQ is reporting that it is busy. {nondeterministic, global, C2}"></metric>
+  <metric name="SQ_WAVES" block=SQ event=4 descr="Count number of waves sent to SQs. {emulated, global, C1}"></metric>
+  <metric name="SQ_LEVEL_WAVES" block=SQ event=7 descr="Track the aggregated number of waves over certain period of time, Set next counter to ACCUM_PREV and divide by SQ_PERF_SEL_WAVES for average wave life."></metric>
+  <metric name="SQ_WAVE_CYCLES" block=SQ event=26 descr="Number of clock cycles spent by waves in the SQs. Incremented by # of living (valid) waves each cycle. {nondeterministic, C1}"></metric>
+  <metric name="SQ_WAIT_INST_ANY" block=SQ event=28 descr="Number of clock cycles spent waiting for any instruction issue. In units of cycles. {nondeterministic}"></metric>
+  <metric name="SQ_WAIT_ANY" block=SQ event=37 descr="Number of clock cycles spent waiting for anything. {nondeterministic, C1}"></metric>
+  <metric name="SQ_INSTS_WAVE32" block=SQ event=71 descr="Number of wave32 instructions issued, for flat, lds, valu, tex. {emulated, C1}"></metric>
+  <metric name="SQ_INSTS_WAVE32_LDS" block=SQ event=74 descr="Number of wave32 LDS indexed instructions issued. Wave64 may count 1 or 2, depending on what gets issued. {emulated, C1}"></metric>
+  <metric name="SQ_INSTS_WAVE32_VALU" block=SQ event=75 descr="Number of wave32 valu instructions issued. Wave64 may count 1 or 2, depending on what gets issued. {emulated, C1}"></metric>
+  <metric name="SQ_WAVE32_INSTS" block=SQ event=84 descr="Number of instructions issued by wave32 waves. Skipped instructions are not counted. {emulated}"></metric>
+  <metric name="SQ_WAVE64_INSTS" block=SQ event=85 descr="Number of instructions issued by wave64 waves. Skipped instructions are not counted. {emulated}"></metric>
+  <metric name="SQ_INST_LEVEL_GDS" block=SQ event=98 descr="Number of in-flight GDS instructions. Set next counter to ACCUM_PREV and divide by INSTS_GDS for average latency. {level, nondeterministic, C1}"></metric>
+  <metric name="SQ_INST_LEVEL_LDS" block=SQ event=99 descr="Number of in-flight LDS instructions. Set next counter to ACCUM_PREV and divide by INSTS_LDS for average latency. Includes FLAT instructions. {level, nondeterministic, C1}"></metric>
+  <metric name="SQ_INST_CYCLES_VMEM" block=SQ event=120 descr="Number of cycles needed to send addr and data for VMEM (lds, buffer, image, flat, scratch, global) instructions, windowed by perf_en. {emulated, C1}"></metric>
+  <metric name="SQC_LDS_BANK_CONFLICT" block=SQ event=285 descr="Number of cycles LDS is stalled by bank conflicts. (emulated, C1)"></metric>
+  <metric name="SQC_LDS_IDX_ACTIVE" block=SQ event=290 descr="Number of cycles LDS is used for indexed (non-direct,non-interpolation) operations. {per-simd, emulated, C1}"></metric>
+  <metric name="SQ_INSTS_VALU" block=SQ event=64 descr="Number of VALU instructions issued excluding skipped instructions. {emulated, C1}"></metric>
+  <metric name="SQ_INSTS_SALU" block=SQ event=60 descr="Number of SALU instructions issued. {emulated, C1}"></metric>
+  <metric name="SQ_INSTS_SMEM" block=SQ event=61 descr="Number of SMEM instructions issued. {emulated, C1}"></metric>
+  <metric name="SQ_INSTS_FLAT" block=SQ event=57 descr="Number of FLAT instructions issued. {emulated, C2}"></metric>
+  <metric name="SQ_INSTS_LDS" block=SQ event=59 descr="Number of LDS indexed instructions issued. {emulated, C1}"></metric>
+  <metric name="SQ_INSTS_GDS" block=SQ event=55 descr="Number of GDS instructions issued. {emulated, C1}"></metric>
+  <metric name="SQ_WAIT_INST_LDS" block=SQ event=31 descr="Number of clock cycles spent waiting for LDS (indexed) instruction issue. In units of cycles. {nondeterministic, C1}"></metric>
+  
+  <metric name="TA_TA_BUSY" block=TA event=15 descr="TA block is busy. Perf_Windowing not supported for this counter."></metric>
+  <metric name="TA_FLAT_LOAD_WAVEFRONTS" block=TA event=101 descr="Number of flat load vec32 packets processed by TA, same as flat_read_wavefronts in earlier IP"></metric>
+  <metric name="TA_FLAT_STORE_WAVEFRONTS" block=TA event=102 descr="Number of flat store vec32 packets processed by TA, same as flat_write_wavefronts in earlier IP"></metric>
+</gfx10>
+
+<gfx1030 base="gfx10">
+</gfx1030>
+
+<gfx1031 base="gfx10">
+</gfx1031>
+
+<gfx1032 base="gfx10">
+</gfx1032>
+
+<gfx11>
+  <metric name="GRBM_COUNT" block=GRBM event=0 descr="Tie High - Count Number of Clocks"></metric>
+  <metric name="GRBM_GUI_ACTIVE" block=GRBM event=2 descr="The GUI is Active"></metric>
+  <metric name="GL2C_HIT" block=GL2C event=42 descr="Number of cache hits"></metric>
+  <metric name="GL2C_MISS" block=GL2C event=43 descr="Number of cache misses.  UC reads count as misses."></metric>
+  <metric name="GL2C_MC_WRREQ" block=GL2C event=83 descr="Number of transactions (either 32-byte or 64-byte) going over the GL2C_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands"></metric>
+  <metric name="GL2C_EA_WRREQ_64B" block=GL2C event=85 descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface."></metric>
+  <metric name="GL2C_MC_WRREQ_STALL" block=GL2C event=88 descr="Number of cycles a write request was stalled."></metric>
+  <metric name="GL2C_MC_RDREQ" block=GL2C event=96 descr="Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte)."></metric>
+  <metric name="GL2C_EA_RDREQ_32B" block=GL2C event=99 descr="Number of 32-byte GL2C/EA read requests"></metric>
+  <metric name="GL2C_EA_RDREQ_64B" block=GL2C event=100 descr="Number of 64-byte GL2C/EA read requests"></metric>
+  <metric name="GL2C_EA_RDREQ_96B" block=GL2C event=101 descr="Number of 96-byte GL2C/EA read requests"></metric>
+  <metric name="GL2C_EA_RDREQ_128B" block=GL2C event=102 descr="Number of 128-byte GL2C/EA read requests"></metric>
+  <metric name="SQ_ACCUM_PREV" block=SQ event=1 descr="For counter N, increment by the value of counter N-1."></metric>
+  <metric name="SQ_BUSY_CYCLES" block=SQ event=3 descr="Clock cycles while SQ is reporting that it is busy. {nondeterministic, global, C2}"></metric>
+  <metric name="SQ_WAVES" block=SQ event=4 descr="Count number of waves sent to SQs. {emulated, global, C1}"></metric>
+  <metric name="SQ_WAVE_CYCLES" block=SQ event=24 descr="Number of clock cycles spent by waves in the SQs. Incremented by number of living (valid) waves each cycle. {nondeterministic, C1}"></metric>
+  <metric name="SQ_WAIT_INST_ANY" block=SQ event=26 descr="Number of clock-cycles spent waiting for any instruction issue. In units of cycles. (nondeterministic)"></metric>
+  <metric name="SQ_WAIT_ANY" block=SQ event=35 descr="Number of wave-cycles spent waiting for anything (nondeterministic, C1)"></metric>
+  <metric name="SQ_INSTS_WAVE32" block=SQ event=70 descr="Number of wave32 instructions issued, for flat, lds, valu, tex. {emulated, C1}"></metric>
+  <metric name="SQ_INSTS_WAVE32_LDS" block=SQ event=72 descr="Number of wave32 LDS indexed instructions issued. Wave64 may count 1 or 2, depending on what gets issued. {emulated, C1}"></metric>
+  <metric name="SQ_INSTS_WAVE32_VALU" block=SQ event=73 descr="Number of wave32 valu instructions issued. Wave64 may count 1 or 2, depending on what gets issued. {emulated, C1}"></metric>
+  <metric name="SQ_WAVE32_INSTS" block=SQ event=82 descr="Number of instructions issued by wave32 waves. Skipped instructions are not counted. {emulated}"></metric>
+  <metric name="SQ_WAVE64_INSTS" block=SQ event=83 descr="Number of instructions issued by wave64 waves. Skipped instructions are not counted. {emulated}"></metric>
+  <metric name="SQ_INST_LEVEL_GDS" block=SQ event=87 descr="Number of in-flight GDS instructions. Set next counter to ACCUM_PREV and divide by INSTS_GDS for average latency. {level, nondeterministic, C1}"></metric>
+  <metric name="SQ_INST_LEVEL_LDS" block=SQ event=88 descr="Number of in-flight LDS instructions. Set next counter to ACCUM_PREV and divide by INSTS_LDS for average latency. Includes FLAT instructions. {level, nondeterministic, C1}"></metric>
+  <metric name="SQ_INST_CYCLES_VMEM" block=SQ event=106 descr="Number of cycles needed to send addr and data for VMEM (lds, buffer, image, flat, scratch, global) instructions, windowed by perf_en. {emulated, C1}"></metric>
+  <metric name="SQC_LDS_BANK_CONFLICT" block=SQ event=256 descr="Number of cycles LDS is stalled by bank conflicts. (emulated, C1)"></metric>
+  <metric name="SQC_LDS_IDX_ACTIVE" block=SQ event=261 descr="Number of cycles LDS is used for indexed (non-direct,non-interpolation) operations. {per-simd, emulated, C1}"></metric>
+  <metric name="SQ_INSTS_VALU" block=SQ event=62 descr="Number of VALU instructions issued excluding skipped instructions. {emulated, C1}"></metric>
+  <metric name="SQ_INSTS_SALU" block=SQ event=58 descr="Number of SALU instructions issued. {emulated, C1}"></metric>
+  <metric name="SQ_INSTS_SMEM" block=SQ event=59 descr="Number of SMEM instructions issued. {emulated, C1}"></metric>
+  <metric name="SQ_INSTS_FLAT" block=SQ event=56 descr="Number of FLAT instructions issued. {emulated, C2}"></metric>
+  <metric name="SQ_INSTS_LDS" block=SQ event=57 descr="Number of LDS indexed instructions issued. {emulated, C1}"></metric>
+  <metric name="SQ_INSTS_GDS" block=SQ event=54 descr="Number of GDS instructions issued. {emulated, C1}"></metric>
+  <metric name="SQ_INSTS_TEX_LOAD" block=SQ event=66 descr="Number of buffer load, image load, sample, or atomic (with return) instructions issued. {emulated, C1}"></metric>
+  <metric name="SQ_INSTS_TEX_STORE" block=SQ event=67 descr="Number of buffer store, image store, or atomic (without return) instructions issued. {emulated, C1}"></metric>
+  <metric name="SQ_WAIT_INST_LDS" block=SQ event=29 descr="Number of clock cycles spent waiting for LDS (indexed) instruction issue. In units of cycles. {nondeterministic, C1}"></metric>
+  <metric name="TA_TA_BUSY" block=TA event=15 descr="TA block is busy. Perf_Windowing not supported for this counter."></metric>
+  <metric name="TA_BUFFER_LOAD_WAVEFRONTS" block=TA event=45 descr="Number of buffer load vec32 packets processed by TA"></metric>
+  <metric name="TA_BUFFER_STORE_WAVEFRONTS" block=TA event=46 descr="Number of buffer store vec32 packets processed by TA"></metric>
+</gfx11>
+
+<gfx1100 base="gfx11">
+</gfx1100>
+
+<gfx1101 base="gfx11">
+</gfx1101>
+
@@ -0,0 +1,585 @@
+<gfx8_expr>
+  <metric name="KERNEL_DURATION" expr=1 descr="The duration of the kernel dispatch"></metric>
+  <metric name="TA_BUSY_avr" expr=avr(TA_TA_BUSY,16) descr="TA block is busy. Average over TA instances."></metric>
+  <metric name="TA_BUSY_max" expr=max(TA_TA_BUSY,16) descr="TA block is busy. Max over TA instances."></metric>
+  <metric name="TA_BUSY_min" expr=min(TA_TA_BUSY,16) descr="TA block is busy. Min over TA instances."></metric>
+  <metric name="TA_FLAT_READ_WAVEFRONTS_sum" expr=sum(TA_FLAT_READ_WAVEFRONTS,16) descr="Number of flat opcode reads processed by the TA. Sum over TA instances."></metric>
+  <metric name="TA_FLAT_WRITE_WAVEFRONTS_sum" expr=sum(TA_FLAT_WRITE_WAVEFRONTS,16) descr="Number of flat opcode writes processed by the TA. Sum over TA instances."></metric>
+
+  <metric name="TCC_HIT_sum" expr=sum(TCC_HIT,16) descr="Number of cache hits. Sum over TCC instances."></metric>
+  <metric name="TCC_MISS_sum" expr=sum(TCC_MISS,16) descr="Number of cache misses. Sum over TCC instances."></metric>
+  <metric name="TCC_MC_RDREQ_sum" expr=sum(TCC_MC_RDREQ,16) descr="Number of 32-byte reads. Sum over TCC instances."></metric>
+  <metric name="TCC_MC_WRREQ_sum" expr=sum(TCC_MC_WRREQ,16) descr="Number of 32-byte transactions going over the TC_MC_wrreq interface. Sum over TCC instances."></metric>
+  <metric name="TCC_WRREQ_STALL_max" expr=max(TCC_MC_WRREQ_STALL,16) descr="Number of cycles a write request was stalled. Max over TCC instances."></metric>
+
+  <metric name="FETCH_SIZE" expr=(TCC_MC_RDREQ_sum*32)/1024 descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="WRITE_SIZE" expr=(TCC_MC_WRREQ_sum*32)/1024 descr="The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="WRITE_REQ_32B" expr=TCC_MC_WRREQ_sum descr="The total number of 32-byte effective memory writes."></metric>
+  <metric name="VFetchInsts" expr=(SQ_INSTS_VMEM_RD-TA_FLAT_READ_WAVEFRONTS_sum)/SQ_WAVES descr="The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory."></metric>
+  <metric name="VWriteInsts" expr=(SQ_INSTS_VMEM_WR-TA_FLAT_WRITE_WAVEFRONTS_sum)/SQ_WAVES descr="The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory."></metric>
+  <metric name="FlatVMemInsts" expr=(SQ_INSTS_FLAT-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES descr="The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch."></metric>
+  <metric name="LDSInsts" expr=(SQ_INSTS_LDS-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES descr="The average number of LDS read or LDS write instructions executed per work item (affected by flow control).  Excludes FLAT instructions that read from or write to LDS."></metric>
+  <metric name="FlatLDSInsts" expr=SQ_INSTS_FLAT_LDS_ONLY/SQ_WAVES descr="The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control)."></metric>
+  <metric name="VALUUtilization" expr=100*SQ_THREAD_CYCLES_VALU/(SQ_ACTIVE_INST_VALU*MAX_WAVE_SIZE) descr="The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence)."></metric>
+  <metric name="VALUBusy" expr=100*SQ_ACTIVE_INST_VALU*4/SIMD_NUM/GRBM_GUI_ACTIVE descr="The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."></metric>
+  <metric name="SALUBusy" expr=100*SQ_INST_CYCLES_SALU*4/SIMD_NUM/GRBM_GUI_ACTIVE descr="The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."></metric>
+  <metric name="FetchSize" expr=FETCH_SIZE descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="WriteSize" expr=WRITE_SIZE descr="The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="MemWrites32B" expr=WRITE_REQ_32B descr="The total number of effective 32B write transactions to the memory"></metric>
+  <metric name="L2CacheHit" expr=100*sum(TCC_HIT,16)/(sum(TCC_HIT,16)+sum(TCC_MISS,16)) descr="The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal)."></metric>
+  <metric name="MemUnitStalled" expr=100*max(TCP_TCP_TA_DATA_STALL_CYCLES,16)/GRBM_GUI_ACTIVE/SE_NUM descr="The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad)."></metric>
+  <metric name="WriteUnitStalled" expr=100*TCC_WRREQ_STALL_max/GRBM_GUI_ACTIVE descr="The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad)."></metric>
+  # LDSBankConflict The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad).
+  <metric name="LDSBankConflict" expr=100*SQ_LDS_BANK_CONFLICT/GRBM_GUI_ACTIVE/CU_NUM descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."></metric>
+</gfx8_expr>
+
+<gfx9_expr>
+  <metric name="KERNEL_DURATION" expr=1 descr="The duration of the kernel dispatch"></metric>
+  <metric name="TA_BUSY_avr" expr=avr(TA_TA_BUSY,16) descr="TA block is busy. Average over TA instances."></metric>
+  <metric name="TA_BUSY_max" expr=max(TA_TA_BUSY,16) descr="TA block is busy. Max over TA instances."></metric>
+  <metric name="TA_BUSY_min" expr=min(TA_TA_BUSY,16) descr="TA block is busy. Min over TA instances."></metric>
+  <metric name="TA_FLAT_READ_WAVEFRONTS_sum" expr=sum(TA_FLAT_READ_WAVEFRONTS,16) descr="Number of flat opcode reads processed by the TA. Sum over TA instances."></metric>
+  <metric name="TA_FLAT_WRITE_WAVEFRONTS_sum" expr=sum(TA_FLAT_WRITE_WAVEFRONTS,16) descr="Number of flat opcode writes processed by the TA. Sum over TA instances."></metric>
+
+  <metric name="TCC_HIT_sum" expr=sum(TCC_HIT,16) descr="Number of cache hits. Sum over TCC instances."></metric>
+  <metric name="TCC_MISS_sum" expr=sum(TCC_MISS,16) descr="Number of cache misses. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_RDREQ_32B_sum" expr=sum(TCC_EA_RDREQ_32B,16) descr="Number of 32-byte TCC/EA read requests. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_RDREQ_sum" expr=sum(TCC_EA_RDREQ,16) descr="Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances."></metric>
+  <metric name="TCC_EA_WRREQ_sum" expr=sum(TCC_EA_WRREQ,16) descr="Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_WRREQ_64B_sum" expr=sum(TCC_EA_WRREQ_64B,16) descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances."></metric>
+  <metric name="TCC_WRREQ_STALL_max" expr=max(TCC_EA_WRREQ_STALL,16) descr="Number of cycles a write request was stalled. Max over TCC instances."></metric>
+
+  <metric name="TCP_TCP_TA_DATA_STALL_CYCLES_sum" expr=sum(TCP_TCP_TA_DATA_STALL_CYCLES,16) descr="Total number of TCP stalls TA data interface."></metric>
+  <metric name="TCP_TCP_TA_DATA_STALL_CYCLES_max" expr=max(TCP_TCP_TA_DATA_STALL_CYCLES,16) descr="Maximum number of TCP stalls TA data interface."></metric>
+
+  <metric name="FETCH_SIZE" expr=(TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64)/1024 descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="WRITE_SIZE" expr=((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024 descr="The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="WRITE_REQ_32B" expr=TCC_EA_WRREQ_64B_sum*2+(TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum) descr="The total number of 32-byte effective memory writes."></metric>
+  <metric name="VFetchInsts" expr=(SQ_INSTS_VMEM_RD-TA_FLAT_READ_WAVEFRONTS_sum)/SQ_WAVES descr="The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory."></metric>
+  <metric name="VWriteInsts" expr=(SQ_INSTS_VMEM_WR-TA_FLAT_WRITE_WAVEFRONTS_sum)/SQ_WAVES descr="The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory."></metric>
+  <metric name="FlatVMemInsts" expr=(SQ_INSTS_FLAT-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES descr="The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch."></metric>
+  <metric name="LDSInsts" expr=(SQ_INSTS_LDS-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES descr="The average number of LDS read or LDS write instructions executed per work item (affected by flow control).  Excludes FLAT instructions that read from or write to LDS."></metric>
+  <metric name="FlatLDSInsts" expr=SQ_INSTS_FLAT_LDS_ONLY/SQ_WAVES descr="The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control)."></metric>
+  <metric name="VALUUtilization" expr=100*SQ_THREAD_CYCLES_VALU/(SQ_ACTIVE_INST_VALU*MAX_WAVE_SIZE) descr="The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence)."></metric>
+  <metric name="VALUBusy" expr=100*SQ_ACTIVE_INST_VALU*4/SIMD_NUM/GRBM_GUI_ACTIVE descr="The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."></metric>
+  <metric name="SALUBusy" expr=100*SQ_INST_CYCLES_SALU*4/SIMD_NUM/GRBM_GUI_ACTIVE descr="The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."></metric>
+  <metric name="FetchSize" expr=FETCH_SIZE descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="WriteSize" expr=WRITE_SIZE descr="The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="MemWrites32B" expr=WRITE_REQ_32B descr="The total number of effective 32B write transactions to the memory"></metric>
+  <metric name="L2CacheHit" expr=100*sum(TCC_HIT,16)/(sum(TCC_HIT,16)+sum(TCC_MISS,16)) descr="The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal)."></metric>
+  <metric name="MemUnitStalled" expr=100*TCP_TCP_TA_DATA_STALL_CYCLES_max/GRBM_GUI_ACTIVE/SE_NUM descr="The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad)."></metric>
+  <metric name="WriteUnitStalled" expr=100*TCC_WRREQ_STALL_max/GRBM_GUI_ACTIVE descr="The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad)."></metric>
+  # LDSBankConflict The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad).
+  <metric name="LDSBankConflict" expr=100*SQ_LDS_BANK_CONFLICT/GRBM_GUI_ACTIVE/CU_NUM descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."></metric>
+</gfx9_expr>
+
+<gfx906_expr base="gfx9_expr">
+  # EA1
+  <metric name="TCC_EA1_RDREQ_32B_sum" expr=sum(TCC_EA1_RDREQ_32B,16) descr="Number of 32-byte TCC/EA read requests. Sum over TCC EA1s."></metric>
+  <metric name="TCC_EA1_RDREQ_sum" expr=sum(TCC_EA1_RDREQ,16) descr="Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC EA1s."></metric>
+  <metric name="TCC_EA1_WRREQ_sum" expr=sum(TCC_EA1_WRREQ,16) descr="Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC EA1s."></metric>
+  <metric name="TCC_EA1_WRREQ_64B_sum" expr=sum(TCC_EA1_WRREQ_64B,16) descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC EA1s."></metric>
+  <metric name="TCC_WRREQ1_STALL_max" expr=max(TCC_EA1_WRREQ_STALL,16) descr="Number of cycles a write request was stalled. Max over TCC instances."></metric>
+
+  <metric name="RDATA1_SIZE" expr=(TCC_EA1_RDREQ_32B_sum*32+(TCC_EA1_RDREQ_sum-TCC_EA1_RDREQ_32B_sum)*64) descr="The total bytes fetched from the video memory. This is measured on EA1s."></metric>
+  <metric name="WDATA1_SIZE" expr=((TCC_EA1_WRREQ_sum-TCC_EA1_WRREQ_64B_sum)*32+TCC_EA1_WRREQ_64B_sum*64) descr="The total bytes written to the video memory. This is measured on EA1s."></metric>
+
+  # both EA0 and EA1 should be included
+  <metric name="FETCH_SIZE" expr=(TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64+RDATA1_SIZE)/1024 descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="WRITE_SIZE" expr=((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64+WDATA1_SIZE)/1024 descr="The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="WRITE_REQ_32B" expr=(TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)+(TCC_EA1_WRREQ_sum-TCC_EA1_WRREQ_64B_sum)+(TCC_EA_WRREQ_64B_sum+TCC_EA1_WRREQ_64B_sum)*2 descr="The total number of 32-byte effective memory writes."></metric>
+</gfx906_expr>
+
+<gfx908_expr base="gfx9_expr">
+  <metric name="TCC_HIT_sum" expr=sum(TCC_HIT,32) descr="Number of cache hits. Sum over TCC instances."></metric>
+  <metric name="TCC_MISS_sum" expr=sum(TCC_MISS,32) descr="Number of cache misses. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_RDREQ_32B_sum" expr=sum(TCC_EA_RDREQ_32B,32) descr="Number of 32-byte TCC/EA read requests. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_RDREQ_sum" expr=sum(TCC_EA_RDREQ,32) descr="Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances."></metric>
+  <metric name="TCC_EA_WRREQ_sum" expr=sum(TCC_EA_WRREQ,32) descr="Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_WRREQ_64B_sum" expr=sum(TCC_EA_WRREQ_64B,32) descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances."></metric>
+  <metric name="TCC_WRREQ_STALL_max" expr=max(TCC_EA_WRREQ_STALL,32) descr="Number of cycles a write request was stalled. Max over TCC instances."></metric>
+
+  <metric name="CU_UTILIZATION"                                         expr=GRBM_GUI_ACTIVE/GRBM_COUNT descr="The total number of active cycles divided by total number of elapsed cycles"></metric>
+</gfx908_expr>
+
+<gfx90a_expr base="gfx9_expr">
+  <metric name="MeanOccupancyPerCU" expr=SQ_LEVEL_WAVES*0+SQ_ACCUM_PREV_HIRES/GRBM_GUI_ACTIVE/CU_NUM descr="Mean occupancy per compute unit."></metric>
+  <metric name="MeanOccupancyPerActiveCU" expr=SQ_LEVEL_WAVES*0+SQ_ACCUM_PREV_HIRES*4/SQ_BUSY_CYCLES/CU_NUM descr="Mean occupancy per active compute unit."></metric>
+  <metric name="TA_BUSY_avr"                                            expr=avr(TA_TA_BUSY,16) descr="TA block is busy. Average over TA instances."></metric>
+  <metric name="TA_BUSY_max"                                            expr=max(TA_TA_BUSY,16) descr="TA block is busy. Max over TA instances."></metric>
+  <metric name="TA_BUSY_min"                                            expr=min(TA_TA_BUSY,16) descr="TA block is busy. Min over TA instances."></metric>
+  <metric name="TA_TA_BUSY_sum"                                         expr=sum(TA_TA_BUSY,16) descr="TA block is busy. Perf_Windowing not supported for this counter. Sum over TA instances."></metric>
+  <metric name="TA_TOTAL_WAVEFRONTS_sum"                                expr=sum(TA_TOTAL_WAVEFRONTS,16) descr="Total number of wavefronts processed by TA. Sum over TA instances."></metric>
+  <metric name="TA_ADDR_STALLED_BY_TC_CYCLES_sum"                       expr=sum(TA_ADDR_STALLED_BY_TC_CYCLES,16) descr="Number of cycles addr path stalled by TC. Perf_Windowing not supported for this counter. Sum over TA instances."></metric>
+  <metric name="TA_ADDR_STALLED_BY_TD_CYCLES_sum"                       expr=sum(TA_ADDR_STALLED_BY_TD_CYCLES,16) descr="Number of cycles addr path stalled by TD. Perf_Windowing not supported for this counter. Sum over TA instances."></metric>
+  <metric name="TA_DATA_STALLED_BY_TC_CYCLES_sum"                       expr=sum(TA_DATA_STALLED_BY_TC_CYCLES,16) descr="Number of cycles data path stalled by TC. Perf_Windowing not supported for this counter. Sum over TA instances."></metric>
+  <metric name="TA_FLAT_WAVEFRONTS_sum"                                 expr=sum(TA_FLAT_WAVEFRONTS,16) descr="Number of flat opcode wavefronts processed by the TA. Sum over TA instances."></metric>
+  <metric name="TA_FLAT_READ_WAVEFRONTS_sum"                            expr=sum(TA_FLAT_READ_WAVEFRONTS,16) descr="Number of flat opcode reads processed by the TA. Sum over TA instances."></metric>
+  <metric name="TA_FLAT_WRITE_WAVEFRONTS_sum"                           expr=sum(TA_FLAT_WRITE_WAVEFRONTS,16) descr="Number of flat opcode writes processed by the TA. Sum over TA instances."></metric>
+  <metric name="TA_FLAT_ATOMIC_WAVEFRONTS_sum"                          expr=sum(TA_FLAT_ATOMIC_WAVEFRONTS,16) descr="Number of flat opcode atomics processed by the TA. Sum over TA instances."></metric>
+  <metric name="TA_BUFFER_WAVEFRONTS_sum"                               expr=sum(TA_BUFFER_WAVEFRONTS,16) descr="Number of buffer wavefronts processed by TA. Sum over TA instances."></metric>
+  <metric name="TA_BUFFER_READ_WAVEFRONTS_sum"                          expr=sum(TA_BUFFER_READ_WAVEFRONTS,16) descr="Number of buffer read wavefronts processed by TA. Sum over TA instances."></metric>
+  <metric name="TA_BUFFER_WRITE_WAVEFRONTS_sum"                         expr=sum(TA_BUFFER_WRITE_WAVEFRONTS,16) descr="Number of buffer write wavefronts processed by TA. Sum over TA instances."></metric>
+  <metric name="TA_BUFFER_ATOMIC_WAVEFRONTS_sum"                        expr=sum(TA_BUFFER_ATOMIC_WAVEFRONTS,16) descr="Number of buffer atomic wavefronts processed by TA. Sum over TA instances."></metric>
+  <metric name="TA_BUFFER_TOTAL_CYCLES_sum"                             expr=sum(TA_BUFFER_TOTAL_CYCLES,16) descr="Number of buffer cycles issued to TC. Sum over TA instances."></metric>
+  <metric name="TA_BUFFER_COALESCED_READ_CYCLES_sum"                    expr=sum(TA_BUFFER_COALESCED_READ_CYCLES,16) descr="Number of buffer coalesced read cycles issued to TC. Sum over TA instances."></metric>
+  <metric name="TA_BUFFER_COALESCED_WRITE_CYCLES_sum"                   expr=sum(TA_BUFFER_COALESCED_WRITE_CYCLES,16) descr="Number of buffer coalesced write cycles issued to TC. Sum over TA instances."></metric>
+  <metric name="TD_TD_BUSY_sum"                                         expr=sum(TD_TD_BUSY,16) descr="TD is processing or waiting for data. Perf_Windowing not supported for this counter. Sum over TD instances."></metric>
+  <metric name="TD_TC_STALL_sum"                                        expr=sum(TD_TC_STALL,16) descr="TD is stalled waiting for TC data. Sum over TD instances."></metric>
+  <metric name="TD_LOAD_WAVEFRONT_sum"                                  expr=sum(TD_LOAD_WAVEFRONT,16) descr="Count the wavefronts with opcode = load, include atomics and store. Sum over TD instances."></metric>
+  <metric name="TD_ATOMIC_WAVEFRONT_sum"                                expr=sum(TD_ATOMIC_WAVEFRONT,16) descr="Count the wavefronts with opcode = atomic. Sum over TD instances."></metric>
+  <metric name="TD_STORE_WAVEFRONT_sum"                                 expr=sum(TD_STORE_WAVEFRONT,16) descr="Count the wavefronts with opcode = store. Sum over TD instances."></metric>
+  <metric name="TD_COALESCABLE_WAVEFRONT_sum"                           expr=sum(TD_COALESCABLE_WAVEFRONT,16) descr="Count wavefronts that TA finds coalescable. Sum over TD instances."></metric>
+  <metric name="TD_SPI_STALL_sum"                                       expr=sum(TD_SPI_STALL,16) descr="TD is stalled SPI vinit. Sum over TD instances."></metric>
+  <metric name="TCP_GATE_EN1_sum"                                       expr=sum(TCP_GATE_EN1,16) descr="TCP interface clocks are turned on. Not Windowed. Sum over TCP instances."></metric>
+  <metric name="TCP_GATE_EN2_sum"                                       expr=sum(TCP_GATE_EN2,16) descr="TCP core clocks are turned on. Not Windowed. Sum over TCP instances."></metric>
+  <metric name="TCP_TD_TCP_STALL_CYCLES_sum"                            expr=sum(TCP_TD_TCP_STALL_CYCLES,16) descr="TD stalls TCP. Sum over TCP instances."></metric>
+  <metric name="TCP_TCR_TCP_STALL_CYCLES_sum"                           expr=sum(TCP_TCR_TCP_STALL_CYCLES,16) descr="TCR stalls TCP_TCR_req interface. Sum over TCP instances."></metric>
+  <metric name="TCP_READ_TAGCONFLICT_STALL_CYCLES_sum"                  expr=sum(TCP_READ_TAGCONFLICT_STALL_CYCLES,16) descr="Tagram conflict stall on a read. Sum over TCP instances."></metric>
+  <metric name="TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum"                 expr=sum(TCP_WRITE_TAGCONFLICT_STALL_CYCLES,16) descr="Tagram conflict stall on a write. Sum over TCP instances."></metric>
+  <metric name="TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum"                expr=sum(TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES,16) descr="Tagram conflict stall on an atomic. Sum over TCP instances."></metric>
+  <metric name="TCP_VOLATILE_sum"                                       expr=sum(TCP_VOLATILE,16) descr="Total number of L1 volatile pixels/buffers from TA. Sum over TCP instances."></metric>
+  <metric name="TCP_TOTAL_ACCESSES_sum"                                 expr=sum(TCP_TOTAL_ACCESSES,16) descr="Total number of pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD. Sum over TCP instances."></metric>
+  <metric name="TCP_TOTAL_READ_sum"                                     expr=sum(TCP_TOTAL_READ,16) descr="Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ. Sum over TCP instances."></metric>
+  <metric name="TCP_TOTAL_WRITE_sum"                                    expr=sum(TCP_TOTAL_WRITE,16) descr="Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+ TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE. Sum over TCP instances."></metric>
+  <metric name="TCP_TOTAL_ATOMIC_WITH_RET_sum"                          expr=sum(TCP_TOTAL_ATOMIC_WITH_RET,16) descr="Total number of atomic with return pixels/buffers from TA. Sum over TCP instances."></metric>
+  <metric name="TCP_TOTAL_ATOMIC_WITHOUT_RET_sum"                       expr=sum(TCP_TOTAL_ATOMIC_WITHOUT_RET,16) descr="Total number of atomic without return pixels/buffers from TA. Sum over TCP instances."></metric>
+  <metric name="TCP_TOTAL_WRITEBACK_INVALIDATES_sum"                    expr=sum(TCP_TOTAL_WRITEBACK_INVALIDATES,16) descr="Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. Not Windowed. Sum over TCP instances."></metric>
+  <metric name="TCP_UTCL1_REQUEST_sum"                                  expr=sum(TCP_UTCL1_REQUEST,16) descr="Total CLIENT_UTCL1 NORMAL requests. Sum over TCP instances."></metric>
+  <metric name="TCP_UTCL1_TRANSLATION_MISS_sum"                         expr=sum(TCP_UTCL1_TRANSLATION_MISS,16) descr="Total utcl1 translation misses. Sum over TCP instances."></metric>
+  <metric name="TCP_UTCL1_TRANSLATION_HIT_sum"                          expr=sum(TCP_UTCL1_TRANSLATION_HIT,16) descr="Total utcl1 translation hits. Sum over TCP instances."></metric>
+  <metric name="TCP_UTCL1_PERMISSION_MISS_sum"                          expr=sum(TCP_UTCL1_PERMISSION_MISS,16) descr="Total utcl1 permission misses. Sum over TCP instances."></metric>
+  <metric name="TCP_TOTAL_CACHE_ACCESSES_sum"                           expr=sum(TCP_TOTAL_CACHE_ACCESSES,16) descr="Count of total cache line (tag) accesses (includes hits and misses). Sum over TCP instances."></metric>
+  <metric name="TCP_TCP_LATENCY_sum"                                    expr=sum(TCP_TCP_LATENCY,16) descr="Total TCP wave latency (from first clock of wave entering to first clock of wave leaving), divide by TA_TCP_STATE_READ to get the average wave latency. Sum over TCP instances."></metric>
+  <metric name="TCP_TA_TCP_STATE_READ_sum"                              expr=sum(TCP_TA_TCP_STATE_READ,16) descr="Number of state reads. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_READ_REQ_LATENCY_sum"                           expr=sum(TCP_TCC_READ_REQ_LATENCY,16) descr="Total TCP->TCC request latency for reads and atomics with return. Not Windowed. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_WRITE_REQ_LATENCY_sum"                          expr=sum(TCP_TCC_WRITE_REQ_LATENCY,16) descr="Total TCP->TCC request latency for writes and atomics without return. Not Windowed. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_READ_REQ_sum"                                   expr=sum(TCP_TCC_READ_REQ,16) descr="Total read requests from TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_WRITE_REQ_sum"                                  expr=sum(TCP_TCC_WRITE_REQ,16) descr="Total write requests from TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_ATOMIC_WITH_RET_REQ_sum"                        expr=sum(TCP_TCC_ATOMIC_WITH_RET_REQ,16) descr="Total atomic with return requests from TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum"                     expr=sum(TCP_TCC_ATOMIC_WITHOUT_RET_REQ,16) descr="Total atomic without return requests from TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_NC_READ_REQ_sum"                                expr=sum(TCP_TCC_NC_READ_REQ,16) descr="Total read requests with NC mtype from this TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_NC_WRITE_REQ_sum"                               expr=sum(TCP_TCC_NC_WRITE_REQ,16) descr="Total write requests with NC mtype from this TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_NC_ATOMIC_REQ_sum"                              expr=sum(TCP_TCC_NC_ATOMIC_REQ,16) descr="Total atomic requests with NC mtype from this TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_UC_READ_REQ_sum"                                expr=sum(TCP_TCC_UC_READ_REQ,16) descr="Total read requests with UC mtype from this TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_UC_WRITE_REQ_sum"                               expr=sum(TCP_TCC_UC_WRITE_REQ,16) descr="Total write requests with UC mtype from this TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_UC_ATOMIC_REQ_sum"                              expr=sum(TCP_TCC_UC_ATOMIC_REQ,16) descr="Total atomic requests with UC mtype from this TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_CC_READ_REQ_sum"                                expr=sum(TCP_TCC_CC_READ_REQ,16) descr="Total read requests with CC mtype from this TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_CC_WRITE_REQ_sum"                               expr=sum(TCP_TCC_CC_WRITE_REQ,16) descr="Total write requests with CC mtype from this TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_CC_ATOMIC_REQ_sum"                              expr=sum(TCP_TCC_CC_ATOMIC_REQ,16) descr="Total atomic requests with CC mtype from this TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_RW_READ_REQ_sum"                                expr=sum(TCP_TCC_RW_READ_REQ,16) descr="Total read requests with RW mtype from this TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_RW_WRITE_REQ_sum"                               expr=sum(TCP_TCC_RW_WRITE_REQ,16) descr="Total write requests with RW mtype from this TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_RW_ATOMIC_REQ_sum"                              expr=sum(TCP_TCC_RW_ATOMIC_REQ,16) descr="Total atomic requests with RW mtype from this TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_PENDING_STALL_CYCLES_sum"                           expr=sum(TCP_PENDING_STALL_CYCLES,16) descr="Stall due to data pending from L2. Sum over TCP instances."></metric>
+  <metric name="TCA_CYCLE_sum"                                          expr=sum(TCA_CYCLE,16)  descr="Number of cycles. Sum over all TCA instances."></metric>
+  <metric name="TCA_BUSY_sum"                                           expr=sum(TCA_BUSY,16)   descr="Number of cycles we have a request pending. Sum over all TCA instances."></metric>
+  <metric name="TCC_BUSY_avr"                                           expr=avr(TCC_BUSY,32) descr="TCC_BUSY avr over all memory channels."></metric>
+  <metric name="TCC_WRREQ_STALL_max"                                    expr=max(TCC_EA_WRREQ_STALL,32) descr="Number of cycles a write request was stalled. Max over TCC instances."></metric>
+  <metric name="TCC_CYCLE_sum"                                          expr=sum(TCC_CYCLE,32) descr="Number of cycles. Not windowable. Sum over TCC instances."></metric>
+  <metric name="TCC_BUSY_sum"                                           expr=sum(TCC_BUSY,32) descr="Number of cycles we have a request pending. Not windowable. Sum over TCC instances."></metric>
+  <metric name="TCC_REQ_sum"                                            expr=sum(TCC_REQ,32) descr="Number of requests of all types. This is measured at the tag block. This may be more than the number of requests arriving at the TCC, but it is a good indication of the total amount of work that needs to be performed. Sum over TCC instances."></metric>
+  <metric name="TCC_STREAMING_REQ_sum"                                  expr=sum(TCC_STREAMING_REQ,32) descr="Number of streaming requests. This is measured at the tag block. Sum over TCC instances."></metric>
+  <metric name="TCC_NC_REQ_sum"                                         expr=sum(TCC_NC_REQ,32) descr="The number of noncoherently cached requests. This is measured at the tag block. Sum over TCC instances."></metric>
+  <metric name="TCC_UC_REQ_sum"                                         expr=sum(TCC_UC_REQ,32) descr="The number of uncached requests. This is measured at the tag block. Sum over TCC instances."></metric>
+  <metric name="TCC_CC_REQ_sum"                                         expr=sum(TCC_CC_REQ,32) descr="The number of coherently cached requests. This is measured at the tag block. Sum over TCC instances."></metric>
+  <metric name="TCC_RW_REQ_sum"                                         expr=sum(TCC_RW_REQ,32) descr="The number of RW requests. This is measured at the tag block. Sum over TCC instances."></metric>
+  <metric name="TCC_PROBE_sum"                                          expr=sum(TCC_PROBE,32) descr="Number of probe requests. Not windowable. Sum over TCC instances."></metric>
+  <metric name="TCC_PROBE_ALL_sum"                                      expr=sum(TCC_PROBE_ALL,32) descr="Number of external probe requests with EA_TCC_preq_all== 1. Not windowable. Sum over TCC instances."></metric>
+  <metric name="TCC_READ_sum"                                           expr=sum(TCC_READ,32) descr="Number of read requests. Compressed reads are included in this, but metadata reads are not included. Sum over TCC instances."></metric>
+  <metric name="TCC_WRITE_sum"                                          expr=sum(TCC_WRITE,32) descr="Number of write requests. Sum over TCC instances."></metric>
+  <metric name="TCC_ATOMIC_sum"                                         expr=sum(TCC_ATOMIC,32) descr="Number of atomic requests of all types. Sum over TCC instances."></metric>
+  <metric name="TCC_HIT_sum"                                            expr=sum(TCC_HIT,32) descr="Number of cache hits. Sum over TCC instances."></metric>
+  <metric name="TCC_MISS_sum"                                           expr=sum(TCC_MISS,32) descr="Number of cache misses. UC reads count as misses. Sum over TCC instances."></metric>
+  <metric name="TCC_WRITEBACK_sum"                                      expr=sum(TCC_WRITEBACK,32) descr="Number of lines written back to main memory. This includes writebacks of dirty lines and uncached write/atomic requests. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_WRREQ_sum"                                       expr=sum(TCC_EA_WRREQ,32) descr="Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_WRREQ_64B_sum"                                   expr=sum(TCC_EA_WRREQ_64B,32) descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_WR_UNCACHED_32B_sum"                             expr=sum(TCC_EA_WR_UNCACHED_32B,32) descr="Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC mtypes can produce uncached requests, and those are included in this. A 64-byte request will be counted as 2. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_WRREQ_STALL_sum"                                 expr=sum(TCC_EA_WRREQ_STALL,32) descr="Number of cycles a write request was stalled. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_WRREQ_IO_CREDIT_STALL_sum"                       expr=sum(TCC_EA_WRREQ_IO_CREDIT_STALL,32) descr="Number of cycles a EA write request was stalled because the interface was out of IO credits. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_WRREQ_GMI_CREDIT_STALL_sum"                      expr=sum(TCC_EA_WRREQ_GMI_CREDIT_STALL,32) descr="Number of cycles a EA write request was stalled because the interface was out of GMI credits. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum"                     expr=sum(TCC_EA_WRREQ_DRAM_CREDIT_STALL,32) descr="Number of cycles a EA write request was stalled because the interface was out of DRAM credits. Sum over TCC instances."></metric>
+  <metric name="TCC_TOO_MANY_EA_WRREQS_STALL_sum"                       expr=sum(TCC_TOO_MANY_EA_WRREQS_STALL,32) descr="Number of cycles the TCC could not send a EA write request because it already reached its maximum number of pending EA write requests. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_WRREQ_LEVEL_sum"                                 expr=sum(TCC_EA_WRREQ_LEVEL,32) descr="The sum of the number of EA write requests in flight. This is primarily meant for measuring average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_RDREQ_LEVEL_sum"                                 expr=sum(TCC_EA_RDREQ_LEVEL,32) descr="The sum of the number of TCC/EA read requests in flight. This is primarily meant for measuring average EA read latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_ATOMIC_sum"                                      expr=sum(TCC_EA_ATOMIC,32) descr="Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_ATOMIC_LEVEL_sum"                                expr=sum(TCC_EA_ATOMIC_LEVEL,32) descr="The sum of the number of EA atomics in flight. This is primarily meant for measuring average EA atomic latency. Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_RDREQ_sum"                                       expr=sum(TCC_EA_RDREQ,32) descr="Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances."></metric>
+  <metric name="TCC_EA_RDREQ_32B_sum"                                   expr=sum(TCC_EA_RDREQ_32B,32) descr="Number of 32-byte TCC/EA read requests. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_RD_UNCACHED_32B_sum"                             expr=sum(TCC_EA_RD_UNCACHED_32B,32) descr="Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted as 2. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_RDREQ_IO_CREDIT_STALL_sum"                       expr=sum(TCC_EA_RDREQ_IO_CREDIT_STALL,32) descr="Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_RDREQ_GMI_CREDIT_STALL_sum"                      expr=sum(TCC_EA_RDREQ_GMI_CREDIT_STALL,32) descr="Number of cycles there was a stall because the read request interface was out of GMI credits. Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum"                     expr=sum(TCC_EA_RDREQ_DRAM_CREDIT_STALL,32) descr="Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances."></metric>
+  <metric name="TCC_TAG_STALL_sum"                                      expr=sum(TCC_TAG_STALL,32) descr="TCC_TAG_STALL summed over TCC instances."></metric>
+  <metric name="TCC_NORMAL_WRITEBACK_sum"                               expr=sum(TCC_NORMAL_WRITEBACK,32) descr="Number of writebacks due to requests that are not writeback requests. Sum over TCC instances."></metric>
+  <metric name="TCC_ALL_TC_OP_WB_WRITEBACK_sum"                         expr=sum(TCC_ALL_TC_OP_WB_WRITEBACK,32) descr="Number of writebacks due to all TC_OP writeback requests. Sum over TCC instances."></metric>
+  <metric name="TCC_NORMAL_EVICT_sum"                                   expr=sum(TCC_NORMAL_EVICT,32) descr="Number of evictions due to requests that are not invalidate or probe requests. Sum over TCC instances."></metric>
+  <metric name="TCC_ALL_TC_OP_INV_EVICT_sum"                            expr=sum(TCC_ALL_TC_OP_INV_EVICT,32) descr="Number of evictions due to all TC_OP invalidate requests. Sum over TCC instances."></metric>
+  <metric name="TCC_EA_RDREQ_DRAM_sum"                                  expr=sum(TCC_EA_RDREQ_DRAM,32) descr="Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum over TCC instances."></metric>
+  <metric name="TCC_EA_WRREQ_DRAM_sum"                                  expr=sum(TCC_EA_WRREQ_DRAM,32) descr="Number of TCC/EA write requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum over TCC instances."></metric>
+
+  <metric name="FETCH_SIZE"                                             expr=(TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64)/1024 descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="WRITE_SIZE"                                             expr=((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024 descr="The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="WRITE_REQ_32B"                                          expr=TCC_EA_WRREQ_64B_sum*2+(TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum) descr="The total number of 32-byte effective memory writes."></metric>
+  <metric name="CU_OCCUPANCY"                                           expr=(SQ_CYCLES/(SQ_WAVE_CYCLES*4))/MAX_WAVE_SIZE descr="The ratio of active waves on a CU to the maximum number of active waves supported by the CU"></metric>
+  <metric name="CU_UTILIZATION"                                         expr=GRBM_GUI_ACTIVE/GRBM_COUNT descr="The total number of active cycles divided by total number of elapsed cycles"></metric>
+  <metric name="TOTAL_16_OPS"                                           expr=(SQ_INSTS_VALU_FMA_F16*2+SQ_INSTS_VALU_ADD_F16+SQ_INSTS_VALU_MUL_F16+SQ_INSTS_VALU_TRANS_F16)*64+((SQ_INSTS_VALU_MFMA_MOPS_F16+SQ_INSTS_VALU_MFMA_MOPS_BF16)*512) descr="The number of 16 bits OPS executed"></metric>
+  <metric name="TOTAL_32_OPS"                                           expr=(SQ_INSTS_VALU_FMA_F32*2+SQ_INSTS_VALU_INT32+SQ_INSTS_VALU_ADD_F32+SQ_INSTS_VALU_MUL_F32+SQ_INSTS_VALU_TRANS_F32)*64+(SQ_INSTS_VALU_MFMA_MOPS_F32*512) descr="The number of 32 bits OPS executed"></metric>
+  <metric name="TOTAL_64_OPS"                                           expr=(SQ_INSTS_VALU_FMA_F64*2+SQ_INSTS_VALU_INT64+SQ_INSTS_VALU_ADD_F64+SQ_INSTS_VALU_MUL_F64)*64+(SQ_INSTS_VALU_MFMA_MOPS_F64*512) descr="The number of 64 bits OPS executed"></metric>
+
+  <metric name="AggSysCycles"                                           expr=GRBM_GUI_ACTIVE*CU_NUM                                   descr="Unit: cycles"></metric>
+  ## IP Block Utilization Metrics
+  <metric name="GpuUtil"                                                expr=100*GRBM_GUI_ACTIVE/GRBM_COUNT                           descr="Unit: percent"></metric>
+  <metric name="CpUtil"                                                 expr=100*GRBM_CP_BUSY/GRBM_GUI_ACTIVE                         descr="Unit: percent"></metric>
+  <metric name="SpiUtil"                                                expr=100*GRBM_SPI_BUSY/GRBM_GUI_ACTIVE                        descr="Unit: percent"></metric>
+  <metric name="TaUtil"                                                 expr=100*GRBM_TA_BUSY/GRBM_GUI_ACTIVE                         descr="Unit: percent"></metric>
+  <metric name="TcUtil"                                                 expr=100*GRBM_TC_BUSY/GRBM_GUI_ACTIVE                         descr="Unit: percent"></metric>
+  <metric name="EaUtil"                                                 expr=100*GRBM_EA_BUSY/GRBM_GUI_ACTIVE                         descr="Unit: percent"></metric>
+  ## Instruction Fetch Metrics
+  <metric name="InstrFetchLatency"                                      expr=SQ_ACCUM_PREV_HIRES/SQ_IFETCH                            descr="Unit: cycles"></metric>
+  ## Wavefront Metrics
+  <metric name="WaveOccupancy"                                          expr=SQ_ACCUM_PREV_HIRES/GRBM_GUI_ACTIVE                      descr="Unit: wavefronts"></metric>
+  <metric name="WaveDuration"                                           expr=4*SQ_WAVE_CYCLES/SQ_WAVES                                descr="Unit: cycles"></metric>
+  <metric name="WaveDepWait"                                            expr=100*SQ_WAIT_ANY/SQ_WAVE_CYCLES                           descr="Unit: percent"></metric>
+  <metric name="WaveIssueWait"                                          expr=100*SQ_WAIT_INST_ANY/SQ_WAVE_CYCLES                      descr="Unit: percent"></metric>
+  <metric name="WaveExec"                                               expr=100*SQ_ACTIVE_INST_ANY/SQ_WAVE_CYCLES                    descr="Unit: percent"></metric>
+  ## Compute Unit Metrics
+  <metric name="ValuIops"                                               expr=(SQ_INSTS_VALU_INT32+SQ_INSTS_VALU_INT64)*64             descr="Unit: IOP"></metric>
+  <metric name="MfmaFlops"                                              expr=(SQ_INSTS_VALU_MFMA_MOPS_F16+SQ_INSTS_VALU_MFMA_MOPS_BF16+SQ_INSTS_VALU_MFMA_MOPS_F32+SQ_INSTS_VALU_MFMA_MOPS_F64)*512      descr="Unit: FLOP"></metric>
+  <metric name="MfmaFlopsF16"                                           expr=SQ_INSTS_VALU_MFMA_MOPS_F16*512                          descr="Unit: FLOP"></metric>
+  <metric name="MfmaFlopsBF16"                                          expr=SQ_INSTS_VALU_MFMA_MOPS_BF16*512                         descr="Unit: FLOP"></metric>
+  <metric name="MfmaFlopsF32"                                           expr=SQ_INSTS_VALU_MFMA_MOPS_F32*512                          descr="Unit: FLOP"></metric>
+  <metric name="MfmaFlopsF64"                                           expr=SQ_INSTS_VALU_MFMA_MOPS_F64*512                          descr="Unit: FLOP"></metric>
+  <metric name="ScaPipeIssueUtil"                                       expr=100*SQ_ACTIVE_INST_SCA/(GRBM_GUI_ACTIVE*CU_NUM)          descr="Unit: percent"></metric>
+  <metric name="ValuPipeIssueUtil"                                      expr=100*SQ_ACTIVE_INST_VALU/(GRBM_GUI_ACTIVE*CU_NUM)         descr="Unit: percent"></metric>
+  <metric name="VmemPipeIssueUtil"                                      expr=100*4*(SQ_ACTIVE_INST_VMEM+SQ_ACTIVE_INST_FLAT)/(GRBM_GUI_ACTIVE*CU_NUM)      descr="Unit: percent"></metric>
+  <metric name="MfmaUtil"                                               expr=100*SQ_VALU_MFMA_BUSY_CYCLES/(GRBM_GUI_ACTIVE*CU_NUM*4)                       descr="Unit: percent"></metric>
+  <metric name="AvgNumActiveThreads"                                    expr=SQ_THREAD_CYCLES_VALU/SQ_ACTIVE_INST_VALU                descr="Unit: threads"></metric>
+  <metric name="VmemLatency"                                            expr=SQ_ACCUM_PREV_HIRES/SQ_INSTS_VMEM                        descr="Unit: cycles"></metric>
+  <metric name="SmemLatency"                                            expr=SQ_ACCUM_PREV_HIRES/SQ_INSTS_SMEM_NORM                   descr="Unit: cycles"></metric>
+  ## Local Data Share (LDS) Metrics
+  <metric name="LdsUtil"                                                expr=100*SQ_LDS_IDX_ACTIVE/(GRBM_GUI_ACTIVE*CU_NUM)                       descr="Unit: percent"></metric>
+  <metric name="LdsPipeIssueUtil"                                       expr=100*4*SQ_ACTIVE_INST_LDS/(GRBM_GUI_ACTIVE*CU_NUM*2)                  descr="Unit: percent"></metric>
+  <metric name="LdsLatency"                                             expr=SQ_ACCUM_PREV_HIRES/SQ_INSTS_LDS                                     descr="Unit: cycles"></metric>
+  <metric name="LdsBankConflict"                                        expr=SQ_LDS_BANK_CONFLICT/(SQ_LDS_IDX_ACTIVE-SQ_LDS_BANK_CONFLICT)        descr="Unit: conflicts/access"></metric>
+  ## L1I and sL1D Cache Metrics
+  <metric name="L1iCacheHitRate"                                        expr=100*SQC_ICACHE_HITS/SQC_ICACHE_REQ                               descr="Unit: percent"></metric>
+  <metric name="sL1dCacheHitRate"                                       expr=100*SQC_DCACHE_HITS/SQC_DCACHE_REQ                               descr="Unit: percent"></metric>
+  ## vL1D Cache Metrics
+  <metric name="vL1dBufCoalesceRate"                                    expr=100*64*TA_TOTAL_WAVEFRONTS_sum/(TCP_TOTAL_ACCESSES_sum*4)        descr="Unit: percent"></metric>
+  <metric name="vL1dCacheUtil"                                          expr=100*TCP_GATE_EN2_sum/TCP_GATE_EN1_sum                            descr="Unit: percent"></metric>
+  <metric name="vL1dCacheTcbHitRate"                                    expr=100*TCP_UTCL1_TRANSLATION_HIT_sum/TCP_UTCL1_REQUEST_sum          descr="Unit: percent"></metric>
+  <metric name="vL1dCacheWaveLatency"                                   expr=TCP_TCP_LATENCY_sum/TCP_TA_TCP_STATE_READ_sum                    descr="Unit: cycles"></metric>
+  <metric name="vL1dReadFromL2Latency"                                  expr=TCP_TCC_READ_REQ_LATENCY_sum/(TCP_TCC_READ_REQ_sum+TCP_TCC_ATOMIC_WITH_RET_REQ_sum)            descr="Unit: cycles"></metric>
+  <metric name="vL1dWriteToL2Latency"                                   expr=TCP_TCC_WRITE_REQ_LATENCY_sum/(TCP_TCC_WRITE_REQ_sum+TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)       descr="Unit: cycles"></metric>
+  <metric name="vL1dRdTagConfStallRate"                                 expr=100*TCP_READ_TAGCONFLICT_STALL_CYCLES_sum/TCP_GATE_EN2_sum       descr="Unit: percent"></metric>
+  <metric name="vL1dWrTagConfStallRate"                                 expr=100*TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum/TCP_GATE_EN2_sum      descr="Unit: percent"></metric>
+  <metric name="vL1dAtomicTagConfStallRate"                             expr=100*TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum/TCP_GATE_EN2_sum     descr="Unit: percent"></metric>
+  <metric name="vL1dMissReqStallRate"                                   expr=100*TCP_TCR_TCP_STALL_CYCLES_sum/TCP_GATE_EN2_sum                descr="Unit: percent"></metric>
+  <metric name="vL1dDataPendRate"                                       expr=100*TCP_PENDING_STALL_CYCLES_sum/TCP_GATE_EN2_sum                descr="Unit: percent"></metric>
+  <metric name="vL1dDataRetStallRate"                                   expr=100*TD_TC_STALL_sum/TD_TD_BUSY_sum                               descr="Unit: percent"></metric>
+  ## L2 Cache Metrics
+  <metric name="L2CacheHitRate"                                         expr=100*TCC_HIT_sum/(TCC_HIT_sum+TCC_MISS_sum)                       descr="Unit: percent"></metric>
+  <metric name="L2CacheTagRamStallRate"                                 expr=100*TCC_TAG_STALL_sum/TCC_BUSY_sum                               descr="Unit: percent"></metric>
+  <metric name="EaRdLatency"                                            expr=TCC_EA_RDREQ_LEVEL_sum/TCC_EA_RDREQ_sum                          descr="Unit: cycles"></metric>
+  <metric name="EaRdIoStallRate"                                        expr=100*TCC_EA_RDREQ_IO_CREDIT_STALL_sum/TCC_BUSY_sum                descr="Unit: percent"></metric>
+  <metric name="EaRdGmiStallRate"                                       expr=100*TCC_EA_RDREQ_GMI_CREDIT_STALL_sum/TCC_BUSY_sum               descr="Unit: percent"></metric>
+  <metric name="EaRdDramStallRate"                                      expr=100*TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum/TCC_BUSY_sum              descr="Unit: percent"></metric>
+  <metric name="EaWrLatency"                                            expr=TCC_EA_WRREQ_LEVEL_sum/TCC_EA_WRREQ_sum                          descr="Unit: cycles"></metric>
+  <metric name="EaWrIoStallRate"                                        expr=100*TCC_EA_WRREQ_IO_CREDIT_STALL_sum/TCC_BUSY_sum                descr="Unit: percent"></metric>
+  <metric name="EaWrGmiStallRate"                                       expr=100*TCC_EA_WRREQ_GMI_CREDIT_STALL_sum/TCC_BUSY_sum               descr="Unit: percent"></metric>
+  <metric name="EaWrDramStallRate"                                      expr=100*TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum/TCC_BUSY_sum              descr="Unit: percent"></metric>
+  <metric name="EaWrStarveRate"                                         expr=100*TCC_TOO_MANY_EA_WRREQS_STALL_sum/TCC_BUSY_sum                descr="Unit: percent"></metric>
+  <metric name="EaAtomicLatency"                                        expr=TCC_EA_ATOMIC_LEVEL_sum/TCC_EA_ATOMIC_sum                        descr="Unit: cycles"></metric>
+</gfx90a_expr>
+
+<gfx940_expr>
+  <metric name="TCP_TCP_TA_DATA_STALL_CYCLES_sum" expr=sum(TCP_TCP_TA_DATA_STALL_CYCLES,16) descr="Total number of TCP stalls TA data interface."></metric>
+  <metric name="TCP_TCP_TA_DATA_STALL_CYCLES_max" expr=max(TCP_TCP_TA_DATA_STALL_CYCLES,16) descr="Maximum number of TCP stalls TA data interface."></metric>
+
+  <metric name="MeanOccupancyPerCU" expr=SQ_LEVEL_WAVES*0+SQ_ACCUM_PREV_HIRES/GRBM_GUI_ACTIVE/CU_NUM descr="Mean occupancy per compute unit."></metric>
+  <metric name="MeanOccupancyPerActiveCU" expr=SQ_LEVEL_WAVES*0+SQ_ACCUM_PREV_HIRES*4/SQ_BUSY_CYCLES/CU_NUM descr="Mean occupancy per active compute unit."></metric>
+  <metric name="VFetchInsts" expr=(SQ_INSTS_VMEM_RD-TA_FLAT_READ_WAVEFRONTS_sum)/SQ_WAVES descr="The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory."></metric>
+  <metric name="VWriteInsts" expr=(SQ_INSTS_VMEM_WR-TA_FLAT_WRITE_WAVEFRONTS_sum)/SQ_WAVES descr="The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory."></metric>
+  <metric name="VALUUtilization" expr=100*SQ_THREAD_CYCLES_VALU/(SQ_ACTIVE_INST_VALU*MAX_WAVE_SIZE) descr="The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence)."></metric>
+  <metric name="VALUBusy" expr=100*SQ_ACTIVE_INST_VALU*4/SIMD_NUM/GRBM_GUI_ACTIVE descr="The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."></metric>
+  <metric name="SALUBusy" expr=100*SQ_INST_CYCLES_SALU*4/SIMD_NUM/GRBM_GUI_ACTIVE descr="The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."></metric>
+  <metric name="FetchSize" expr=FETCH_SIZE descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="WriteSize" expr=WRITE_SIZE descr="The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="MemWrites32B" expr=WRITE_REQ_32B descr="The total number of effective 32B write transactions to the memory"></metric>
+  <metric name="MemUnitStalled" expr=100*TCP_TCP_TA_DATA_STALL_CYCLES_max/GRBM_GUI_ACTIVE/SE_NUM descr="The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad)."></metric>
+  <metric name="TA_BUSY_avr"                                            expr=avr(TA_TA_BUSY,16) descr="TA block is busy. Average over TA instances."></metric>
+  <metric name="TA_BUSY_max"                                            expr=max(TA_TA_BUSY,16) descr="TA block is busy. Max over TA instances."></metric>
+  <metric name="TA_BUSY_min"                                            expr=min(TA_TA_BUSY,16) descr="TA block is busy. Min over TA instances."></metric>
+  <metric name="TA_TA_BUSY_sum"                                         expr=sum(TA_TA_BUSY,16) descr="TA block is busy. Perf_Windowing not supported for this counter. Sum over TA instances."></metric>
+  <metric name="TA_TOTAL_WAVEFRONTS_sum"                                expr=sum(TA_TOTAL_WAVEFRONTS,16) descr="Total number of wavefronts processed by TA. Sum over TA instances."></metric>
+  <metric name="TA_ADDR_STALLED_BY_TC_CYCLES_sum"                       expr=sum(TA_ADDR_STALLED_BY_TC_CYCLES,16) descr="Number of cycles addr path stalled by TC. Perf_Windowing not supported for this counter. Sum over TA instances."></metric>
+  <metric name="TA_ADDR_STALLED_BY_TD_CYCLES_sum"                       expr=sum(TA_ADDR_STALLED_BY_TD_CYCLES,16) descr="Number of cycles addr path stalled by TD. Perf_Windowing not supported for this counter. Sum over TA instances."></metric>
+  <metric name="TA_DATA_STALLED_BY_TC_CYCLES_sum"                       expr=sum(TA_DATA_STALLED_BY_TC_CYCLES,16) descr="Number of cycles data path stalled by TC. Perf_Windowing not supported for this counter. Sum over TA instances."></metric>
+  <metric name="TA_FLAT_WAVEFRONTS_sum"                                 expr=sum(TA_FLAT_WAVEFRONTS,16) descr="Number of flat opcode wavefronts processed by the TA. Sum over TA instances."></metric>
+  <metric name="TA_FLAT_READ_WAVEFRONTS_sum"                            expr=sum(TA_FLAT_READ_WAVEFRONTS,16) descr="Number of flat opcode reads processed by the TA. Sum over TA instances."></metric>
+  <metric name="TA_FLAT_WRITE_WAVEFRONTS_sum"                           expr=sum(TA_FLAT_WRITE_WAVEFRONTS,16) descr="Number of flat opcode writes processed by the TA. Sum over TA instances."></metric>
+  <metric name="TA_FLAT_ATOMIC_WAVEFRONTS_sum"                          expr=sum(TA_FLAT_ATOMIC_WAVEFRONTS,16) descr="Number of flat opcode atomics processed by the TA. Sum over TA instances."></metric>
+  <metric name="TA_BUFFER_WAVEFRONTS_sum"                               expr=sum(TA_BUFFER_WAVEFRONTS,16) descr="Number of buffer wavefronts processed by TA. Sum over TA instances."></metric>
+  <metric name="TA_BUFFER_READ_WAVEFRONTS_sum"                          expr=sum(TA_BUFFER_READ_WAVEFRONTS,16) descr="Number of buffer read wavefronts processed by TA. Sum over TA instances."></metric>
+  <metric name="TA_BUFFER_WRITE_WAVEFRONTS_sum"                         expr=sum(TA_BUFFER_WRITE_WAVEFRONTS,16) descr="Number of buffer write wavefronts processed by TA. Sum over TA instances."></metric>
+  <metric name="TA_BUFFER_ATOMIC_WAVEFRONTS_sum"                        expr=sum(TA_BUFFER_ATOMIC_WAVEFRONTS,16) descr="Number of buffer atomic wavefronts processed by TA. Sum over TA instances."></metric>
+  <metric name="TA_BUFFER_TOTAL_CYCLES_sum"                             expr=sum(TA_BUFFER_TOTAL_CYCLES,16) descr="Number of buffer cycles issued to TC. Sum over TA instances."></metric>
+  <metric name="TA_BUFFER_COALESCED_READ_CYCLES_sum"                    expr=sum(TA_BUFFER_COALESCED_READ_CYCLES,16) descr="Number of buffer coalesced read cycles issued to TC. Sum over TA instances."></metric>
+  <metric name="TA_BUFFER_COALESCED_WRITE_CYCLES_sum"                   expr=sum(TA_BUFFER_COALESCED_WRITE_CYCLES,16) descr="Number of buffer coalesced write cycles issued to TC. Sum over TA instances."></metric>
+  <metric name="TD_TD_BUSY_sum"                                         expr=sum(TD_TD_BUSY,16) descr="TD is processing or waiting for data. Perf_Windowing not supported for this counter. Sum over TD instances."></metric>
+  <metric name="TD_TC_STALL_sum"                                        expr=sum(TD_TC_STALL,16) descr="TD is stalled waiting for TC data. Sum over TD instances."></metric>
+  <metric name="TD_LOAD_WAVEFRONT_sum"                                  expr=sum(TD_LOAD_WAVEFRONT,16) descr="Count the wavefronts with opcode = load, include atomics and store. Sum over TD instances."></metric>
+  <metric name="TD_ATOMIC_WAVEFRONT_sum"                                expr=sum(TD_ATOMIC_WAVEFRONT,16) descr="Count the wavefronts with opcode = atomic. Sum over TD instances."></metric>
+  <metric name="TD_STORE_WAVEFRONT_sum"                                 expr=sum(TD_STORE_WAVEFRONT,16) descr="Count the wavefronts with opcode = store. Sum over TD instances."></metric>
+  <metric name="TD_COALESCABLE_WAVEFRONT_sum"                           expr=sum(TD_COALESCABLE_WAVEFRONT,16) descr="Count wavefronts that TA finds coalescable. Sum over TD instances."></metric>
+  <metric name="TD_SPI_STALL_sum"                                       expr=sum(TD_SPI_STALL,16) descr="TD is stalled SPI vinit. Sum over TD instances."></metric>
+  <metric name="TCP_GATE_EN1_sum"                                       expr=sum(TCP_GATE_EN1,16) descr="TCP interface clocks are turned on. Not Windowed. Sum over TCP instances."></metric>
+  <metric name="TCP_GATE_EN2_sum"                                       expr=sum(TCP_GATE_EN2,16) descr="TCP core clocks are turned on. Not Windowed. Sum over TCP instances."></metric>
+  <metric name="TCP_TD_TCP_STALL_CYCLES_sum"                            expr=sum(TCP_TD_TCP_STALL_CYCLES,16) descr="TD stalls TCP. Sum over TCP instances."></metric>
+  <metric name="TCP_TCR_TCP_STALL_CYCLES_sum"                           expr=sum(TCP_TCR_TCP_STALL_CYCLES,16) descr="TCR stalls TCP_TCR_req interface. Sum over TCP instances."></metric>
+  <metric name="TCP_READ_TAGCONFLICT_STALL_CYCLES_sum"                  expr=sum(TCP_READ_TAGCONFLICT_STALL_CYCLES,16) descr="Tagram conflict stall on a read. Sum over TCP instances."></metric>
+  <metric name="TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum"                 expr=sum(TCP_WRITE_TAGCONFLICT_STALL_CYCLES,16) descr="Tagram conflict stall on a write. Sum over TCP instances."></metric>
+  <metric name="TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum"                expr=sum(TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES,16) descr="Tagram conflict stall on an atomic. Sum over TCP instances."></metric>
+  <metric name="TCP_VOLATILE_sum"                                       expr=sum(TCP_VOLATILE,16) descr="Total number of L1 volatile pixels/buffers from TA. Sum over TCP instances."></metric>
+  <metric name="TCP_TOTAL_ACCESSES_sum"                                 expr=sum(TCP_TOTAL_ACCESSES,16) descr="Total number of pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD. Sum over TCP instances."></metric>
+  <metric name="TCP_TOTAL_READ_sum"                                     expr=sum(TCP_TOTAL_READ,16) descr="Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ. Sum over TCP instances."></metric>
+  <metric name="TCP_TOTAL_WRITE_sum"                                    expr=sum(TCP_TOTAL_WRITE,16) descr="Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+ TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE. Sum over TCP instances."></metric>
+  <metric name="TCP_TOTAL_ATOMIC_WITH_RET_sum"                          expr=sum(TCP_TOTAL_ATOMIC_WITH_RET,16) descr="Total number of atomic with return pixels/buffers from TA. Sum over TCP instances."></metric>
+  <metric name="TCP_TOTAL_ATOMIC_WITHOUT_RET_sum"                       expr=sum(TCP_TOTAL_ATOMIC_WITHOUT_RET,16) descr="Total number of atomic without return pixels/buffers from TA. Sum over TCP instances."></metric>
+  <metric name="TCP_TOTAL_WRITEBACK_INVALIDATES_sum"                    expr=sum(TCP_TOTAL_WRITEBACK_INVALIDATES,16) descr="Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. Not Windowed. Sum over TCP instances."></metric>
+  <metric name="TCP_UTCL1_REQUEST_sum"                                  expr=sum(TCP_UTCL1_REQUEST,16) descr="Total CLIENT_UTCL1 NORMAL requests. Sum over TCP instances."></metric>
+  <metric name="TCP_UTCL1_TRANSLATION_MISS_sum"                         expr=sum(TCP_UTCL1_TRANSLATION_MISS,16) descr="Total utcl1 translation misses. Sum over TCP instances."></metric>
+  <metric name="TCP_UTCL1_TRANSLATION_HIT_sum"                          expr=sum(TCP_UTCL1_TRANSLATION_HIT,16) descr="Total utcl1 translation hits. Sum over TCP instances."></metric>
+  <metric name="TCP_UTCL1_PERMISSION_MISS_sum"                          expr=sum(TCP_UTCL1_PERMISSION_MISS,16) descr="Total utcl1 permission misses. Sum over TCP instances."></metric>
+  <metric name="TCP_TOTAL_CACHE_ACCESSES_sum"                           expr=sum(TCP_TOTAL_CACHE_ACCESSES,16) descr="Count of total cache line (tag) accesses (includes hits and misses). Sum over TCP instances."></metric>
+  <metric name="TCP_TA_TCP_STATE_READ_sum"                              expr=sum(TCP_TA_TCP_STATE_READ,16) descr="Number of state reads Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_READ_REQ_sum"                                   expr=sum(TCP_TCC_READ_REQ,16) descr="Total read requests from TCP to all TCCs Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_WRITE_REQ_sum"                                  expr=sum(TCP_TCC_WRITE_REQ,16) descr="Total write requests from TCP to all TCCs Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_ATOMIC_WITH_RET_REQ_sum"                        expr=sum(TCP_TCC_ATOMIC_WITH_RET_REQ,16) descr="Total atomic with return requests from TCP to all TCCs Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum"                     expr=sum(TCP_TCC_ATOMIC_WITHOUT_RET_REQ,16) descr="Total atomic without return requests from TCP to all TCCs Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_NC_READ_REQ_sum"                                expr=sum(TCP_TCC_NC_READ_REQ,16) descr="Total read requests with NC mtype from this TCP to all TCCs Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_NC_WRITE_REQ_sum"                               expr=sum(TCP_TCC_NC_WRITE_REQ,16) descr="Total write requests with NC mtype from this TCP to all TCCs Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_NC_ATOMIC_REQ_sum"                              expr=sum(TCP_TCC_NC_ATOMIC_REQ,16) descr="Total atomic requests with NC mtype from this TCP to all TCCs Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_UC_READ_REQ_sum"                                expr=sum(TCP_TCC_UC_READ_REQ,16) descr="Total read requests with UC mtype from this TCP to all TCCs Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_UC_WRITE_REQ_sum"                               expr=sum(TCP_TCC_UC_WRITE_REQ,16) descr="Total write requests with UC mtype from this TCP to all TCCs Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_UC_ATOMIC_REQ_sum"                              expr=sum(TCP_TCC_UC_ATOMIC_REQ,16) descr="Total atomic requests with UC mtype from this TCP to all TCCs Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_CC_READ_REQ_sum"                                expr=sum(TCP_TCC_CC_READ_REQ,16) descr="Total read requests with CC mtype from this TCP to all TCCs Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_CC_WRITE_REQ_sum"                               expr=sum(TCP_TCC_CC_WRITE_REQ,16) descr="Total write requests with CC mtype from this TCP to all TCCs Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_CC_ATOMIC_REQ_sum"                              expr=sum(TCP_TCC_CC_ATOMIC_REQ,16) descr="Total atomic requests with CC mtype from this TCP to all TCCs Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_RW_READ_REQ_sum"                                expr=sum(TCP_TCC_RW_READ_REQ,16) descr="Total read requests with RW mtype from this TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_RW_WRITE_REQ_sum"                               expr=sum(TCP_TCC_RW_WRITE_REQ,16) descr="Total write requests with RW mtype from this TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_TCC_RW_ATOMIC_REQ_sum"                              expr=sum(TCP_TCC_RW_ATOMIC_REQ,16) descr="Total atomic requests with RW mtype from this TCP to all TCCs. Sum over TCP instances."></metric>
+  <metric name="TCP_PENDING_STALL_CYCLES_sum"                           expr=sum(TCP_PENDING_STALL_CYCLES,16) descr="Stall due to data pending from L2. Sum over TCP instances."></metric>
+  <metric name="TCA_CYCLE_sum"                                          expr=sum(TCA_CYCLE,16)  descr="Number of cycles. Sum over all TCA instances."></metric>
+  <metric name="TCA_BUSY_sum"                                           expr=sum(TCA_BUSY,16)   descr="Number of cycles we have a request pending. Sum over all TCA instances."></metric>
+  <metric name="TCC_BUSY_avr"                                           expr=avr(TCC_BUSY,16) descr="TCC_BUSY avr over all memory channels."></metric>
+  <metric name="TCC_WRREQ_STALL_max"                                    expr=max(TCC_EA0_WRREQ_STALL,16) descr="Number of cycles a write request was stalled. Max over TCC instances."></metric>
+  <metric name="TCC_CYCLE_sum"                                          expr=sum(TCC_CYCLE,16) descr="Number of cycles. Not windowable. Sum over TCC instances."></metric>
+  <metric name="TCC_BUSY_sum"                                           expr=sum(TCC_BUSY,16) descr="Number of cycles we have a request pending. Not windowable. Sum over TCC instances."></metric>
+  <metric name="TCC_REQ_sum"                                            expr=sum(TCC_REQ,16) descr="Number of requests of all types. This is measured at the tag block. This may be more than the number of requests arriving at the TCC, but it is a good indication of the total amount of work that needs to be performed. Sum over TCC instances."></metric>
+  <metric name="TCC_STREAMING_REQ_sum"                                  expr=sum(TCC_STREAMING_REQ,16) descr="Number of streaming requests. This is measured at the tag block. Sum over TCC instances."></metric>
+  <metric name="TCC_NC_REQ_sum"                                         expr=sum(TCC_NC_REQ,16) descr="The number of noncoherently cached requests. This is measured at the tag block. Sum over TCC instances."></metric>
+  <metric name="TCC_UC_REQ_sum"                                         expr=sum(TCC_UC_REQ,16) descr="The number of uncached requests. This is measured at the tag block. Sum over TCC instances."></metric>
+  <metric name="TCC_CC_REQ_sum"                                         expr=sum(TCC_CC_REQ,16) descr="The number of coherently cached requests. This is measured at the tag block. Sum over TCC instances."></metric>
+  <metric name="TCC_RW_REQ_sum"                                         expr=sum(TCC_RW_REQ,16) descr="The number of RW requests. This is measured at the tag block. Sum over TCC instances."></metric>
+  <metric name="TCC_PROBE_sum"                                          expr=sum(TCC_PROBE,16) descr="Number of probe requests. Not windowable. Sum over TCC instances."></metric>
+  <metric name="TCC_PROBE_ALL_sum"                                      expr=sum(TCC_PROBE_ALL,16) descr="Number of external probe requests with EA_TCC_preq_all== 1. Not windowable. Sum over TCC instances."></metric>
+  <metric name="TCC_READ_sum"                                           expr=sum(TCC_READ,16) descr="Number of read requests. Compressed reads are included in this, but metadata reads are not included. Sum over TCC instances."></metric>
+  <metric name="TCC_WRITE_sum"                                          expr=sum(TCC_WRITE,16) descr="Number of write requests. Sum over TCC instances."></metric>
+  <metric name="TCC_ATOMIC_sum"                                         expr=sum(TCC_ATOMIC,16) descr="Number of atomic requests of all types. Sum over TCC instances."></metric>
+  <metric name="TCC_HIT_sum"                                            expr=sum(TCC_HIT,16) descr="Number of cache hits. Sum over TCC instances."></metric>
+  <metric name="TCC_MISS_sum"                                           expr=sum(TCC_MISS,16) descr="Number of cache misses. UC reads count as misses. Sum over TCC instances."></metric>
+  <metric name="TCC_WRITEBACK_sum"                                      expr=sum(TCC_WRITEBACK,16) descr="Number of lines written back to main memory. This includes writebacks of dirty lines and uncached write/atomic requests. Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_WRREQ_sum"                                       expr=sum(TCC_EA0_WRREQ,16) descr="Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands. Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_WRREQ_64B_sum"                                   expr=sum(TCC_EA0_WRREQ_64B,16) descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_WR_UNCACHED_32B_sum"                             expr=sum(TCC_EA0_WR_UNCACHED_32B,16) descr="Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC mtypes can produce uncached requests, and those are included in this. A 64-byte request will be counted as 2. Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_WRREQ_STALL_sum"                                 expr=sum(TCC_EA0_WRREQ_STALL,16) descr="Number of cycles a write request was stalled. Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_WRREQ_IO_CREDIT_STALL_sum"                       expr=sum(TCC_EA0_WRREQ_IO_CREDIT_STALL,16) descr="Number of cycles a EA write request was stalled because the interface was out of IO credits. Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum"                      expr=sum(TCC_EA0_WRREQ_GMI_CREDIT_STALL,16) descr="Number of cycles a EA write request was stalled because the interface was out of GMI credits. Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum"                     expr=sum(TCC_EA0_WRREQ_DRAM_CREDIT_STALL,16) descr="Number of cycles a EA write request was stalled because the interface was out of DRAM credits. Sum over TCC instances."></metric>
+  <metric name="TCC_TOO_MANY_EA_WRREQS_STALL_sum"                       expr=sum(TCC_TOO_MANY_EA_WRREQS_STALL,16) descr="Number of cycles the TCC could not send a EA write request because it already reached its maximum number of pending EA write requests. Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_WRREQ_LEVEL_sum"                                 expr=sum(TCC_EA0_WRREQ_LEVEL,16) descr="The sum of the number of EA write requests in flight. This is primarily meant for measuring average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_RDREQ_LEVEL_sum"                                 expr=sum(TCC_EA0_RDREQ_LEVEL,16) descr="The sum of the number of TCC/EA read requests in flight. This is primarily meant for measuring average EA read latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_ATOMIC_sum"                                      expr=sum(TCC_EA0_ATOMIC,16) descr="Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests. Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_ATOMIC_LEVEL_sum"                                expr=sum(TCC_EA0_ATOMIC_LEVEL,16) descr="The sum of the number of EA atomics in flight. This is primarily meant for measuring average EA atomic latency. Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_RDREQ_sum"                                       expr=sum(TCC_EA0_RDREQ,16) descr="Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_RDREQ_32B_sum"                                   expr=sum(TCC_EA0_RDREQ_32B,16) descr="Number of 32-byte TCC/EA read requests. Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_RD_UNCACHED_32B_sum"                             expr=sum(TCC_EA0_RD_UNCACHED_32B,16) descr="Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted as 2. Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_RDREQ_IO_CREDIT_STALL_sum"                       expr=sum(TCC_EA0_RDREQ_IO_CREDIT_STALL,16) descr="Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum"                      expr=sum(TCC_EA0_RDREQ_GMI_CREDIT_STALL,16) descr="Number of cycles there was a stall because the read request interface was out of GMI credits. Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum"                     expr=sum(TCC_EA0_RDREQ_DRAM_CREDIT_STALL,16) descr="Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances."></metric>
+  <metric name="TCC_TAG_STALL_sum"                                      expr=sum(TCC_TAG_STALL,16) descr="TCC_TAG_STALL summed over TCC instances."></metric>
+  <metric name="TCC_NORMAL_WRITEBACK_sum"                               expr=sum(TCC_NORMAL_WRITEBACK,16) descr="Number of writebacks due to requests that are not writeback requests. Sum over TCC instances."></metric>
+  <metric name="TCC_ALL_TC_OP_WB_WRITEBACK_sum"                         expr=sum(TCC_ALL_TC_OP_WB_WRITEBACK,16) descr="Number of writebacks due to all TC_OP writeback requests. Sum over TCC instances."></metric>
+  <metric name="TCC_NORMAL_EVICT_sum"                                   expr=sum(TCC_NORMAL_EVICT,16) descr="Number of evictions due to requests that are not invalidate or probe requests. Sum over TCC instances."></metric>
+  <metric name="TCC_ALL_TC_OP_INV_EVICT_sum"                            expr=sum(TCC_ALL_TC_OP_INV_EVICT,16) descr="Number of evictions due to all TC_OP invalidate requests. Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_RDREQ_DRAM_sum"                                  expr=sum(TCC_EA0_RDREQ_DRAM,16) descr="Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum over TCC instances."></metric>
+  <metric name="TCC_EA0_WRREQ_DRAM_sum"                                  expr=sum(TCC_EA0_WRREQ_DRAM,16) descr="Number of TCC/EA write requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum over TCC instances."></metric>
+  <metric name="FETCH_SIZE"                                             expr=(TCC_EA0_RDREQ_32B_sum*32+(TCC_EA0_RDREQ_sum-TCC_EA0_RDREQ_32B_sum)*64)/1024 descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="WRITE_SIZE"                                             expr=((TCC_EA0_WRREQ_sum-TCC_EA0_WRREQ_64B_sum)*32+TCC_EA0_WRREQ_64B_sum*64)/1024 descr="The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="WRITE_REQ_32B"                                          expr=TCC_EA0_WRREQ_64B_sum*2+(TCC_EA0_WRREQ_sum-TCC_EA0_WRREQ_64B_sum) descr="The total number of 32-byte effective memory writes."></metric>
+  <metric name="CU_OCCUPANCY"                                           expr=(SQ_CYCLES/(SQ_WAVE_CYCLES*4))/MAX_WAVE_SIZE descr="The ratio of active waves on a CU to the maximum number of active waves supported by the CU"></metric>
+  <metric name="CU_UTILIZATION"                                         expr=GRBM_GUI_ACTIVE/GRBM_COUNT descr="The total number of active cycles divided by total number of elapsed cycles"></metric>
+  <metric name="TOTAL_16_OPS"                                           expr=(SQ_INSTS_VALU_FMA_F16*2+SQ_INSTS_VALU_ADD_F16+SQ_INSTS_VALU_MUL_F16+SQ_INSTS_VALU_TRANS_F16)*64+((SQ_INSTS_VALU_MFMA_MOPS_F16+SQ_INSTS_VALU_MFMA_MOPS_BF16)*512) descr="The number of 16 bits OPS executed"></metric>
+  <metric name="TOTAL_32_OPS"                                           expr=(SQ_INSTS_VALU_FMA_F32*2+SQ_INSTS_VALU_INT32+SQ_INSTS_VALU_ADD_F32+SQ_INSTS_VALU_MUL_F32+SQ_INSTS_VALU_TRANS_F32)*64+(SQ_INSTS_VALU_MFMA_MOPS_F32*512) descr="The number of 32 bits OPS executed"></metric>
+  <metric name="TOTAL_64_OPS"                                           expr=(SQ_INSTS_VALU_FMA_F64*2+SQ_INSTS_VALU_INT64+SQ_INSTS_VALU_ADD_F64+SQ_INSTS_VALU_MUL_F64)*64+(SQ_INSTS_VALU_MFMA_MOPS_F64*512) descr="The number of 64 bits OPS executed"></metric>
+</gfx940_expr>
+
+<gfx10_expr>
+  <metric name="KERNEL_DURATION" expr=1 descr="The duration of the kernel dispatch"></metric>
+  <metric name="MeanOccupancyPerCU" expr=GRBM_COUNT*0+SQ_LEVEL_WAVES*0+SQ_ACCUM_PREV/GRBM_GUI_ACTIVE/CU_NUM descr="Mean occupancy per compute unit."></metric>
+  <metric name="MeanOccupancyPerActiveCU" expr=GRBM_COUNT*0+SQ_LEVEL_WAVES*0+SQ_ACCUM_PREV*4/SQ_BUSY_CYCLES/CU_NUM descr="Mean occupancy per active compute unit."></metric>
+
+  <metric name="GPU_UTIL" expr=100*GRBM_GUI_ACTIVE/GRBM_COUNT descr="Percentage of the time that GUI is active"></metric>
+  <metric name="CP_UTIL" expr=100*GRBM_CP_BUSY/GRBM_GUI_ACTIVE descr="Percentage of the GRBM_GUI_ACTIVE time that any of the Command Processor (CPG/CPC/CPF) blocks are busy"></metric>
+  <metric name="SPI_UTIL" expr=100*GRBM_SPI_BUSY/GRBM_GUI_ACTIVE descr="Percentage of the GRBM_GUI_ACTIVE time that any of the Shader Pipe Interpolators (SPI) are busy in the shader engine(s)"></metric>
+  <metric name="TA_UTIL" expr=100*GRBM_TA_BUSY/GRBM_GUI_ACTIVE descr="Percentage of the GRBM_GUI_ACTIVE time that any of the Texture Pipes (TA) are busy in the shader engine(s)."></metric>
+  <metric name="GDS_UTIL" expr=100*GRBM_GDS_BUSY/GRBM_GUI_ACTIVE descr="Percentage of the GRBM_GUI_ACTIVE time that the Global Data Share (GDS) is busy."></metric>
+  <metric name="EA_UTIL" expr=100*GRBM_EA_BUSY/GRBM_GUI_ACTIVE descr="Percentage of the GRBM_GUI_ACTIVE time that the Efficiency Arbiter (EA) block is busy."></metric>
+  <metric name="WAVE_DEP_WAIT" expr=100*SQ_WAIT_ANY/SQ_WAVE_CYCLES descr="Percentage of the SQ_WAVE_CYCLES time spent waiting for anything."></metric>
+  <metric name="WAVE_ISSUE_WAIT" expr=100*SQ_WAIT_INST_ANY/SQ_WAVE_CYCLES descr="Percentage of the SQ_WAVE_CYCLES time spent waiting for any instruction issue."></metric>
+
+  <metric name="TA_BUSY_avr" expr=avr(TA_TA_BUSY,16) descr="TA block is busy. Average over TA instances."></metric>
+  <metric name="TA_BUSY_max" expr=max(TA_TA_BUSY,16) descr="TA block is busy. Max over TA instances."></metric>
+  <metric name="TA_BUSY_min" expr=min(TA_TA_BUSY,16) descr="TA block is busy. Min over TA instances."></metric>
+  <metric name="TA_FLAT_LOAD_WAVEFRONTS_sum" expr=sum(TA_FLAT_LOAD_WAVEFRONTS,16) descr="Number of flat load vec32 packets processed by the TA. Sum over TA instances."></metric>
+  <metric name="TA_FLAT_STORE_WAVEFRONTS_sum" expr=sum(TA_FLAT_STORE_WAVEFRONTS,16) descr="Number of flat store vec32 packets processed by the TA. Sum over TA instances."></metric>
+
+  <metric name="GL2C_HIT_sum" expr=sum(GL2C_HIT,16) descr="Number of cache hits. Sum over GL2C instances."></metric>
+  <metric name="GL2C_MISS_sum" expr=sum(GL2C_MISS,16) descr="Number of cache misses. Sum over GL2C instances."></metric>
+  <metric name="GL2C_EA_RDREQ_32B_sum" expr=sum(GL2C_EA_RDREQ_32B,16) descr="Number of 32-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
+  <metric name="GL2C_EA_RDREQ_64B_sum" expr=sum(GL2C_EA_RDREQ_64B,16) descr="Number of 64-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
+  <metric name="GL2C_EA_RDREQ_96B_sum" expr=sum(GL2C_EA_RDREQ_96B,16) descr="Number of 96-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
+  <metric name="GL2C_EA_RDREQ_128B_sum" expr=sum(GL2C_EA_RDREQ_128B,16) descr="Number of 128-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
+  <metric name="GL2C_MC_RDREQ_sum" expr=sum(GL2C_MC_RDREQ,16) descr="Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte). Sum over GL2C instances."></metric>
+  <metric name="GL2C_MC_WRREQ_sum" expr=sum(GL2C_MC_WRREQ,16) descr="Number of transactions (either 32-byte or 64-byte) going over the GL2C_MC_wrreq interface. Sum over GL2C instances."></metric>
+  <metric name="GL2C_EA_WRREQ_64B_sum" expr=sum(GL2C_EA_WRREQ_64B,16) descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the GL2C_EA_wrreq interface. Sum over GL2C instances."></metric>
+  <metric name="GL2C_WRREQ_STALL_max" expr=max(GL2C_MC_WRREQ_STALL,16) descr="Number of cycles a write request was stalled. Max over GL2C instances."></metric>
+  <metric name="L2CacheHit" expr=100*sum(GL2C_HIT,16)/(sum(GL2C_HIT,16)+sum(GL2C_MISS,16)) descr="The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal)."></metric>
+  <metric name="FETCH_SIZE" expr=(GL2C_EA_RDREQ_32B_sum*32+GL2C_EA_RDREQ_64B_sum*64+GL2C_EA_RDREQ_96B_sum*96+GL2C_EA_RDREQ_128B_sum*128)/1024 descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="WriteUnitStalled" expr=100*GL2C_WRREQ_STALL_max/GRBM_GUI_ACTIVE descr="The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad)."></metric>
+  <metric name="LDSBankConflict" expr=100*SQC_LDS_BANK_CONFLICT/SQC_LDS_IDX_ACTIVE descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."></metric>
+</gfx10_expr>
+
+<gfx1030_expr base="gfx10_expr">
+</gfx1030_expr>
+
+<gfx1031_expr base="gfx10_expr">
+</gfx1031_expr>
+
+<gfx1032_expr base="gfx10_expr">
+</gfx1032_expr>
+
+<gfx11_expr>
+  <metric name="KERNEL_DURATION" expr=1 descr="The duration of the kernel dispatch"></metric>
+  <metric name="GPU_UTIL" expr=100*GRBM_GUI_ACTIVE/GRBM_COUNT descr="Percentage of the time that GUI is active"></metric>
+  <metric name="WAVE_DEP_WAIT" expr=100*SQ_WAIT_ANY/SQ_WAVE_CYCLES descr="Percentage of the SQ_WAVE_CYCLES time spent waiting for anything."></metric>
+  <metric name="WAVE_ISSUE_WAIT" expr=100*SQ_WAIT_INST_ANY/SQ_WAVE_CYCLES descr="Percentage of the SQ_WAVE_CYCLES time spent waiting for any instruction issue."></metric>
+  <metric name="TA_BUSY_avr" expr=avr(TA_TA_BUSY,16) descr="TA block is busy. Average over TA instances."></metric>
+  <metric name="TA_BUSY_max" expr=max(TA_TA_BUSY,16) descr="TA block is busy. Max over TA instances."></metric>
+  <metric name="TA_BUSY_min" expr=min(TA_TA_BUSY,16) descr="TA block is busy. Min over TA instances."></metric>
+  <metric name="TA_BUFFER_LOAD_WAVEFRONTS_sum" expr=sum(TA_BUFFER_LOAD_WAVEFRONTS,16) descr="Number of buffer load vec32 packets processed by the TA. Sum over TA instances."></metric>
+  <metric name="TA_BUFFER_STORE_WAVEFRONTS_sum" expr=sum(TA_BUFFER_STORE_WAVEFRONTS,16) descr="Number of buffer store vec32 packets processed by the TA. Sum over TA instances."></metric>
+  <metric name="GL2C_HIT_sum" expr=sum(GL2C_HIT,16) descr="Number of cache hits. Sum over GL2C instances."></metric>
+  <metric name="GL2C_MISS_sum" expr=sum(GL2C_MISS,16) descr="Number of cache misses. Sum over GL2C instances."></metric>
+  <metric name="GL2C_EA_RDREQ_32B_sum" expr=sum(GL2C_EA_RDREQ_32B,16) descr="Number of 32-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
+  <metric name="GL2C_EA_RDREQ_64B_sum" expr=sum(GL2C_EA_RDREQ_64B,16) descr="Number of 64-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
+  <metric name="GL2C_EA_RDREQ_96B_sum" expr=sum(GL2C_EA_RDREQ_96B,16) descr="Number of 96-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
+  <metric name="GL2C_EA_RDREQ_128B_sum" expr=sum(GL2C_EA_RDREQ_128B,16) descr="Number of 128-byte GL2C/EA read requests. Sum over GL2C instances."></metric>
+  <metric name="GL2C_MC_RDREQ_sum" expr=sum(GL2C_MC_RDREQ,16) descr="Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte). Sum over GL2C instances."></metric>
+  <metric name="GL2C_MC_WRREQ_sum" expr=sum(GL2C_MC_WRREQ,16) descr="Number of transactions (either 32-byte or 64-byte) going over the GL2C_MC_wrreq interface. Sum over GL2C instances."></metric>
+  <metric name="GL2C_EA_WRREQ_64B_sum" expr=sum(GL2C_EA_WRREQ_64B,16) descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the GL2C_EA_wrreq interface. Sum over GL2C instances."></metric>
+  <metric name="GL2C_WRREQ_STALL_max" expr=max(GL2C_MC_WRREQ_STALL,16) descr="Number of cycles a write request was stalled. Max over GL2C instances."></metric>
+  <metric name="L2CacheHit" expr=100*sum(GL2C_HIT,16)/(sum(GL2C_HIT,16)+sum(GL2C_MISS,16)) descr="The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal)."></metric>
+  <metric name="FETCH_SIZE" expr=(GL2C_EA_RDREQ_32B_sum*32+GL2C_EA_RDREQ_64B_sum*64+GL2C_EA_RDREQ_96B_sum*96+GL2C_EA_RDREQ_128B_sum*128)/1024 descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
+  <metric name="WriteUnitStalled" expr=100*GL2C_WRREQ_STALL_max/GRBM_GUI_ACTIVE descr="The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad)."></metric>
+  <metric name="LDSBankConflict" expr=100*SQC_LDS_BANK_CONFLICT/SQC_LDS_IDX_ACTIVE descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."></metric>
+</gfx11_expr>
+
+<gfx1100_expr base="gfx11_expr">
+</gfx1100_expr>
+
+<gfx1101_expr base="gfx11_expr">
+</gfx1101_expr>
+
+<gfx8 base="gfx8_expr"></gfx8>
+<gfx9 base="gfx9_expr"></gfx9>
+<gfx10 base="gfx10_expr"></gfx10>
+<gfx11 base="gfx11_expr"></gfx11>
+# Vega20
+<gfx906 base="gfx906_expr"></gfx906>
+# Arcturus
+<gfx908 base="gfx908_expr"></gfx908>
+# Aldebaran
+<gfx90a base="gfx90a_expr"></gfx90a>
+# MI300
+<gfx940 base="gfx940_expr"></gfx940>
+<gfx941 base="gfx940_expr"></gfx941>
+<gfx942 base="gfx940_expr"></gfx942>
+#Navi21
+<gfx1030 base="gfx1030_expr"></gfx1030>
+<gfx1031 base="gfx1031_expr"></gfx1031>
+<gfx1032 base="gfx1032_expr"></gfx1032>
+#Navi31
+<gfx1100 base="gfx1100_expr"></gfx1100>
+<gfx1101 base="gfx1101_expr"></gfx1101>
+
+
+<global>
+  # GPUBusy         The percentage of time GPU was busy.
+  <metric
+    name="GPUBusy"
+    descr="The percentage of time GPU was busy."
+    expr=100*GRBM_GUI_ACTIVE/GRBM_COUNT
+  ></metric>
+
+  # Wavefronts      Total wavefronts.
+  <metric
+    name="Wavefronts"
+    descr="Total wavefronts."
+    expr=SQ_WAVES
+  ></metric>
+
+  # VALUInsts       The average number of vector ALU instructions executed per work-item (affected by flow control).
+  <metric
+    name="VALUInsts"
+    descr="The average number of vector ALU instructions executed per work-item (affected by flow control)."
+    expr=SQ_INSTS_VALU/SQ_WAVES
+  ></metric>
+
+  # SALUInsts       The average number of scalar ALU instructions executed per work-item (affected by flow control).
+  <metric
+    name="SALUInsts"
+    descr="The average number of scalar ALU instructions executed per work-item (affected by flow control)."
+    expr=SQ_INSTS_SALU/SQ_WAVES
+  ></metric>
+
+  # SFetchInsts     The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control).
+  <metric
+    name="SFetchInsts"
+    descr="The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control)."
+    expr=SQ_INSTS_SMEM/SQ_WAVES
+  ></metric>
+
+  # GDSInsts        The average number of GDS read or GDS write instructions executed per work item (affected by flow control).
+  <metric
+    name="GDSInsts"
+    descr="The average number of GDS read or GDS write instructions executed per work item (affected by flow control)."
+    expr=SQ_INSTS_GDS/SQ_WAVES
+  ></metric>
+
+  # MemUnitBusy     The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound).
+  <metric
+    name="MemUnitBusy"
+    descr="The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound)."
+    expr=100*max(TA_TA_BUSY,16)/GRBM_GUI_ACTIVE/SE_NUM
+  ></metric>
+
+  # ALUStalledByLDS The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad).
+  <metric
+    name="ALUStalledByLDS"
+    descr="The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad)."
+    expr=100*SQ_WAIT_INST_LDS*4/SQ_WAVES/GRBM_GUI_ACTIVE
+  ></metric>
+
+</global>