Files
rocm-systems/projects/rdc/common/rdc_field.data
T
Galantsev, Dmitrii 83cf97e280 Profiler - Add all required metrics
Change-Id: Iea3938df9407789c061c3a6ead9167a69069d6e6
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>


[ROCm/rdc commit: c3a4c899d5]
2024-05-09 23:24:02 -05:00

137 řádky
11 KiB
Plaintext

/*
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Description Fields:
// Arg # Type Meaning
// -------------------------------------------------
// 1 rdc_field_t enum of field
// 2 string description of enum
// 3 string rdci display label
// 4 bool do or do not display in rdci
// rdc_field_t Description rdci label To Display
// =========== =========== ========= ==========
#ifndef FLD_DESC_ENT
#define FLD_DESC_ENT(ID, DESC, LABEL, DISPLAY)
#endif
FLD_DESC_ENT(RDC_FI_INVALID, "Unknown/Invalid field", "INVALID", false)
FLD_DESC_ENT(RDC_FI_GPU_COUNT, "GPU count in the system", "GPU_COUNT", true)
FLD_DESC_ENT(RDC_FI_DEV_NAME, "Name of the device", "DEV_NAME", true)
FLD_DESC_ENT(RDC_FI_GPU_CLOCK, "Current GPU clock frequencies", "GPU_CLOCK", true)
FLD_DESC_ENT(RDC_FI_MEM_CLOCK, "Current Memory clock frequencies", "MEM_CLOCK", true)
FLD_DESC_ENT(RDC_FI_MEMORY_TEMP, "Memory temperature in millidegrees Celsius", "MEMORY_TEMP", true)
FLD_DESC_ENT(RDC_FI_GPU_TEMP, "GPU temperature in millidegrees Celsius", "GPU_TEMP", true)
FLD_DESC_ENT(RDC_FI_POWER_USAGE, "Power usage in microwatts", "POWER_USAGE", true)
FLD_DESC_ENT(RDC_FI_PCIE_TX, "PCIe Tx utilization in bytes/second", "PCIE_TX", true)
FLD_DESC_ENT(RDC_FI_PCIE_RX, "PCIe Rx utilization in bytes/second", "PCIE_RX", true)
FLD_DESC_ENT(RDC_FI_PCIE_BANDWIDTH, "PCIe bandwidth in GB/sec", "PCIE_BANDWIDTH", true)
FLD_DESC_ENT(RDC_FI_GPU_UTIL, "GPU busy percentage", "GPU_UTIL", true)
FLD_DESC_ENT(RDC_FI_GPU_MEMORY_USAGE, "Memory usage of the GPU instance in bytes", "GPU_MEMORY_USAGE", true)
FLD_DESC_ENT(RDC_FI_GPU_MEMORY_TOTAL, "Total memory of the GPU instance", "GPU_MEMORY_TOTAL", true)
FLD_DESC_ENT(RDC_FI_ECC_CORRECT_TOTAL, "Accumulated Single Error Correction", "ECC_CORRECT", true)
FLD_DESC_ENT(RDC_FI_ECC_UNCORRECT_TOTAL, "Accumulated Double Error Detection", "ECC_UNCORRECT", true)
FLD_DESC_ENT(RDC_FI_ECC_SDMA_SEC, "SDMA Single Error Correction", "ECC_SDMA_SEC", true)
FLD_DESC_ENT(RDC_FI_ECC_SDMA_DED, "SDMA Double Error Detection", "ECC_SDMA_DED", true)
FLD_DESC_ENT(RDC_FI_ECC_GFX_SEC, "GFX Single Error Correction", "ECC_GFX_SEC", true)
FLD_DESC_ENT(RDC_FI_ECC_GFX_DED, "GFX Double Error Detection", "ECC_GFX_DED", true)
FLD_DESC_ENT(RDC_FI_ECC_MMHUB_SEC, "MMHUB Single Error Correction", "ECC_MMHUB_SEC", true)
FLD_DESC_ENT(RDC_FI_ECC_MMHUB_DED, "MMHUB Double Error Detection", "ECC_MMHUB_DED", true)
FLD_DESC_ENT(RDC_FI_ECC_ATHUB_SEC, "ATHUB Single Error Correction", "ECC_ATHUB_SEC", true)
FLD_DESC_ENT(RDC_FI_ECC_ATHUB_DED, "ATHUB Double Error Detection", "ECC_ATHUB_DED", true)
FLD_DESC_ENT(RDC_FI_ECC_BIF_SEC, "BIF Single Error Correction", "ECC_BIF_SEC", true)
FLD_DESC_ENT(RDC_FI_ECC_BIF_DED, "BIF Double Error Detection", "ECC_BIF_DED", true)
FLD_DESC_ENT(RDC_FI_ECC_HDP_SEC, "HDP Single Error Correction", "ECC_HDP_SEC", true)
FLD_DESC_ENT(RDC_FI_ECC_HDP_DED, "HDP Double Error Detection", "ECC_HDP_DED", true)
FLD_DESC_ENT(RDC_FI_ECC_XGMI_WAFL_SEC, "XGMI WAFL Single Error Correction", "ECC_XGMI_WAFL_SEC",true)
FLD_DESC_ENT(RDC_FI_ECC_XGMI_WAFL_DED, "XGMI WAFL Double Error Detection", "ECC_XGMI_WAFL_DED",true)
FLD_DESC_ENT(RDC_FI_ECC_DF_SEC, "DF Single Error Correction", "ECC_DF_SEC", true)
FLD_DESC_ENT(RDC_FI_ECC_DF_DED, "DF Double Error Detection", "ECC_DF_DED", true)
FLD_DESC_ENT(RDC_FI_ECC_SMN_SEC, "SMN Single Error Correction", "ECC_SMN_SEC", true)
FLD_DESC_ENT(RDC_FI_ECC_SMN_DED, "SMN Double Error Detection", "ECC_SMN_DED", true)
FLD_DESC_ENT(RDC_FI_ECC_SEM_SEC, "SEM Single Error Correction", "ECC_SEM_SEC", true)
FLD_DESC_ENT(RDC_FI_ECC_SEM_DED, "SEM Double Error Detection", "ECC_SEM_DED", true)
FLD_DESC_ENT(RDC_FI_ECC_MP0_SEC, "MP0 Single Error Correction", "ECC_MP0_SEC", true)
FLD_DESC_ENT(RDC_FI_ECC_MP0_DED, "MP0 Double Error Detection", "ECC_MP0_DED", true)
FLD_DESC_ENT(RDC_FI_ECC_MP1_SEC, "MP1 Single Error Correction", "ECC_MP1_SEC", true)
FLD_DESC_ENT(RDC_FI_ECC_MP1_DED, "MP1 Double Error Detection", "ECC_MP1_DED", true)
FLD_DESC_ENT(RDC_FI_ECC_FUSE_SEC, "FUSE Single Error Correction", "ECC_FUSE_SEC", true)
FLD_DESC_ENT(RDC_FI_ECC_FUSE_DED, "FUSE Double Error Detection", "ECC_FUSE_DED", true)
FLD_DESC_ENT(RDC_FI_ECC_UMC_SEC, "UMC Single Error Correction", "ECC_UMC_SEC", true)
FLD_DESC_ENT(RDC_FI_ECC_UMC_DED, "UMC Double Error Detection", "ECC_UMC_DED", true)
FLD_DESC_ENT(RDC_FI_XGMI_0_READ_KB, "XGMI0 accumulated data read size (KB)", "XGMI_0_READ", true)
FLD_DESC_ENT(RDC_FI_XGMI_1_READ_KB, "XGMI1 accumulated data read size (KB)", "XGMI_1_READ", true)
FLD_DESC_ENT(RDC_FI_XGMI_2_READ_KB, "XGMI2 accumulated data read size (KB)", "XGMI_2_READ", true)
FLD_DESC_ENT(RDC_FI_XGMI_3_READ_KB, "XGMI3 accumulated data read size (KB)", "XGMI_3_READ", true)
FLD_DESC_ENT(RDC_FI_XGMI_4_READ_KB, "XGMI4 accumulated data read size (KB)", "XGMI_4_READ", true)
FLD_DESC_ENT(RDC_FI_XGMI_5_READ_KB, "XGMI5 accumulated data read size (KB)", "XGMI_5_READ", true)
FLD_DESC_ENT(RDC_FI_XGMI_6_READ_KB, "XGMI6 accumulated data read size (KB)", "XGMI_6_READ", true)
FLD_DESC_ENT(RDC_FI_XGMI_7_READ_KB, "XGMI7 accumulated data read size (KB)", "XGMI_7_READ", true)
FLD_DESC_ENT(RDC_FI_XGMI_0_WRITE_KB, "XGMI0 accumulated data write size (KB)", "XGMI_0_WRITE", true)
FLD_DESC_ENT(RDC_FI_XGMI_1_WRITE_KB, "XGMI1 accumulated data write size (KB)", "XGMI_1_WRITE", true)
FLD_DESC_ENT(RDC_FI_XGMI_2_WRITE_KB, "XGMI2 accumulated data write size (KB)", "XGMI_2_WRITE", true)
FLD_DESC_ENT(RDC_FI_XGMI_3_WRITE_KB, "XGMI3 accumulated data write size (KB)", "XGMI_3_WRITE", true)
FLD_DESC_ENT(RDC_FI_XGMI_4_WRITE_KB, "XGMI4 accumulated data write size (KB)", "XGMI_4_WRITE", true)
FLD_DESC_ENT(RDC_FI_XGMI_5_WRITE_KB, "XGMI5 accumulated data write size (KB)", "XGMI_5_WRITE", true)
FLD_DESC_ENT(RDC_FI_XGMI_6_WRITE_KB, "XGMI6 accumulated data write size (KB)", "XGMI_6_WRITE", true)
FLD_DESC_ENT(RDC_FI_XGMI_7_WRITE_KB, "XGMI7 accumulated data write size (KB)", "XGMI_7_WRITE", true)
// ROCProfiler fields
// This doesn't map to rocprofiler counters directly
// See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h
// See metrics.xml in rocprofiler
FLD_DESC_ENT(RDC_FI_PROF_CU_UTILIZATION, "", "CU_UTILIZATION", false)
FLD_DESC_ENT(RDC_FI_PROF_CU_OCCUPANCY, "", "CU_OCCUPANCY", false)
FLD_DESC_ENT(RDC_FI_PROF_FLOPS_16, "", "FLOPS_16", false)
FLD_DESC_ENT(RDC_FI_PROF_FLOPS_32, "", "FLOPS_32", false)
FLD_DESC_ENT(RDC_FI_PROF_FLOPS_64, "", "FLOPS_64", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "", "ACTIVE_CYCLES", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "", "ACTIVE_WAVES", false)
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "", "ELAPSED_CYCLES", false)
// Events
FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)
FLD_DESC_ENT(RDC_EVNT_XGMI_0_REQ_TX, "Outgoing requests to neighbor 0", "XGMI_REQ_0", false)
FLD_DESC_ENT(RDC_EVNT_XGMI_0_RESP_TX, "Outgoing responses to neighbor 0", "XGMI_RES_0", false)
FLD_DESC_ENT(RDC_EVNT_XGMI_0_BEATS_TX, "Data sent to neighbor 0 (32 byte pkts)", "XGMI_BTS_0", false)
FLD_DESC_ENT(RDC_EVNT_XGMI_1_NOP_TX, "NOPs sent to neighbor 1", "XGMI_NOP_1", false)
FLD_DESC_ENT(RDC_EVNT_XGMI_1_REQ_TX, "Outgoing requests to neighbor 1", "XGMI_REQ_1", false)
FLD_DESC_ENT(RDC_EVNT_XGMI_1_RESP_TX, "Outgoing responses to neighbor 1", "XGMI_RES_1", false)
FLD_DESC_ENT(RDC_EVNT_XGMI_1_BEATS_TX, "Data sent to neighbor 1 (32 byte pkts)", "XGMI_BTS_1", false)
FLD_DESC_ENT(RDC_EVNT_XGMI_0_THRPUT, "Tx throughput to XGMI neighbor 0 in b/s", "XGMI_0_T", true)
FLD_DESC_ENT(RDC_EVNT_XGMI_1_THRPUT, "Tx throughput to XGMI neighbor 1 in b/s", "XGMI_1_T", true)
FLD_DESC_ENT(RDC_EVNT_XGMI_2_THRPUT, "Tx throughput to XGMI neighbor 2 in b/s", "XGMI_2_T", true)
FLD_DESC_ENT(RDC_EVNT_XGMI_3_THRPUT, "Tx throughput to XGMI neighbor 3 in b/s", "XGMI_3_T", true)
FLD_DESC_ENT(RDC_EVNT_XGMI_4_THRPUT, "Tx throughput to XGMI neighbor 4 in b/s", "XGMI_4_T", true)
FLD_DESC_ENT(RDC_EVNT_XGMI_5_THRPUT, "Tx throughput to XGMI neighbor 5 in b/s", "XGMI_5_T", true)
// Asynchronous event notifications
FLD_DESC_ENT(RDC_EVNT_NOTIF_VMFAULT, "VM page fault", "VM_PAGE_FAULT", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_THERMAL_THROTTLE, "Clk freq decrease due to temp", "THERMAL_THROT", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_PRE_RESET, "GPU reset is about to occur", "GPU_PRE_RESET", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_POST_RESET, "GPU reset just occurred", "GPU_POST_RESET", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_RING_HANG, "GPU ring hang just occured", "RING_HANG", false)