Files
rocm-systems/common/rdc_field.data
T
Galantsev, Dmitrii bffe4e22fa Add OAM_ID
Change-Id: I771b2f7f088940838c09ba3521a7955faa64e7ec
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
2024-09-09 21:19:33 -05:00

157 строки
13 KiB
Plaintext

/*
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Description Fields:
//   Arg #   Type          Meaning
//   -------------------------------------------------
//   1       rdc_field_t   enum of field
//   2       string        description of enum
//   3       string        rdci display label
//   4       bool          do or do not display in rdci
//
// rdc_field_t    Description    rdci label    To Display
// ===========    ===========    ==========    ==========
// Fallback definition of the entry macro.
// If FLD_DESC_ENT has not been defined by the time this point is reached,
// it is defined here with an empty expansion, so every FLD_DESC_ENT(...)
// entry in this file is discarded by the preprocessor.
// NOTE(review): presumably each including file defines FLD_DESC_ENT before
// including this file to select which of the four arguments it needs --
// confirm against the includers.
#ifndef FLD_DESC_ENT
#define FLD_DESC_ENT(ID, DESC, LABEL, DISPLAY)
#endif
// General device fields: the invalid sentinel, GPU count, device name,
// OAM ID, clocks, temperatures, power, PCIe traffic, utilization and memory.
// Every entry except RDC_FI_INVALID has its display flag set to true.
// In this group each label is the enum name without the "RDC_FI_" prefix.
FLD_DESC_ENT(RDC_FI_INVALID, "Unknown/Invalid field", "INVALID", false)
FLD_DESC_ENT(RDC_FI_GPU_COUNT, "GPU count in the system", "GPU_COUNT", true)
FLD_DESC_ENT(RDC_FI_DEV_NAME, "Name of the device", "DEV_NAME", true)
FLD_DESC_ENT(RDC_FI_OAM_ID, "OAM ID of the device", "OAM_ID", true)
FLD_DESC_ENT(RDC_FI_GPU_CLOCK, "Current GPU clock frequencies", "GPU_CLOCK", true)
FLD_DESC_ENT(RDC_FI_MEM_CLOCK, "Current Memory clock frequencies", "MEM_CLOCK", true)
FLD_DESC_ENT(RDC_FI_MEMORY_TEMP, "Memory temperature in millidegrees Celsius", "MEMORY_TEMP", true)
FLD_DESC_ENT(RDC_FI_GPU_TEMP, "GPU temperature in millidegrees Celsius", "GPU_TEMP", true)
FLD_DESC_ENT(RDC_FI_POWER_USAGE, "Power usage in microwatts", "POWER_USAGE", true)
FLD_DESC_ENT(RDC_FI_PCIE_TX, "PCIe Tx utilization in bytes/second", "PCIE_TX", true)
FLD_DESC_ENT(RDC_FI_PCIE_RX, "PCIe Rx utilization in bytes/second", "PCIE_RX", true)
FLD_DESC_ENT(RDC_FI_PCIE_BANDWIDTH, "PCIe bandwidth in GB/sec", "PCIE_BANDWIDTH", true)
FLD_DESC_ENT(RDC_FI_GPU_UTIL, "GPU busy percentage", "GPU_UTIL", true)
FLD_DESC_ENT(RDC_FI_GPU_MEMORY_USAGE, "Memory usage of the GPU instance in bytes", "GPU_MEMORY_USAGE", true)
// NOTE(review): the description below states no unit; the usage field above
// is in bytes -- confirm whether the total is reported in bytes as well.
FLD_DESC_ENT(RDC_FI_GPU_MEMORY_TOTAL, "Total memory of the GPU instance", "GPU_MEMORY_TOTAL", true)
// ECC totals: accumulated corrected (single error correction) and
// uncorrected (double error detection) counts. Both are displayed.
// Note that the labels omit the "_TOTAL" suffix carried by the enum names.
FLD_DESC_ENT(RDC_FI_ECC_CORRECT_TOTAL, "Accumulated Single Error Correction", "ECC_CORRECT", true)
FLD_DESC_ENT(RDC_FI_ECC_UNCORRECT_TOTAL, "Accumulated Double Error Detection", "ECC_UNCORRECT", true)
// ECC blocks: one pair of entries per block name, where the "_CE" entry is
// the correctable-error field and the "_UE" entry is the uncorrectable-error
// field. Every entry is displayed, and each label is the enum name without
// the "RDC_FI_" prefix.
FLD_DESC_ENT(RDC_FI_ECC_SDMA_CE, "SDMA Correctable Error", "ECC_SDMA_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_SDMA_UE, "SDMA Uncorrectable Error", "ECC_SDMA_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_GFX_CE, "GFX Correctable Error", "ECC_GFX_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_GFX_UE, "GFX Uncorrectable Error", "ECC_GFX_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_MMHUB_CE, "MMHUB Correctable Error", "ECC_MMHUB_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_MMHUB_UE, "MMHUB Uncorrectable Error", "ECC_MMHUB_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_ATHUB_CE, "ATHUB Correctable Error", "ECC_ATHUB_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_ATHUB_UE, "ATHUB Uncorrectable Error", "ECC_ATHUB_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_PCIE_BIF_CE, "PCIE_BIF Correctable Error", "ECC_PCIE_BIF_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_PCIE_BIF_UE, "PCIE_BIF Uncorrectable Error", "ECC_PCIE_BIF_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_HDP_CE, "HDP Correctable Error", "ECC_HDP_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_HDP_UE, "HDP Uncorrectable Error", "ECC_HDP_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_XGMI_WAFL_CE, "XGMI_WAFL Correctable Error", "ECC_XGMI_WAFL_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_XGMI_WAFL_UE, "XGMI_WAFL Uncorrectable Error", "ECC_XGMI_WAFL_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_DF_CE, "DF Correctable Error", "ECC_DF_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_DF_UE, "DF Uncorrectable Error", "ECC_DF_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_SMN_CE, "SMN Correctable Error", "ECC_SMN_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_SMN_UE, "SMN Uncorrectable Error", "ECC_SMN_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_SEM_CE, "SEM Correctable Error", "ECC_SEM_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_SEM_UE, "SEM Uncorrectable Error", "ECC_SEM_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_MP0_CE, "MP0 Correctable Error", "ECC_MP0_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_MP0_UE, "MP0 Uncorrectable Error", "ECC_MP0_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_MP1_CE, "MP1 Correctable Error", "ECC_MP1_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_MP1_UE, "MP1 Uncorrectable Error", "ECC_MP1_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_FUSE_CE, "FUSE Correctable Error", "ECC_FUSE_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_FUSE_UE, "FUSE Uncorrectable Error", "ECC_FUSE_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_UMC_CE, "UMC Correctable Error", "ECC_UMC_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_UMC_UE, "UMC Uncorrectable Error", "ECC_UMC_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_MCA_CE, "MCA Correctable Error", "ECC_MCA_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_MCA_UE, "MCA Uncorrectable Error", "ECC_MCA_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_VCN_CE, "VCN Correctable Error", "ECC_VCN_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_VCN_UE, "VCN Uncorrectable Error", "ECC_VCN_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_JPEG_CE, "JPEG Correctable Error", "ECC_JPEG_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_JPEG_UE, "JPEG Uncorrectable Error", "ECC_JPEG_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_IH_CE, "IH Correctable Error", "ECC_IH_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_IH_UE, "IH Uncorrectable Error", "ECC_IH_UE", true)
FLD_DESC_ENT(RDC_FI_ECC_MPIO_CE, "MPIO Correctable Error", "ECC_MPIO_CE", true)
FLD_DESC_ENT(RDC_FI_ECC_MPIO_UE, "MPIO Uncorrectable Error", "ECC_MPIO_UE", true)
// XGMI: accumulated data read and write sizes, in KB, for XGMI indices 0-7.
// The eight read entries come first, followed by the eight write entries.
// Every entry is displayed. The labels omit the "_KB" suffix carried by the
// enum names.
FLD_DESC_ENT(RDC_FI_XGMI_0_READ_KB, "XGMI0 accumulated data read size (KB)", "XGMI_0_READ", true)
FLD_DESC_ENT(RDC_FI_XGMI_1_READ_KB, "XGMI1 accumulated data read size (KB)", "XGMI_1_READ", true)
FLD_DESC_ENT(RDC_FI_XGMI_2_READ_KB, "XGMI2 accumulated data read size (KB)", "XGMI_2_READ", true)
FLD_DESC_ENT(RDC_FI_XGMI_3_READ_KB, "XGMI3 accumulated data read size (KB)", "XGMI_3_READ", true)
FLD_DESC_ENT(RDC_FI_XGMI_4_READ_KB, "XGMI4 accumulated data read size (KB)", "XGMI_4_READ", true)
FLD_DESC_ENT(RDC_FI_XGMI_5_READ_KB, "XGMI5 accumulated data read size (KB)", "XGMI_5_READ", true)
FLD_DESC_ENT(RDC_FI_XGMI_6_READ_KB, "XGMI6 accumulated data read size (KB)", "XGMI_6_READ", true)
FLD_DESC_ENT(RDC_FI_XGMI_7_READ_KB, "XGMI7 accumulated data read size (KB)", "XGMI_7_READ", true)
FLD_DESC_ENT(RDC_FI_XGMI_0_WRITE_KB, "XGMI0 accumulated data write size (KB)", "XGMI_0_WRITE", true)
FLD_DESC_ENT(RDC_FI_XGMI_1_WRITE_KB, "XGMI1 accumulated data write size (KB)", "XGMI_1_WRITE", true)
FLD_DESC_ENT(RDC_FI_XGMI_2_WRITE_KB, "XGMI2 accumulated data write size (KB)", "XGMI_2_WRITE", true)
FLD_DESC_ENT(RDC_FI_XGMI_3_WRITE_KB, "XGMI3 accumulated data write size (KB)", "XGMI_3_WRITE", true)
FLD_DESC_ENT(RDC_FI_XGMI_4_WRITE_KB, "XGMI4 accumulated data write size (KB)", "XGMI_4_WRITE", true)
FLD_DESC_ENT(RDC_FI_XGMI_5_WRITE_KB, "XGMI5 accumulated data write size (KB)", "XGMI_5_WRITE", true)
FLD_DESC_ENT(RDC_FI_XGMI_6_WRITE_KB, "XGMI6 accumulated data write size (KB)", "XGMI_6_WRITE", true)
FLD_DESC_ENT(RDC_FI_XGMI_7_WRITE_KB, "XGMI7 accumulated data write size (KB)", "XGMI_7_WRITE", true)
// ROCProfiler fields
// These fields do not map to rocprofiler counters directly.
// See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h
// See metrics.xml in rocprofiler
// Every entry in this group has its display flag set to false.
// The labels are shortened forms of the enum names (the "RDC_FI_PROF_" and
// "EVAL_" parts are dropped, and "MEAN_OCCUPANCY" is abbreviated "MN_OCC").
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "Mean occupancy per CU", "MN_OCC_PER_CU", false)
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "Mean occupancy per active CU", "MN_OCC_PER_ACT_CU", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false)
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false)
// The metrics below are divided by the time passed, so they are rates
// (their descriptions are expressed per millisecond).
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "Fetched from video memory kb / ms", "MEM_R_BW", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms", "MEM_W_BW", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", false)
// Events
// Transmit-side XGMI events for neighbors 0 and 1: NOPs, outgoing requests,
// outgoing responses and data beats (32-byte packets). These eight entries
// are not displayed. Their labels abbreviate the event kind and put the
// neighbor index last (e.g. RESP -> "XGMI_RES_n", BEATS -> "XGMI_BTS_n").
FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)
FLD_DESC_ENT(RDC_EVNT_XGMI_0_REQ_TX, "Outgoing requests to neighbor 0", "XGMI_REQ_0", false)
FLD_DESC_ENT(RDC_EVNT_XGMI_0_RESP_TX, "Outgoing responses to neighbor 0", "XGMI_RES_0", false)
FLD_DESC_ENT(RDC_EVNT_XGMI_0_BEATS_TX, "Data sent to neighbor 0 (32 byte pkts)", "XGMI_BTS_0", false)
FLD_DESC_ENT(RDC_EVNT_XGMI_1_NOP_TX, "NOPs sent to neighbor 1", "XGMI_NOP_1", false)
FLD_DESC_ENT(RDC_EVNT_XGMI_1_REQ_TX, "Outgoing requests to neighbor 1", "XGMI_REQ_1", false)
FLD_DESC_ENT(RDC_EVNT_XGMI_1_RESP_TX, "Outgoing responses to neighbor 1", "XGMI_RES_1", false)
FLD_DESC_ENT(RDC_EVNT_XGMI_1_BEATS_TX, "Data sent to neighbor 1 (32 byte pkts)", "XGMI_BTS_1", false)
// Transmit throughput to XGMI neighbors 0-5. These entries are displayed.
// NOTE(review): "b/s" in the descriptions does not say whether it means bits
// or bytes per second -- confirm against the code that computes the value.
FLD_DESC_ENT(RDC_EVNT_XGMI_0_THRPUT, "Tx throughput to XGMI neighbor 0 in b/s", "XGMI_0_T", true)
FLD_DESC_ENT(RDC_EVNT_XGMI_1_THRPUT, "Tx throughput to XGMI neighbor 1 in b/s", "XGMI_1_T", true)
FLD_DESC_ENT(RDC_EVNT_XGMI_2_THRPUT, "Tx throughput to XGMI neighbor 2 in b/s", "XGMI_2_T", true)
FLD_DESC_ENT(RDC_EVNT_XGMI_3_THRPUT, "Tx throughput to XGMI neighbor 3 in b/s", "XGMI_3_T", true)
FLD_DESC_ENT(RDC_EVNT_XGMI_4_THRPUT, "Tx throughput to XGMI neighbor 4 in b/s", "XGMI_4_T", true)
FLD_DESC_ENT(RDC_EVNT_XGMI_5_THRPUT, "Tx throughput to XGMI neighbor 5 in b/s", "XGMI_5_T", true)
// Asynchronous event notifications
// Entries for VM page faults, thermal throttling, GPU pre-/post-reset and
// ring hangs. Every entry in this group has its display flag set to false.
// The labels are not derived mechanically from the enum names (for example
// RDC_EVNT_NOTIF_VMFAULT -> "VM_PAGE_FAULT"), so keep both in sync by hand.
FLD_DESC_ENT(RDC_EVNT_NOTIF_VMFAULT, "VM page fault", "VM_PAGE_FAULT", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_THERMAL_THROTTLE, "Clk freq decrease due to temp", "THERMAL_THROT", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_PRE_RESET, "GPU reset is about to occur", "GPU_PRE_RESET", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_POST_RESET, "GPU reset just occurred", "GPU_POST_RESET", false)
// Description spelling corrected ("occured" -> "occurred") to match the
// POST_RESET entry above.
FLD_DESC_ENT(RDC_EVNT_NOTIF_RING_HANG, "GPU ring hang just occurred", "RING_HANG", false)