diff --git a/common/rdc_field.data b/common/rdc_field.data index b2850eec2f..c2c325afc0 100644 --- a/common/rdc_field.data +++ b/common/rdc_field.data @@ -42,8 +42,36 @@ FLD_DESC_ENT(RDC_FI_PCIE_RX, "PCIe Rx utilization in bytes/second", FLD_DESC_ENT(RDC_FI_GPU_UTIL, "GPU busy percentage", "GPU_UTIL", true) FLD_DESC_ENT(RDC_FI_GPU_MEMORY_USAGE, "Memory usage of the GPU instance in bytes", "GPU_MEMORY_USAGE", true) FLD_DESC_ENT(RDC_FI_GPU_MEMORY_TOTAL, "Total memory of the GPU instance", "GPU_MEMORY_TOTAL", true) -FLD_DESC_ENT(RDC_FI_ECC_CORRECT_TOTAL, "Accumulated correctable ECC errors", "ECC_CORRECT", true) -FLD_DESC_ENT(RDC_FI_ECC_UNCORRECT_TOTAL, "Accumulated uncorrectable ECC errors", "ECC_UNCORRECT", true) +FLD_DESC_ENT(RDC_FI_ECC_CORRECT_TOTAL, "Accumulated Single Error Correction", "ECC_CORRECT", true) +FLD_DESC_ENT(RDC_FI_ECC_UNCORRECT_TOTAL, "Accumulated Double Error Detection", "ECC_UNCORRECT", true) +FLD_DESC_ENT(RDC_FI_ECC_SDMA_SEC, "SDMA Single Error Correction", "ECC_SDMA_SEC", true) +FLD_DESC_ENT(RDC_FI_ECC_SDMA_DED, "SDMA Double Error Detection", "ECC_SDMA_DED", true) +FLD_DESC_ENT(RDC_FI_ECC_GFX_SEC, "GFX Single Error Correction", "ECC_GFX_SEC", true) +FLD_DESC_ENT(RDC_FI_ECC_GFX_DED, "GFX Double Error Detection", "ECC_GFX_DED", true) +FLD_DESC_ENT(RDC_FI_ECC_MMHUB_SEC, "MMHUB Single Error Correction", "ECC_MMHUB_SEC", true) +FLD_DESC_ENT(RDC_FI_ECC_MMHUB_DED, "MMHUB Double Error Detection", "ECC_MMHUB_DED", true) +FLD_DESC_ENT(RDC_FI_ECC_ATHUB_SEC, "ATHUB Single Error Correction", "ECC_ATHUB_SEC", true) +FLD_DESC_ENT(RDC_FI_ECC_ATHUB_DED, "ATHUB Double Error Detection", "ECC_ATHUB_DED", true) +FLD_DESC_ENT(RDC_FI_ECC_BIF_SEC, "BIF Single Error Correction", "ECC_BIF_SEC", true) +FLD_DESC_ENT(RDC_FI_ECC_BIF_DED, "BIF Double Error Detection", "ECC_BIF_DED", true) +FLD_DESC_ENT(RDC_FI_ECC_HDP_SEC, "HDP Single Error Correction", "ECC_HDP_SEC", true) +FLD_DESC_ENT(RDC_FI_ECC_HDP_DED, "HDP Double Error Detection", "ECC_HDP_DED", true) +FLD_DESC_ENT(RDC_FI_ECC_XGMI_WAFL_SEC, "XGMI WAFL Single Error Correction", "ECC_XGMI_WAFL_SEC",true) +FLD_DESC_ENT(RDC_FI_ECC_XGMI_WAFL_DED, "XGMI WAFL Double Error Detection", "ECC_XGMI_WAFL_DED",true) +FLD_DESC_ENT(RDC_FI_ECC_DF_SEC, "DF Single Error Correction", "ECC_DF_SEC", true) +FLD_DESC_ENT(RDC_FI_ECC_DF_DED, "DF Double Error Detection", "ECC_DF_DED", true) +FLD_DESC_ENT(RDC_FI_ECC_SMN_SEC, "SMN Single Error Correction", "ECC_SMN_SEC", true) +FLD_DESC_ENT(RDC_FI_ECC_SMN_DED, "SMN Double Error Detection", "ECC_SMN_DED", true) +FLD_DESC_ENT(RDC_FI_ECC_SEM_SEC, "SEM Single Error Correction", "ECC_SEM_SEC", true) +FLD_DESC_ENT(RDC_FI_ECC_SEM_DED, "SEM Double Error Detection", "ECC_SEM_DED", true) +FLD_DESC_ENT(RDC_FI_ECC_MP0_SEC, "MP0 Single Error Correction", "ECC_MP0_SEC", true) +FLD_DESC_ENT(RDC_FI_ECC_MP0_DED, "MP0 Double Error Detection", "ECC_MP0_DED", true) +FLD_DESC_ENT(RDC_FI_ECC_MP1_SEC, "MP1 Single Error Correction", "ECC_MP1_SEC", true) +FLD_DESC_ENT(RDC_FI_ECC_MP1_DED, "MP1 Double Error Detection", "ECC_MP1_DED", true) +FLD_DESC_ENT(RDC_FI_ECC_FUSE_SEC, "FUSE Single Error Correction", "ECC_FUSE_SEC", true) +FLD_DESC_ENT(RDC_FI_ECC_FUSE_DED, "FUSE Double Error Detection", "ECC_FUSE_DED", true) +FLD_DESC_ENT(RDC_FI_ECC_UMC_SEC, "UMC Single Error Correction", "ECC_UMC_SEC", true) +FLD_DESC_ENT(RDC_FI_ECC_UMC_DED, "UMC Double Error Detection", "ECC_UMC_DED", true) FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false) FLD_DESC_ENT(RDC_EVNT_XGMI_0_REQ_TX, "Outgoing requests to neighbor 0", "XGMI_REQ_0", false) diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index 254d1d6520..9d213609ff 100755 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ -181,6 +181,48 @@ typedef enum { RDC_FI_ECC_CORRECT_TOTAL = 600, //!< Accumulated correctable ECC errors RDC_FI_ECC_UNCORRECT_TOTAL, //!< Accumulated uncorrectable ECC errors + RDC_FI_ECC_SDMA_SEC, //!< SDMA Single Error Correction + RDC_FI_ECC_SDMA_DED, //!< SDMA Double Error Detection + + RDC_FI_ECC_GFX_SEC, //!< GFX Single Error Correction + RDC_FI_ECC_GFX_DED, //!< GFX Double Error Detection + + RDC_FI_ECC_MMHUB_SEC, //!< MMHUB Single Error Correction + RDC_FI_ECC_MMHUB_DED, //!< MMHUB Double Error Detection + + RDC_FI_ECC_ATHUB_SEC, //!< ATHUB Single Error Correction + RDC_FI_ECC_ATHUB_DED, //!< ATHUB Double Error Detection + + RDC_FI_ECC_BIF_SEC, //!< BIF Single Error Correction + RDC_FI_ECC_BIF_DED, //!< BIF Double Error Detection + + RDC_FI_ECC_HDP_SEC, //!< HDP Single Error Correction + RDC_FI_ECC_HDP_DED, //!< HDP Double Error Detection + + RDC_FI_ECC_XGMI_WAFL_SEC, //!< XGMI WAFL Single Error Correction + RDC_FI_ECC_XGMI_WAFL_DED, //!< XGMI WAFL Double Error Detection + + RDC_FI_ECC_DF_SEC, //!< DF Single Error Correction + RDC_FI_ECC_DF_DED, //!< DF Double Error Detection + + RDC_FI_ECC_SMN_SEC, //!< SMN Single Error Correction + RDC_FI_ECC_SMN_DED, //!< SMN Double Error Detection + + RDC_FI_ECC_SEM_SEC, //!< SEM Single Error Correction + RDC_FI_ECC_SEM_DED, //!< SEM Double Error Detection + + RDC_FI_ECC_MP0_SEC, //!< MP0 Single Error Correction + RDC_FI_ECC_MP0_DED, //!< MP0 Double Error Detection + + RDC_FI_ECC_MP1_SEC, //!< MP1 Single Error Correction + RDC_FI_ECC_MP1_DED, //!< MP1 Double Error Detection + + RDC_FI_ECC_FUSE_SEC, //!< FUSE Single Error Correction + RDC_FI_ECC_FUSE_DED, //!< FUSE Double Error Detection + + RDC_FI_ECC_UMC_SEC, //!< UMC Single Error Correction + RDC_FI_ECC_UMC_DED, //!< UMC Double Error Detection + /* * @brief Raw XGMI counter events */ diff --git a/python_binding/rdc_bootstrap.py b/python_binding/rdc_bootstrap.py index f21f2b5e6d..6e9ced859a 100644 --- a/python_binding/rdc_bootstrap.py +++ b/python_binding/rdc_bootstrap.py @@ -87,6 +87,34 @@ class rdc_field_t(c_int): RDC_FI_GPU_MEMORY_TOTAL = 502 RDC_FI_ECC_CORRECT_TOTAL = 600 RDC_FI_ECC_UNCORRECT_TOTAL = 601 + RDC_FI_ECC_SDMA_SEC = 602 + RDC_FI_ECC_SDMA_DED = 603 + RDC_FI_ECC_GFX_SEC = 604 + RDC_FI_ECC_GFX_DED = 605 + RDC_FI_ECC_MMHUB_SEC = 606 + RDC_FI_ECC_MMHUB_DED = 607 + RDC_FI_ECC_ATHUB_SEC = 608 + RDC_FI_ECC_ATHUB_DED = 609 + RDC_FI_ECC_BIF_SEC = 610 + RDC_FI_ECC_BIF_DED = 611 + RDC_FI_ECC_HDP_SEC = 612 + RDC_FI_ECC_HDP_DED = 613 + RDC_FI_ECC_XGMI_WAFL_SEC = 614 + RDC_FI_ECC_XGMI_WAFL_DED = 615 + RDC_FI_ECC_DF_SEC = 616 + RDC_FI_ECC_DF_DED = 617 + RDC_FI_ECC_SMN_SEC = 618 + RDC_FI_ECC_SMN_DED = 619 + RDC_FI_ECC_SEM_SEC = 620 + RDC_FI_ECC_SEM_DED = 621 + RDC_FI_ECC_MP0_SEC = 622 + RDC_FI_ECC_MP0_DED = 623 + RDC_FI_ECC_MP1_SEC = 624 + RDC_FI_ECC_MP1_DED = 625 + RDC_FI_ECC_FUSE_SEC = 626 + RDC_FI_ECC_FUSE_DED = 627 + RDC_FI_ECC_UMC_SEC = 628 + RDC_FI_ECC_UMC_DED = 629 RDC_EVNT_XGMI_0_NOP_TX = 1000 RDC_EVNT_XGMI_0_REQ_TX = 1001 RDC_EVNT_XGMI_0_RESP_TX = 1002 diff --git a/rdc_libs/rdc/src/RdcRasLib.cc b/rdc_libs/rdc/src/RdcRasLib.cc index f08e56d44d..ab44ce2bcf 100644 --- a/rdc_libs/rdc/src/RdcRasLib.cc +++ b/rdc_libs/rdc/src/RdcRasLib.cc @@ -36,6 +36,7 @@ RdcRasLib::RdcRasLib(const char* lib_name): , rdc_module_destroy_(nullptr) { rdc_status_t status = lib_loader_.load(lib_name); if (status != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "RAS related function will not work."); return; } @@ -46,9 +47,11 @@ RdcRasLib::RdcRasLib(const char* lib_name): return; } - if (rdc_module_init_(0) != RDC_ST_OK) { + status = rdc_module_init_(0); + if (status != RDC_ST_OK) { RDC_LOG(RDC_ERROR, "Fail to init librdc_ras.so:" - << rdc_status_string(status)); + << rdc_status_string(status) + << ". RAS related function will not work."); return; } diff --git a/rdc_libs/rdc/src/RdcSmiLib.cc b/rdc_libs/rdc/src/RdcSmiLib.cc index fdb378e5fd..c9649f451b 100644 --- a/rdc_libs/rdc/src/RdcSmiLib.cc +++ b/rdc_libs/rdc/src/RdcSmiLib.cc @@ -117,7 +117,6 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query( RDC_FI_MEM_CLOCK, RDC_FI_MEMORY_TEMP, RDC_FI_GPU_TEMP, RDC_FI_POWER_USAGE, RDC_FI_PCIE_TX, RDC_FI_PCIE_RX, RDC_FI_GPU_UTIL, RDC_FI_GPU_MEMORY_USAGE, RDC_FI_GPU_MEMORY_TOTAL, - RDC_FI_ECC_CORRECT_TOTAL, RDC_FI_ECC_UNCORRECT_TOTAL, RDC_EVNT_XGMI_0_NOP_TX, RDC_EVNT_XGMI_0_REQ_TX, RDC_EVNT_XGMI_0_RESP_TX, RDC_EVNT_XGMI_0_BEATS_TX, RDC_EVNT_XGMI_1_NOP_TX, RDC_EVNT_XGMI_1_REQ_TX,