diff --git a/projects/rdc/common/rdc_field.data b/projects/rdc/common/rdc_field.data index 7de8bc5d3f..7dd40ed07a 100644 --- a/projects/rdc/common/rdc_field.data +++ b/projects/rdc/common/rdc_field.data @@ -110,7 +110,6 @@ FLD_DESC_ENT(RDC_FI_XGMI_4_READ_KB, "XGMI4 accumulated data read size (KB)" FLD_DESC_ENT(RDC_FI_XGMI_5_READ_KB, "XGMI5 accumulated data read size (KB)", "XGMI_5_READ", true) FLD_DESC_ENT(RDC_FI_XGMI_6_READ_KB, "XGMI6 accumulated data read size (KB)", "XGMI_6_READ", true) FLD_DESC_ENT(RDC_FI_XGMI_7_READ_KB, "XGMI7 accumulated data read size (KB)", "XGMI_7_READ", true) -FLD_DESC_ENT(RDC_FI_XGMI_TOTAL_READ_KB, "XGMI accumlated data read size across all lanes (KB)", "XGMI_TOTAL_READ", true) FLD_DESC_ENT(RDC_FI_XGMI_0_WRITE_KB, "XGMI0 accumulated data write size (KB)", "XGMI_0_WRITE", true) FLD_DESC_ENT(RDC_FI_XGMI_1_WRITE_KB, "XGMI1 accumulated data write size (KB)", "XGMI_1_WRITE", true) @@ -120,6 +119,7 @@ FLD_DESC_ENT(RDC_FI_XGMI_4_WRITE_KB, "XGMI4 accumulated data write size (KB) FLD_DESC_ENT(RDC_FI_XGMI_5_WRITE_KB, "XGMI5 accumulated data write size (KB)", "XGMI_5_WRITE", true) FLD_DESC_ENT(RDC_FI_XGMI_6_WRITE_KB, "XGMI6 accumulated data write size (KB)", "XGMI_6_WRITE", true) FLD_DESC_ENT(RDC_FI_XGMI_7_WRITE_KB, "XGMI7 accumulated data write size (KB)", "XGMI_7_WRITE", true) +FLD_DESC_ENT(RDC_FI_XGMI_TOTAL_READ_KB, "XGMI accumlated data read size across all lanes (KB)", "XGMI_TOTAL_READ", true) FLD_DESC_ENT(RDC_FI_XGMI_TOTAL_WRITE_KB, "XGMI accumlated data write size across all lanes (KB)", "XGMI_TOTAL_WRITE", true) diff --git a/projects/rdc/docs/how-to/integration.rst b/projects/rdc/docs/how-to/integration.rst index 27f618bc26..9630095a1a 100644 --- a/projects/rdc/docs/how-to/integration.rst +++ b/projects/rdc/docs/how-to/integration.rst @@ -539,37 +539,46 @@ The ``dmon`` command monitors GPU index 0, field 600, and 601, where 600 is for % rdci dmon -l ... ... - 600 RDC_FI_ECC_CORRECT_TOTAL : Accumulated Single Error Correction. - 601 RDC_FI_ECC_UNCORRECT_TOTAL : Accumulated Double Error Detection. - 602 RDC_FI_ECC_SDMA_SEC : SDMA Single Error Correction. - 603 RDC_FI_ECC_SDMA_DED : SDMA Double Error Detection. - 604 RDC_FI_ECC_GFX_SEC : GFX Single Error Correction. - 605 RDC_FI_ECC_GFX_DED : GFX Double Error Detection. - 606 RDC_FI_ECC_MMHUB_SEC : MMHUB Single Error Correction. - 607 RDC_FI_ECC_MMHUB_DED : MMHUB Double Error Detection. - 608 RDC_FI_ECC_ATHUB_SEC : ATHUB Single Error Correction. - 609 RDC_FI_ECC_ATHUB_DED : ATHUB Double Error Detection. - 610 RDC_FI_ECC_BIF_SEC : BIF Single Error Correction. - 611 RDC_FI_ECC_BIF_DED : BIF Double Error Detection. - 612 RDC_FI_ECC_HDP_SEC : HDP Single Error Correction. - 613 RDC_FI_ECC_HDP_DED : HDP Double Error Detection. - 614 RDC_FI_ECC_XGMI_WAFL_SEC : XGMI WAFL Single Error Correction. - 615 RDC_FI_ECC_XGMI_WAFL_DED : XGMI WAFL Double Error Detection. - 616 RDC_FI_ECC_DF_SEC : DF Single Error Correction. - 617 RDC_FI_ECC_DF_DED : DF Double Error Detection. - 618 RDC_FI_ECC_SMN_SEC : SMN Single Error Correction. - 619 RDC_FI_ECC_SMN_DED : SMN Double Error Detection. - 620 RDC_FI_ECC_SEM_SEC : SEM Single Error Correction. - 621 RDC_FI_ECC_SEM_DED : SEM Double Error Detection. - 622 RDC_FI_ECC_MP0_SEC : MP0 Single Error Correction. - 623 RDC_FI_ECC_MP0_DED : MP0 Double Error Detection. - 624 RDC_FI_ECC_MP1_SEC : MP1 Single Error Correction. - - 625 RDC_FI_ECC_MP1_DED : MP1 Double Error Detection. - 626 RDC_FI_ECC_FUSE_SEC : FUSE Single Error Correction. - 627 RDC_FI_ECC_FUSE_DED : FUSE Double Error Detection. - 628 RDC_FI_ECC_UMC_SEC : UMC Single Error Correction. - 629 RDC_FI_ECC_UMC_DED : UMC Double Error Detection. + 600 RDC_FI_ECC_CORRECT_TOTAL : Accumulated Single Error Correction + 601 RDC_FI_ECC_UNCORRECT_TOTAL : Accumulated Double Error Detection + 602 RDC_FI_ECC_SDMA_CE : SDMA Correctable Error + 603 RDC_FI_ECC_SDMA_UE : SDMA Uncorrectable Error + 604 RDC_FI_ECC_GFX_CE : GFX Correctable Error + 605 RDC_FI_ECC_GFX_UE : GFX Uncorrectable Error + 606 RDC_FI_ECC_MMHUB_CE : MMHUB Correctable Error + 607 RDC_FI_ECC_MMHUB_UE : MMHUB Uncorrectable Error + 608 RDC_FI_ECC_ATHUB_CE : ATHUB Correctable Error + 609 RDC_FI_ECC_ATHUB_UE : ATHUB Uncorrectable Error + 610 RDC_FI_ECC_PCIE_BIF_CE : PCIE_BIF Correctable Error + 611 RDC_FI_ECC_PCIE_BIF_UE : PCIE_BIF Uncorrectable Error + 612 RDC_FI_ECC_HDP_CE : HDP Correctable Error + 613 RDC_FI_ECC_HDP_UE : HDP Uncorrectable Error + 614 RDC_FI_ECC_XGMI_WAFL_CE : XGMI WAFL Correctable Error + 615 RDC_FI_ECC_XGMI_WAFL_UE : XGMI WAFL Uncorrectable Error + 616 RDC_FI_ECC_DF_CE : DF Correctable Error + 617 RDC_FI_ECC_DF_UE : DF Uncorrectable Error + 618 RDC_FI_ECC_SMN_CE : SMN Correctable Error + 619 RDC_FI_ECC_SMN_UE : SMN Uncorrectable Error + 620 RDC_FI_ECC_SEM_CE : SEM Correctable Error + 621 RDC_FI_ECC_SEM_UE : SEM Uncorrectable Error + 622 RDC_FI_ECC_MP0_CE : MP0 Correctable Error + 623 RDC_FI_ECC_MP0_UE : MP0 Uncorrectable Error + 624 RDC_FI_ECC_MP1_CE : MP1 Correctable Error + 625 RDC_FI_ECC_MP1_UE : MP1 Uncorrectable Error + 626 RDC_FI_ECC_FUSE_CE : FUSE Correctable Error + 627 RDC_FI_ECC_FUSE_UE : FUSE Uncorrectable Error + 628 RDC_FI_ECC_UMC_CE : UMC Correctable Error + 629 RDC_FI_ECC_UMC_UE : UMC Uncorrectable Error + 630 RDC_FI_ECC_MCA_CE : MCA Correctable Error + 631 RDC_FI_ECC_MCA_UE : MCA Uncorrectable Error + 632 RDC_FI_ECC_VCN_CE : VCN Correctable Error + 633 RDC_FI_ECC_VCN_UE : VCN Uncorrectable Error + 634 RDC_FI_ECC_JPEG_CE : JPEG Correctable Error + 635 RDC_FI_ECC_JPEG_UE : JPEG Uncorrectable Error + 636 RDC_FI_ECC_IH_CE : IH Correctable Error + 637 RDC_FI_ECC_IH_UE : IH Uncorrectable Error + 638 RDC_FI_ECC_MPIO_CE : MPIO Correctable Error + 639 RDC_FI_ECC_MPIO_UE : MPIO Uncorrectable Error ... ... diff --git a/projects/rdc/example/rocprofiler_example.cc b/projects/rdc/example/rocprofiler_example.cc index 2f7f406d62..4ea0631580 100644 --- a/projects/rdc/example/rocprofiler_example.cc +++ b/projects/rdc/example/rocprofiler_example.cc @@ -128,8 +128,7 @@ int run() { field_ids.push_back(RDC_FI_GPU_MEMORY_USAGE); field_ids.push_back(RDC_FI_POWER_USAGE); // profiler metrics - field_ids.push_back(RDC_FI_PROF_MEAN_OCC_PER_CU); - field_ids.push_back(RDC_FI_PROF_MEAN_OCC_PER_ACTIVE_CU); + field_ids.push_back(RDC_FI_PROF_OCCUPANCY_PERCENT); field_ids.push_back(RDC_FI_PROF_ACTIVE_CYCLES); field_ids.push_back(RDC_FI_PROF_ACTIVE_WAVES); field_ids.push_back(RDC_FI_PROF_ELAPSED_CYCLES); diff --git a/projects/rdc/python_binding/rdc_bootstrap.py b/projects/rdc/python_binding/rdc_bootstrap.py index d7b6d7a0c4..5081734ce7 100644 --- a/projects/rdc/python_binding/rdc_bootstrap.py +++ b/projects/rdc/python_binding/rdc_bootstrap.py @@ -93,36 +93,47 @@ class rdc_field_t(c_int): RDC_FI_GPU_MEMORY_ACTIVITY = 505 RDC_FI_GPU_MEMORY_MAX_BANDWIDTH = 506 RDC_FI_GPU_MEMORY_CUR_BANDWIDTH = 507 + RDC_FI_GPU_PAGE_RETRIED = 550 RDC_FI_ECC_CORRECT_TOTAL = 600 RDC_FI_ECC_UNCORRECT_TOTAL = 601 - RDC_FI_ECC_SDMA_SEC = 602 - RDC_FI_ECC_SDMA_DED = 603 - RDC_FI_ECC_GFX_SEC = 604 - RDC_FI_ECC_GFX_DED = 605 - RDC_FI_ECC_MMHUB_SEC = 606 - RDC_FI_ECC_MMHUB_DED = 607 - RDC_FI_ECC_ATHUB_SEC = 608 - RDC_FI_ECC_ATHUB_DED = 609 - RDC_FI_ECC_BIF_SEC = 610 - RDC_FI_ECC_BIF_DED = 611 - RDC_FI_ECC_HDP_SEC = 612 - RDC_FI_ECC_HDP_DED = 613 - RDC_FI_ECC_XGMI_WAFL_SEC = 614 - RDC_FI_ECC_XGMI_WAFL_DED = 615 - RDC_FI_ECC_DF_SEC = 616 - RDC_FI_ECC_DF_DED = 617 - RDC_FI_ECC_SMN_SEC = 618 - RDC_FI_ECC_SMN_DED = 619 - RDC_FI_ECC_SEM_SEC = 620 - RDC_FI_ECC_SEM_DED = 621 - RDC_FI_ECC_MP0_SEC = 622 - RDC_FI_ECC_MP0_DED = 623 - RDC_FI_ECC_MP1_SEC = 624 - RDC_FI_ECC_MP1_DED = 625 - RDC_FI_ECC_FUSE_SEC = 626 - RDC_FI_ECC_FUSE_DED = 627 - RDC_FI_ECC_UMC_SEC = 628 - RDC_FI_ECC_UMC_DED = 629 + RDC_FI_ECC_SDMA_CE = 602 + RDC_FI_ECC_SDMA_UE = 603 + RDC_FI_ECC_GFX_CE = 604 + RDC_FI_ECC_GFX_UE = 605 + RDC_FI_ECC_MMHUB_CE = 606 + RDC_FI_ECC_MMHUB_UE = 607 + RDC_FI_ECC_ATHUB_CE = 608 + RDC_FI_ECC_ATHUB_UE = 609 + RDC_FI_ECC_PCIE_BIF_CE = 610 + RDC_FI_ECC_PCIE_BIF_UE = 611 + RDC_FI_ECC_HDP_CE = 612 + RDC_FI_ECC_HDP_UE = 613 + RDC_FI_ECC_XGMI_WAFL_CE = 614 + RDC_FI_ECC_XGMI_WAFL_UE = 615 + RDC_FI_ECC_DF_CE = 616 + RDC_FI_ECC_DF_UE = 617 + RDC_FI_ECC_SMN_CE = 618 + RDC_FI_ECC_SMN_UE = 619 + RDC_FI_ECC_SEM_CE = 620 + RDC_FI_ECC_SEM_UE = 621 + RDC_FI_ECC_MP0_CE = 622 + RDC_FI_ECC_MP0_UE = 623 + RDC_FI_ECC_MP1_CE = 624 + RDC_FI_ECC_MP1_UE = 625 + RDC_FI_ECC_FUSE_CE = 626 + RDC_FI_ECC_FUSE_UE = 627 + RDC_FI_ECC_UMC_CE = 628 + RDC_FI_ECC_UMC_UE = 629 + RDC_FI_ECC_MCA_CE = 630 + RDC_FI_ECC_MCA_UE = 631 + RDC_FI_ECC_VCN_CE = 632 + RDC_FI_ECC_VCN_UE = 633 + RDC_FI_ECC_JPEG_CE = 634 + RDC_FI_ECC_JPEG_UE = 635 + RDC_FI_ECC_IH_CE = 636 + RDC_FI_ECC_IH_UE = 637 + RDC_FI_ECC_MPIO_CE = 638 + RDC_FI_ECC_MPIO_UE = 639 RDC_FI_XGMI_0_READ_KB = 700 RDC_FI_XGMI_1_READ_KB = 701 RDC_FI_XGMI_2_READ_KB = 702 @@ -169,6 +180,10 @@ class rdc_field_t(c_int): RDC_EVNT_XGMI_1_BEATS_TX = 1007 RDC_EVNT_XGMI_0_THRPUT = 1500 RDC_EVNT_XGMI_1_THRPUT = 1501 + RDC_EVNT_XGMI_2_THRPUT = 1502 + RDC_EVNT_XGMI_3_THRPUT = 1503 + RDC_EVNT_XGMI_4_THRPUT = 1504 + RDC_EVNT_XGMI_5_THRPUT = 1505 RDC_EVNT_NOTIF_VMFAULT = 2000 RDC_EVNT_NOTIF_THERMAL_THROTTLE = 2001 RDC_EVNT_NOTIF_PRE_RESET = 2002